import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from scipy.optimize import minimize
import sklearn.linear_model as lm

# plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3
sns.set()


df = pd.read_csv('nba.csv')


df.head()


df['WL']

0       L
1       W
2       L
3       W
4       L
       ..
2455    W
2456    W
2457    L
2458    W
2459    L
Name: WL, Length: 2460, dtype: object


df["WON"] = df["WL"]
df["WON"] = df["WON"].replace("W", 1)
df["WON"] = df["WON"].replace("L", 0)
df.head(5)


df['FG_PCT']

0       0.538
1       0.485
2       0.409
3       0.458
4       0.435
        ...  
2455    0.472
2456    0.553
2457    0.484
2458    0.591
2459    0.402
Name: FG_PCT, Length: 2460, dtype: float64


one_team = df.groupby("GAME_ID").first()
opponent = df.groupby("GAME_ID").last()
games = one_team.merge(opponent, left_index = True, right_index = True, suffixes = ["", "_OPP"])
games["FG_PCT_DIFF"] = games["FG_PCT"] - games["FG_PCT_OPP"]
games['WON'] = games['WL'].replace('L', 0).replace('W', 1)
games = games[['TEAM_NAME', 'MATCHUP', 'WON', 'FG_PCT_DIFF']]


games.head()


sns.jointplot(data = games, x = "FG_PCT_DIFF", y = "WON");


sns.jointplot(data = games, x = "FG_PCT_DIFF", y = "WON", 
              y_jitter = 0.1, 
              kind="reg", 
              ci=False,
              joint_kws={'line_kws':{'color':'green'}});


games2 = games.copy()


games2.iloc[0] = ['hello', 'hello', 1, 120]


sns.jointplot(data = games2, x = "FG_PCT_DIFF", y = "WON", 
              y_jitter = 0.1, 
              kind="reg", 
              ci=False,
              joint_kws={'line_kws':{'color':'green'}});


bins = pd.cut(games["FG_PCT_DIFF"], 20)


bins

GAME_ID
21700001    (-0.0648, -0.0382]
21700002      (0.0416, 0.0682]
21700003    (-0.0382, -0.0116]
21700004       (0.015, 0.0416]
21700005      (0.0416, 0.0682]
                   ...        
21701226        (0.175, 0.201]
21701227      (0.0682, 0.0948]
21701228       (0.015, 0.0416]
21701229    (-0.0914, -0.0648]
21701230     (-0.118, -0.0914]
Name: FG_PCT_DIFF, Length: 1230, dtype: category
Categories (20, interval[float64]): [(-0.252, -0.224] < (-0.224, -0.198] < (-0.198, -0.171] < (-0.171, -0.145] ... (0.175, 0.201] < (0.201, 0.228] < (0.228, 0.254] < (0.254, 0.281]]


games["bin"] = [(b.left + b.right) / 2 for b in bins]
games["bin"]

GAME_ID
21700001   -0.0515
21700002    0.0549
21700003   -0.0249
21700004    0.0283
21700005    0.0549
             ...  
21701226    0.1880
21701227    0.0815
21701228    0.0283
21701229   -0.0781
21701230   -0.1047
Name: bin, Length: 1230, dtype: float64


games


win_rates_by_bin = games.groupby("bin")["WON"].mean()
win_rates_by_bin

bin
-0.2380    0.000000
-0.2110    0.000000
-0.1845    0.000000
-0.1580    0.000000
-0.1315    0.000000
-0.1047    0.033898
-0.0781    0.083333
-0.0515    0.148438
-0.0249    0.363636
 0.0017    0.505747
 0.0283    0.705128
 0.0549    0.792793
 0.0815    0.907407
 0.1079    0.984615
 0.1345    1.000000
 0.1615    1.000000
 0.1880    1.000000
 0.2410    1.000000
 0.2675    1.000000
Name: WON, dtype: float64


plt.plot(win_rates_by_bin, 'r')

[<matplotlib.lines.Line2D at 0x7fc69533fd30>]


sns.jointplot(data = games, x = "FG_PCT_DIFF", y = "WON", 
              y_jitter = 0.1, 
              kind="reg", 
              ci=False,
              joint_kws={'line_kws':{'color':'green'}});
plt.plot(win_rates_by_bin, 'r', linewidth = 5);


def sigma(t):
    return 1 / (1 + np.exp(-t))


plt.plot(win_rates_by_bin, 'r', linewidth = 5);
x = win_rates_by_bin.index
plt.plot(x, sigma(x * 30), 'black', linewidth = 5);
plt.xlabel('FG_PCT_DIFF')
plt.ylabel('WON');


win_rates_by_bin

bin
-0.2380    0.000000
-0.2110    0.000000
-0.1845    0.000000
-0.1580    0.000000
-0.1315    0.000000
-0.1047    0.033898
-0.0781    0.083333
-0.0515    0.148438
-0.0249    0.363636
 0.0017    0.505747
 0.0283    0.705128
 0.0549    0.792793
 0.0815    0.907407
 0.1079    0.984615
 0.1345    1.000000
 0.1615    1.000000
 0.1880    1.000000
 0.2410    1.000000
 0.2675    1.000000
Name: WON, dtype: float64


odds_by_bin = win_rates_by_bin / (1 - win_rates_by_bin)
odds_by_bin

bin
-0.2380     0.000000
-0.2110     0.000000
-0.1845     0.000000
-0.1580     0.000000
-0.1315     0.000000
-0.1047     0.035088
-0.0781     0.090909
-0.0515     0.174312
-0.0249     0.571429
 0.0017     1.023256
 0.0283     2.391304
 0.0549     3.826087
 0.0815     9.800000
 0.1079    64.000000
 0.1345          inf
 0.1615          inf
 0.1880          inf
 0.2410          inf
 0.2675          inf
Name: WON, dtype: float64


plt.plot(odds_by_bin);


plt.plot(np.log(odds_by_bin));

/opt/conda/lib/python3.8/site-packages/pandas/core/arraylike.py:274: RuntimeWarning: divide by zero encountered in log
  result = getattr(ufunc, method)(*inputs, **kwargs)


x = np.linspace(-5,5,50)
plt.plot(x, sigma(x));
plt.xlabel('x')
plt.ylabel(r'$\frac{1}{1 + e^{-x}}$');


def flatten(li): 
    return [item for sub in li for item in sub]

bs = [-2, -1, -0.5, 2, 1, 0.5]
xs = np.linspace(-10, 10, 100)

fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(10, 6))
for ax, b in zip(flatten(axes), bs):
    ys = sigma(xs * b)
    ax.plot(xs, ys)
    ax.set_title(r'$ \theta_1 = $' + str(b))

# add a big axes, hide frame
fig.add_subplot(111, frameon=False)
# hide tick and tick label of the big axes
plt.tick_params(labelcolor='none', top=False, bottom=False,
                left=False, right=False)
plt.grid(False)
plt.xlabel('$x$')
plt.ylabel(r'$ \frac{1}{1+\exp(-\theta_1 \cdot x)} $')
plt.tight_layout()
plt.savefig('sigmoids.png')


fig = go.Figure()
for theta1 in [-1,1, 5]:
    for theta0 in [-2, 0, 2]:
        fig.add_trace(go.Scatter(name=f"{theta0} + {theta1} x", x=xs, y=sigma(theta0 + theta1*xs)))
fig


def mse_loss_single_arg_nba(theta):
    x = games["FG_PCT_DIFF"]
    y_obs = games["WON"]
    y_hat = sigma(x * theta)
    return np.mean((y_hat - y_obs) ** 2)


thetas = np.linspace(-50, 50, 100)
plt.plot(thetas, [mse_loss_single_arg_nba(theta) for theta in thetas])
plt.ylabel('MSE')
plt.xlabel(r'$\theta$');


minimize(mse_loss_single_arg_nba, x0 = 0)

      fun: 0.14105789850974032
 hess_inv: array([[12719.30255595]])
      jac: array([-8.78795981e-06])
  message: 'Optimization terminated successfully.'
     nfev: 24
      nit: 7
     njev: 12
   status: 0
  success: True
        x: array([29.13007801])


plt.plot(win_rates_by_bin, 'r', linewidth = 5);
x = win_rates_by_bin.index
plt.plot(x, sigma(x * 29.13), 'black', linewidth = 5);
plt.xlabel('FG_PCT_DIFF')
plt.ylabel('WON');


rand_x = np.array([[-0.04185564],
       [ 0.12799961],
       [-0.09528101],
       [-0.0058139 ],
       [ 0.0870956 ]])

rand_y = np.array([[0],
       [0],
       [1],
       [0],
       [1]])


plt.plot(rand_x, rand_y, 'b*')
plt.xlabel('x')
plt.ylabel('y');


def mse_loss_single_arg_toy(theta):
    x = rand_x
    y_obs = rand_y
    y_hat = sigma(x * theta)
    return np.mean((y_obs - y_hat)**2)


mse_loss_single_arg_toy(10)

0.3226572801334151


thetas = np.linspace(-1000, 1000, 100)
plt.plot(thetas, [mse_loss_single_arg_toy(theta) for theta in thetas])
plt.ylabel(r'MSE($\theta$)')
plt.xlabel(r'$\theta$');


best_theta = minimize(mse_loss_single_arg_toy, x0 = 0)["x"][0]
best_theta

-4.801981341432673


plt.plot(rand_x, rand_y, 'b*')
xs = np.linspace(-1, 1, 100)
plt.plot(xs, sigma(xs * best_theta), color='orange')
plt.xlabel('x')
plt.legend(['$y$', '$\hat{y}$']);


best_theta_2 = minimize(mse_loss_single_arg_toy, x0 = 500)["x"][0]
best_theta_2

500.0


plt.plot(rand_x, rand_y, 'b*')
xs = np.linspace(min(rand_x), max(rand_x), 100)
plt.plot(xs, sigma(xs * best_theta_2), color='orange')
plt.xlabel('x')
plt.legend(['$y$', '$\hat{y}$']);


y_hat = np.arange(0.001, 0.999, 0.01)
loss = (1 - y_hat)**2
plt.plot(y_hat, loss, color='k')
plt.xlabel('$\hat{y}$: Predicted Chance of Correct Class')
plt.ylabel('$(1 - \hat{y})^2$')
plt.title('Squared Loss for One Individual');


y_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(y_hat)
plt.plot(y_hat, loss, color='k')
plt.xlabel('$\hat{y}$: Predicted Chance of Correct Class')
plt.ylabel('$-\log(\hat{y})$')
plt.title('Log Loss for one observation when $y = 1$');


y_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(1 - y_hat)
plt.plot(y_hat, loss, color='k')
plt.xlabel('$\hat{y}$: Predicted Chance of Correct Class')
plt.ylabel('$-\log(1 - \hat{y})$')
plt.title('Log Loss for one observation when $y = 0$');


def cross_entropy(y, yhat):
    return - y * np.log(yhat) - (1 - y) * np.log(1 - yhat)


def mce_loss_single_arg_toy(theta):
    x = rand_x
    y_obs = rand_y
    y_hat = sigma(x * theta)
    return np.mean(cross_entropy(y_obs, y_hat))


thetas = np.linspace(-1000, 1000, 100)
plt.plot(thetas, [mce_loss_single_arg_toy(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy($\theta$)')
plt.xlabel(r'$\theta$');

/tmp/ipykernel_72/529625363.py:2: RuntimeWarning:

divide by zero encountered in log

/tmp/ipykernel_72/529625363.py:2: RuntimeWarning:

invalid value encountered in multiply


best_theta_mce = minimize(mce_loss_single_arg_toy, x0 = 0)["x"][0]
best_theta_mce

-5.213601516313596


best_theta

-4.801981341432673


def mce_loss_single_arg_nba(theta):
    x = games["FG_PCT_DIFF"]
    y_obs = games["WON"]
    y_hat = sigma(theta * x)
    return np.mean(cross_entropy(y_obs, y_hat))


best_theta_mce_nba = minimize(mce_loss_single_arg_nba, x0 = 0)["x"][0]
best_theta_mce_nba

30.57879021073147


minimize(mse_loss_single_arg_nba, x0 = 0)["x"][0]

29.130078012616192


best_theta_mce_nba = minimize(mce_loss_single_arg_nba, x0 = 0)["x"][0]
best_theta_mce_nba

30.57879021073147


def predict_probabilities(X, theta):
    return sigma(X * theta)


predict_probabilities(games['FG_PCT_DIFF'], best_theta_mce_nba)

GAME_ID
21700001    0.182670
21700002    0.834888
21700003    0.285495
21700004    0.777945
21700005    0.783182
              ...   
21701226    0.996919
21701227    0.891865
21701228    0.627110
21701229    0.059969
21701230    0.048979
Name: FG_PCT_DIFF, Length: 1230, dtype: float64


model = lm.LogisticRegression(penalty = 'none', fit_intercept = False, solver = 'lbfgs')


model.fit(games[['FG_PCT_DIFF']], games['WON'])

LogisticRegression(fit_intercept=False, penalty='none')


model.coef_

array([[30.57950163]])


best_theta_mce_nba

30.57879021073147


model.predict_proba([[0.1]])

array([[0.04487548, 0.95512452]])


model.predict_proba(games[['FG_PCT_DIFF']])

array([[0.81733506, 0.18266494],
       [0.16510648, 0.83489352],
       [0.71450899, 0.28549101],
       ...,
       [0.37288695, 0.62711305],
       [0.94003495, 0.05996505],
       [0.95102413, 0.04897587]])


model.predict_proba(games[['FG_PCT_DIFF']])[:, 1]

array([0.18266494, 0.83489352, 0.28549101, ..., 0.62711305, 0.05996505,
       0.04897587])


predict_probabilities(games['FG_PCT_DIFF'], best_theta_mce_nba)

GAME_ID
21700001    0.182670
21700002    0.834888
21700003    0.285495
21700004    0.777945
21700005    0.783182
              ...   
21701226    0.996919
21701227    0.891865
21701228    0.627110
21701229    0.059969
21701230    0.048979
Name: FG_PCT_DIFF, Length: 1230, dtype: float64


model.predict(games[['FG_PCT_DIFF']])

array([0, 1, 0, ..., 1, 0, 0])

	SEASON_ID	TEAM_ID	TEAM_ABBREVIATION	TEAM_NAME	GAME_ID	GAME_DATE	MATCHUP	WL	MIN	FGM	...	DREB	REB	AST	STL	BLK	TOV	PF	PTS	PLUS_MINUS	VIDEO_AVAILABLE
0	22017	1610612744	GSW	Golden State Warriors	21700002	2017-10-17	GSW vs. HOU	L	240	43	...	35	41	34	5	9	17	25	121	-1	1
1	22017	1610612745	HOU	Houston Rockets	21700002	2017-10-17	HOU @ GSW	W	240	47	...	33	43	28	9	5	13	16	122	1	1
2	22017	1610612738	BOS	Boston Celtics	21700001	2017-10-17	BOS @ CLE	L	240	36	...	37	46	24	11	4	12	24	99	-3	1
3	22017	1610612739	CLE	Cleveland Cavaliers	21700001	2017-10-17	CLE vs. BOS	W	240	38	...	41	50	19	3	4	17	25	102	3	1
4	22017	1610612750	MIN	Minnesota Timberwolves	21700011	2017-10-18	MIN @ SAS	L	240	37	...	31	42	23	7	4	13	16	99	-8	1

	SEASON_ID	TEAM_ID	TEAM_ABBREVIATION	TEAM_NAME	GAME_ID	GAME_DATE	MATCHUP	WL	MIN	FGM	...	REB	AST	STL	BLK	TOV	PF	PTS	PLUS_MINUS	VIDEO_AVAILABLE	WON
0	22017	1610612744	GSW	Golden State Warriors	21700002	2017-10-17	GSW vs. HOU	L	240	43	...	41	34	5	9	17	25	121	-1	1	0
1	22017	1610612745	HOU	Houston Rockets	21700002	2017-10-17	HOU @ GSW	W	240	47	...	43	28	9	5	13	16	122	1	1	1
2	22017	1610612738	BOS	Boston Celtics	21700001	2017-10-17	BOS @ CLE	L	240	36	...	46	24	11	4	12	24	99	-3	1	0
3	22017	1610612739	CLE	Cleveland Cavaliers	21700001	2017-10-17	CLE vs. BOS	W	240	38	...	50	19	3	4	17	25	102	3	1	1
4	22017	1610612750	MIN	Minnesota Timberwolves	21700011	2017-10-18	MIN @ SAS	L	240	37	...	42	23	7	4	13	16	99	-8	1	0

	TEAM_NAME	MATCHUP	WON	FG_PCT_DIFF
GAME_ID
21700001	Boston Celtics	BOS @ CLE	0	-0.049
21700002	Golden State Warriors	GSW vs. HOU	0	0.053
21700003	Charlotte Hornets	CHA @ DET	0	-0.030
21700004	Indiana Pacers	IND vs. BKN	1	0.041
21700005	Orlando Magic	ORL vs. MIA	1	0.042

Lecture 19 – Data 100, Summer 2021¶

Motivating Logistic Regression¶

The Logistic Function¶

Logistic Regression with Squared Loss¶

Motivating Cross-Entropy Loss¶

Predicting Probabilities¶

Making Classifications¶