by Suraj Rampure
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from scipy.optimize import minimize
import sklearn.linear_model as lm
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3
sns.set()
So far, our logistic regression model predicts probabilities. But we originally set out on a mission to create a classifier. How can we use our predicted probabilities to create classifications?
Let's bring back the NBA data we used last time.
df = pd.read_csv('nba.csv')
df["WON"] = df["WL"]
df["WON"] = df["WON"].replace("W", 1)
df["WON"] = df["WON"].replace("L", 0)
one_team = df.groupby("GAME_ID").first()
opponent = df.groupby("GAME_ID").last()
games = one_team.merge(opponent, left_index = True, right_index = True, suffixes = ["", "_OPP"])
games["FG_PCT_DIFF"] = games["FG_PCT"] - games["FG_PCT_OPP"]
games["PF_DIFF"] = games["PF"] - games["PF_OPP"]
games['WON'] = games['WL'].replace('L', 0).replace('W', 1)
games = games[['TEAM_NAME', 'MATCHUP', 'WON', 'FG_PCT_DIFF', 'PF_DIFF']]
games.head()
GAME_ID | TEAM_NAME | MATCHUP | WON | FG_PCT_DIFF | PF_DIFF
---|---|---|---|---|---
21700001 | Boston Celtics | BOS @ CLE | 0 | -0.049 | -1
21700002 | Golden State Warriors | GSW vs. HOU | 0 | 0.053 | 9
21700003 | Charlotte Hornets | CHA @ DET | 0 | -0.030 | -6
21700004 | Indiana Pacers | IND vs. BKN | 1 | 0.041 | 0
21700005 | Orlando Magic | ORL vs. MIA | 1 | 0.042 | -2
Let's call this model basic_model
since it only has one feature. (Eventually, we will use more features.)
It is the same model we fit in the last lecture.
basic_model = lm.LogisticRegression(penalty = 'none', fit_intercept = False, solver = 'lbfgs')
basic_model.fit(games[['FG_PCT_DIFF']], games['WON'])
LogisticRegression(fit_intercept=False, penalty='none')
As before, we can use .predict_proba
to get the predicted probabilities for each class under our logistic regression model.
basic_model.predict_proba(games[['FG_PCT_DIFF']])
array([[0.81733506, 0.18266494],
       [0.16510648, 0.83489352],
       [0.71450899, 0.28549101],
       ...,
       [0.37288695, 0.62711305],
       [0.94003495, 0.05996505],
       [0.95102413, 0.04897587]])
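Each row contains two probabilities that sum to 1, and the columns are ordered according to basic_model.classes_, so the second column is $P(Y = 1 | x)$:
basic_model.classes_  # the columns of predict_proba correspond to these classes, in this order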
We can plot our model, too:
x_sorted = np.array(games['FG_PCT_DIFF'].sort_values()).reshape(len(games), 1)
basic_model_ps_sorted = basic_model.predict_proba(x_sorted)[:, 1]
points = go.Scatter(name = 'true observations',
x = games['FG_PCT_DIFF'],
y = games['WON'],
mode = 'markers',
marker={'opacity':0.5})
lr_line = go.Scatter(name = 'Logistic Regression model',
x = x_sorted.flatten(),
y = basic_model_ps_sorted)
fig = go.Figure([points, lr_line])
fig
We need to apply a threshold to convert these probabilities into class labels (1 or 0). We can define a function that takes in probabilities and a threshold, and returns the corresponding predicted labels:
def predict_at_threshold(prob, threshold):
return np.where(prob >= threshold, 1, 0)
predict_at_threshold(np.array([0.2, 0.5, 0.8]), 0.4)
array([0, 1, 1])
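As a quick sanity check, thresholding the class-1 probabilities at 0.5 should essentially reproduce scikit-learn's default .predict output (up to how exact ties at 0.5 are broken):
# Fraction of games where our threshold-0.5 predictions agree with .predict — should be (nearly) 1.
probs_05 = basic_model.predict_proba(games[['FG_PCT_DIFF']])[:, 1]
np.mean(predict_at_threshold(probs_05, 0.5) == basic_model.predict(games[['FG_PCT_DIFF']]))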
Let's look at the resulting predictions for three different (arbitrarily chosen) thresholds: $T = 0.25, T = 0.5, T = 0.75$.
ts = [0.25, 0.5, 0.75]
thresholds_df = pd.DataFrame()
basic_model_ps = basic_model.predict_proba(games[['FG_PCT_DIFF']])[:, 1]
thresholds_df['P(Y = 1 | x)'] = basic_model_ps
for t in ts:
thresholds_df[f'T = {t}'] = predict_at_threshold(basic_model_ps, t)
thresholds_df
 | P(Y = 1 | x) | T = 0.25 | T = 0.5 | T = 0.75
---|---|---|---|---
0 | 0.182665 | 0 | 0 | 0
1 | 0.834894 | 1 | 1 | 1
2 | 0.285491 | 1 | 0 | 0
3 | 0.777950 | 1 | 1 | 1
4 | 0.783187 | 1 | 1 | 1
... | ... | ... | ... | ...
1225 | 0.996919 | 1 | 1 | 1
1226 | 0.891870 | 1 | 1 | 1
1227 | 0.627113 | 1 | 1 | 0
1228 | 0.059965 | 0 | 0 | 0
1229 | 0.048976 | 0 | 0 | 0

1230 rows × 4 columns
plt.figure(figsize = (16, 4))
for i, t in enumerate(ts):
plt.subplot(1, 3, i + 1)
plt.xlim(-0.2, 0.2)
plt.plot(x_sorted, basic_model_ps_sorted);
plt.plot([-0.3, 0.3], [t, t], color = 'r')
plt.title(f'T = {t}')
plt.xlabel('FG_PCT_DIFF')
plt.ylabel(r'$P(Y = 1 | x)$')
As we increase our threshold, fewer and fewer observations are classified as being 1.
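We can also see this numerically by counting how many games get classified as wins at each of the three thresholds:
# Number of predicted 1s at each threshold — it shrinks as the threshold grows.
[(t, int(predict_at_threshold(basic_model_ps, t).sum())) for t in ts]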
What about models with more than one feature? If our model has more than two features, we can't really visualize it. But with two features (three parameters, once we include an intercept term), we can visualize it just fine.
Let's now use FG_PCT_DIFF
and PF_DIFF
to predict whether or not a team will win. PF_DIFF
is the difference in the number of personal fouls your team and the other team were charged with.
better_model = lm.LogisticRegression(penalty = 'none', fit_intercept = True, solver = 'lbfgs')
better_model.fit(games[['FG_PCT_DIFF', 'PF_DIFF']], games['WON'])
LogisticRegression(penalty='none')
better_model.coef_
array([[34.70520847, -0.15994445]])
better_model.intercept_
array([0.03455392])
These results mean that our model makes predictions as
$$\hat{y} = P(Y = 1 | x) = \sigma(0.035 + 34.71 \cdot \text{FG\_PCT\_DIFF} - 0.160 \cdot \text{PF\_DIFF})$$
Interpreting these coefficients: having a positive FG_PCT_DIFF really helps your team's chances of winning, while having a positive PF_DIFF hurts your team's chances of winning.
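One way to quantify this, using the standard odds-ratio interpretation of logistic regression coefficients: exponentiating a coefficient gives the multiplicative change in the predicted odds of winning per one-unit increase in that feature. For instance, one additional personal foul relative to the opponent multiplies the odds of winning by roughly $e^{-0.160} \approx 0.85$.
np.exp(better_model.coef_)  # odds multipliers per one-unit increase in FG_PCT_DIFF and PF_DIFF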
games['FG_PCT_DIFF'].describe()
count    1230.000000
mean       -0.001329
std         0.075473
min        -0.251000
25%        -0.050000
50%        -0.002000
75%         0.048000
max         0.281000
Name: FG_PCT_DIFF, dtype: float64
games['PF_DIFF'].describe()
count    1230.000000
mean        0.301626
std         5.160492
min       -20.000000
25%        -3.000000
50%         0.000000
75%         4.000000
max        17.000000
Name: PF_DIFF, dtype: float64
num_points = 50 # increase for better resolution, but it will run more slowly.
uvalues = np.linspace(-0.2, 0.2, num_points)
vvalues = np.linspace(-20, 20, num_points)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
probs = [better_model.predict_proba(t.reshape(1, 2))[0, 1] for t in thetas.T]
probs_surface = go.Surface(x=u, y=v, z=np.reshape(probs, u.shape))
fig = go.Figure(data=[probs_surface])
fig.update_layout(scene = dict(
xaxis_title = "FG_PCT_DIFF",
yaxis_title = "PF_DIFF",
zaxis_title = "P(Y = 1 | x)"))
fig.show()
We can visualize thresholds here, too. It'd be a little inconvenient to do this three times, so let's just show $T = 0.3$ (chosen arbitrarily):
num_points = 50 # increase for better resolution, but it will run more slowly.
uvalues = np.linspace(-0.2, 0.2, num_points)
vvalues = np.linspace(-20, 20, num_points)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
probs = [better_model.predict_proba(t.reshape(1, 2))[0, 1] for t in thetas.T]
probs_surface = go.Surface(x=u, y=v, z=np.reshape(probs, u.shape))
threshold_surface = go.Surface(x = u, y = v, z = 0.3 * np.ones_like(u), opacity = 0.8, name = 'T = 0.3')
fig = go.Figure(data=[probs_surface, threshold_surface])
fig.update_layout(scene = dict(
xaxis_title = "FG_PCT_DIFF",
yaxis_title = "PF_DIFF",
zaxis_title = "P(Y = 1 | x)"))
fig.show()
First, let's compute the accuracy of our better_model
, when using $T = 0.5$ (the default in scikit-learn
).
# Getting our predicted 1s and 0s
y_pred = better_model.predict(games[['FG_PCT_DIFF', 'PF_DIFF']])
# The proportion of times we classified correctly is the accuracy
better_model_accuracy = np.mean(y_pred == games['WON'])
better_model_accuracy
0.8170731707317073
As per usual, scikit-learn
can do this for us. The .score
method of a LogisticRegression
classifier returns its accuracy.
better_model.score(games[['FG_PCT_DIFF', 'PF_DIFF']], games['WON'])
0.8170731707317073
Our good old friend scikit-learn
has a built-in confusion_matrix function (of course it does).
from sklearn.metrics import confusion_matrix
# Be careful – confusion_matrix takes in y_true as the first parameter and y_pred as the second.
# Don't mix these up!
cm = confusion_matrix(games['WON'], y_pred)
cm
array([[521, 104],
       [121, 484]])
cm = confusion_matrix(games['WON'], y_pred)
sns.heatmap(cm, annot=True, fmt = 'd', cmap = 'Blues', annot_kws = {'size': 16})
plt.xlabel('Predicted')
plt.ylabel('Actual');
We can also compute the number of TP, TN, FP, and FN for our classifier, and hence its precision and recall.
tp = np.sum((y_pred == 1) & (games['WON'] == 1))
tn = np.sum((y_pred == 0) & (games['WON'] == 0))
fp = np.sum((y_pred == 1) & (games['WON'] == 0))
fn = np.sum((y_pred == 0) & (games['WON'] == 1))
tp, tn, fp, fn
(484, 521, 104, 121)
These numbers match what we see in the confusion matrix above.
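As an aside: since scikit-learn's confusion_matrix puts true labels on the rows and predicted labels on the columns, we can also unpack these four counts directly with .ravel() (a quick sketch; the underscored names just avoid clobbering the variables above).
# For labels ordered [0, 1], cm is [[TN, FP], [FN, TP]], so ravel() gives the counts row by row.
tn_, fp_, fn_, tp_ = cm.ravel()
tp_, tn_, fp_, fn_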
precision = tp / (tp + fp)
precision
0.8231292517006803
recall = tp / (tp + fn)
recall
0.8
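As a cross-check, scikit-learn's built-in precision_score and recall_score should give the same values as our hand computation:
from sklearn.metrics import precision_score, recall_score
precision_score(games['WON'], y_pred), recall_score(games['WON'], y_pred)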
It's important to remember that these values are all for the threshold of $T = 0.5$, which is scikit-learn
's default.
We already have a function predict_at_threshold
, which takes in a list of probabilities and a threshold and computes the predicted labels (1s and 0s) at that threshold.
Let's also create our own accuracy_custom_threshold
function, which takes in:
- model
- X
- y
- threshold
And returns the accuracy of the predictions for our model
with that threshold. Note that this function is not specific to our better_model
, so we can use it later.
def accuracy_custom_threshold(model, X, y, threshold):
predicted_probabilities = model.predict_proba(X)[:, 1]
y_pred = predict_at_threshold(predicted_probabilities, threshold)
return np.mean(y_pred == y)
accuracy_custom_threshold(better_model, games[['FG_PCT_DIFF', 'PF_DIFF']], games['WON'], 0.5)
0.8170731707317073
This is the same accuracy that model.score
gave us before, so we should be good to go.
Let's plot what this accuracy looks like, for various thresholds:
thresholds = np.linspace(0, 1, 100)
accs = [accuracy_custom_threshold(better_model,
games[['FG_PCT_DIFF', 'PF_DIFF']],
games['WON'],
t) for t in thresholds]
fig = px.line(x=thresholds, y=accs)
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Accuracy")
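If accuracy on this dataset were all we cared about, we could read off the threshold where this curve peaks. A rough sketch (keeping in mind that we're evaluating on the same data we trained on):
# Threshold with the highest accuracy among the candidates we tried.
thresholds[np.argmax(accs)], np.max(accs)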
Let's make analogous helper functions for precision and recall, and produce similar plots.
def precision_custom_threshold(model, X, y, threshold):
predicted_probabilities = model.predict_proba(X)[:, 1]
y_pred = predict_at_threshold(predicted_probabilities, threshold)
tp = np.sum((y_pred == 1) & (y == 1))
fp = np.sum((y_pred == 1) & (y == 0))
return tp / (tp + fp)
precision_custom_threshold(better_model, games[['FG_PCT_DIFF', 'PF_DIFF']], games['WON'], 0.5)
0.8231292517006803
def recall_custom_threshold(model, X, y, threshold):
predicted_probabilities = model.predict_proba(X)[:, 1]
y_pred = predict_at_threshold(predicted_probabilities, threshold)
tp = np.sum((y_pred == 1) & (y == 1))
fn = np.sum((y_pred == 0) & (y == 1))
return tp / (tp + fn)
recall_custom_threshold(better_model, games[['FG_PCT_DIFF', 'PF_DIFF']], games['WON'], 0.5)
0.8
precs = [precision_custom_threshold(better_model,
games[['FG_PCT_DIFF', 'PF_DIFF']],
games['WON'],
t) for t in thresholds]
recs = [recall_custom_threshold(better_model,
games[['FG_PCT_DIFF', 'PF_DIFF']],
games['WON'],
t) for t in thresholds]
/tmp/ipykernel_12353/566441887.py:6: RuntimeWarning: invalid value encountered in long_scalars
(This warning comes from thresholds near 1: nothing gets classified as a 1 there, so tp + fp is 0 and the precision computation evaluates 0/0, producing a NaN.)
fig = go.Figure()
fig.add_trace(go.Scatter(name = 'Precision', x = thresholds, y = precs))
fig.add_trace(go.Scatter(name = 'Recall', x = thresholds, y = recs))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")
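One common way to choose a single threshold that balances the two curves is to maximize the F1 score, the harmonic mean of precision and recall. A minimal sketch using the lists we just computed (treating the undefined precisions near $T = 1$ as 0):
# F1 = 2 * precision * recall / (precision + recall); NaN precisions fail the > 0 check and become 0.
f1s = [2 * p * r / (p + r) if (p + r) > 0 else 0 for p, r in zip(precs, recs)]
thresholds[np.argmax(f1s)]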
We can also plot precision vs. recall.
fig = px.line(x=recs, y=precs, hover_name=thresholds)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
scikit-learn
can also do this for us.
from sklearn.metrics import precision_recall_curve
precision, recall, threshold = precision_recall_curve(games['WON'],
better_model.predict_proba(games[['FG_PCT_DIFF', 'PF_DIFF']])[:, 1])
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
Same thing, just a bit more granular.
We can also plot a ROC curve (explained in lecture).
from sklearn.metrics import roc_curve
fpr, tpr, threshold = roc_curve(games['WON'],
better_model.predict_proba(games[['FG_PCT_DIFF', 'PF_DIFF']])[:, 1])
fig = px.line(x=fpr, y = tpr, hover_name=threshold)
fig.update_xaxes(title="False Positive Rate")
fig.update_yaxes(title="True Positive Rate")
fig
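A standard single-number summary of the ROC curve is the area under it (AUC), which scikit-learn computes directly. (Its value isn't shown here, so treat this as a sketch.)
from sklearn.metrics import roc_auc_score
roc_auc_score(games['WON'], better_model.predict_proba(games[['FG_PCT_DIFF', 'PF_DIFF']])[:, 1])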
For a random classifier (one whose scores are uniformly random), the ROC curve should hug the diagonal:
fpr, tpr, threshold = roc_curve(games['WON'], np.random.uniform(0, 1, len(games['WON'])))
fig = px.line(x=fpr, y = tpr, hover_name=threshold)
fig.update_xaxes(title="False Positive Rate")
fig.update_yaxes(title="True Positive Rate")
fig
from matplotlib.colors import ListedColormap
plt.figure(figsize = (8, 5))
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:2, :])
xx, yy = np.meshgrid(np.linspace(-0.4, 0.4, 2000), np.linspace(-20, 20, 2000))
Z_string = predict_at_threshold(
better_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1],
0.3)
categories, Z_int = np.unique(Z_string, return_inverse = True)
Z_int = Z_int.reshape(xx.shape)
plt.contourf(xx, yy, Z_int, cmap = sns_cmap)
sns.scatterplot(data = games, x = 'FG_PCT_DIFF', y = 'PF_DIFF', hue = 'WON')
plt.title('Decision Boundary for NBA Model');
better_model.coef_
array([[34.70520847, -0.15994445]])
better_model.intercept_
array([0.03455392])
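For reference, here's where the boundary function below comes from: at threshold $T$, the decision boundary is the set of points where the predicted probability equals $T$, i.e.
$$\sigma(\theta_0 + \theta_1 \cdot \text{FG\_PCT\_DIFF} + \theta_2 \cdot \text{PF\_DIFF}) = T \implies \text{PF\_DIFF} = \frac{\theta_0 + \theta_1 \cdot \text{FG\_PCT\_DIFF} - \log\frac{T}{1 - T}}{-\theta_2}$$
Plugging in $\theta_0 \approx 0.035$, $\theta_1 \approx 34.705$, $\theta_2 \approx -0.160$, and $T = 0.3$ gives the function f defined next.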
def f(x):
return (1/0.160) * (0.035 + 34.705*x - np.log(0.3 / 0.7))
plt.figure(figsize = (8, 5))
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:2, :])
xx, yy = np.meshgrid(np.linspace(-0.4, 0.4, 2000), np.linspace(-20, 20, 2000))
Z_string = predict_at_threshold(
better_model.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1],
0.3)
categories, Z_int = np.unique(Z_string, return_inverse = True)
Z_int = Z_int.reshape(xx.shape)
plt.contourf(xx, yy, Z_int, cmap = sns_cmap)
# sns.scatterplot(data = games, x = 'FG_PCT_DIFF', y = 'PF_DIFF', hue = 'WON')
plt.title('Decision Boundary for NBA Model');
xs = np.linspace(-0.4, 0.4, 50)
ys = f(xs)
plt.plot(xs, ys, color = 'black', linewidth=5)
plt.ylim(-20, 20);
Consider the following toy dataset.
X_toy = np.array([-1, 1])
y_toy = np.array([1, 0])
plt.scatter(X_toy.flatten(), y_toy);
Let's look at the mean cross-entropy loss surface for this toy dataset, and a single feature model $\hat{y} = \sigma(\theta x)$.
sigma = lambda t: 1 / (1 + np.e**(-t))
def cross_entropy(y, yhat):
return - y * np.log(yhat) - (1 - y) * np.log(1 - yhat)
def mce_loss_single_arg_toy(theta):
y_hat = sigma(X_toy * theta)
return np.mean(cross_entropy(y_toy, y_hat))
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [mce_loss_single_arg_toy(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy($\theta$)')
plt.xlabel(r'$\theta$');
Notice that the loss keeps decreasing as $\theta \to -\infty$: this toy dataset is linearly separable, so there is no finite minimizing $\theta$. But with regularization:
def mce_regularized_loss_single_arg_toy(theta, reg):
return mce_loss_single_arg_toy(theta) + reg * theta**2
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [mce_regularized_loss_single_arg_toy(theta, 0.1) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy($\theta$)')
plt.xlabel(r'$\theta$');
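With the regularization term added, the loss surface has a finite minimizer, which we can locate numerically with scipy.optimize.minimize (imported at the top of this notebook). A minimal sketch, using the same regularization strength of 0.1:
# Numerically find the theta that minimizes the regularized loss (starting from theta = 0).
minimize(lambda theta: mce_regularized_loss_single_arg_toy(theta[0], 0.1), x0=[0.0]).x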
Let's look at another example dataset to illustrate linear separability.
iris = sns.load_dataset('iris')
iris
 | sepal_length | sepal_width | petal_length | petal_width | species
---|---|---|---|---|---
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa
... | ... | ... | ... | ... | ...
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica

150 rows × 5 columns
The following data is linearly separable:
sns.scatterplot(data = iris[iris['species'] != 'virginica'],
x = 'petal_length',
y = 'petal_width',
hue = 'species');
And the following here is not:
sns.scatterplot(data = iris[iris['species'] != 'setosa'],
x = 'petal_length',
y = 'petal_width',
hue = 'species');
As a demo of the model fitting process from end-to-end, let's fit a regularized LogisticRegression model on the iris
data, while performing a train/test split.
Let's try to predict the species of our iris. But there are three possible values of species right now:
iris['species'].unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
So let's create a new column is_versicolor
that is 1 if the iris is a versicolor, and a 0 otherwise.
iris['is_versicolor'] = (iris['species'] == 'versicolor').astype(int)
iris
 | sepal_length | sepal_width | petal_length | petal_width | species | is_versicolor
---|---|---|---|---|---|---
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa | 0
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa | 0
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa | 0
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa | 0
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa | 0
... | ... | ... | ... | ... | ... | ...
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica | 0
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica | 0
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica | 0
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica | 0
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica | 0

150 rows × 6 columns
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
from sklearn.model_selection import train_test_split
iris_train, iris_test = train_test_split(iris, test_size = 0.2)
First, let's look at the coefficients if we fit without regularization:
iris_model_no_reg = lm.LogisticRegression(penalty = 'none', solver = 'lbfgs')
iris_model_no_reg.fit(iris_train[cols], iris_train['is_versicolor'])
LogisticRegression(penalty='none')
iris_model_no_reg.coef_
array([[-0.15087866, -3.12769719, 0.87624732, -2.01472444]])
iris_model_no_reg.intercept_
array([8.45480238])
Now let's fit with regularization, using the default value of C
(the regularization hyperparameter in scikit-learn
):
iris_model_reg = lm.LogisticRegression(penalty = 'l2', solver = 'lbfgs')
iris_model_reg.fit(iris_train[cols], iris_train['is_versicolor'])
LogisticRegression()
iris_model_reg.coef_
array([[-0.25544254, -2.10908451, 0.53440202, -0.89711453]])
iris_model_reg.intercept_
array([6.03211878])
We can see that the coefficients of the regularized model are mostly smaller in magnitude.
Let's evaluate the training and testing accuracy of both models – regularized and not.
iris_model_no_reg.score(iris_train[cols], iris_train['is_versicolor'])
0.75
iris_model_reg.score(iris_train[cols], iris_train['is_versicolor'])
0.7666666666666667
Interestingly, the regularized model happens to score slightly higher on the training data here. In general we'd expect regularization to fit the training data no better than the unregularized model; accuracy isn't the quantity being minimized, though, and the exact numbers depend on the random train/test split.
iris_model_no_reg.score(iris_test[cols], iris_test['is_versicolor'])
0.7
iris_model_reg.score(iris_test[cols], iris_test['is_versicolor'])
0.6333333333333333
In this case, the unregularized model happened to perform better on the test data. Interesting! (Since we didn't pass a random_state to train_test_split, your exact numbers may differ.)
Question: What did we forget to do here (that we should always do when performing regularized linear or logistic regression)?