🎯 Lecture 23 – Data 100, Spring 2025¶

Data 100, Spring 2025

Acknowledgments Page

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
px.defaults.width = 800

from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lm

We'll continue with the games dataset from last lecture.

In [2]:
basketball = pd.read_csv("data/nba.csv")

first_team = basketball.groupby("GAME_ID").first()
second_team = basketball.groupby("GAME_ID").last()
games = first_team.merge(second_team, left_index = True, right_index = True, suffixes = ["", "_OPP"])

games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
games['WON'] = (games['WL'] == "W").astype(int)
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'AST', 'GOAL_DIFF']]

games
Out[2]:
TEAM_NAME TEAM_NAME_OPP MATCHUP WON WL AST GOAL_DIFF
GAME_ID
21700001 Boston Celtics Cleveland Cavaliers BOS @ CLE 0 L 24 -0.049
21700002 Golden State Warriors Houston Rockets GSW vs. HOU 0 L 34 0.053
21700003 Charlotte Hornets Detroit Pistons CHA @ DET 0 L 16 -0.030
21700004 Indiana Pacers Brooklyn Nets IND vs. BKN 1 W 29 0.041
21700005 Orlando Magic Miami Heat ORL vs. MIA 1 W 22 0.042
... ... ... ... ... ... ... ...
21701226 New Orleans Pelicans San Antonio Spurs NOP vs. SAS 1 W 30 0.189
21701227 Oklahoma City Thunder Memphis Grizzlies OKC vs. MEM 1 W 32 0.069
21701228 LA Clippers Los Angeles Lakers LAC vs. LAL 0 L 27 0.017
21701229 Utah Jazz Portland Trail Blazers UTA @ POR 0 L 18 -0.090
21701230 Houston Rockets Sacramento Kings HOU @ SAC 0 L 11 -0.097

1230 rows × 7 columns

As before, we will use the "GOAL_DIFF" feature to classify whether a team won (1) or lost (0) their game.

In [3]:
px.scatter(games, x="GOAL_DIFF", y="WON", color="WL", opacity=0.1)

Decision Boundaries¶

Note: We won't go over this section together during the lecture, since the slides cover the same material.

The LogisticRegression class of sklearn.linear_model behaves very similarly to the LinearRegression class. As before, we:

  1. Initialize a model object, and
  2. Fit it to our data.

You may find it helpful to recall the model formulation of a fitted logistic regression model with one input:

$$ \hat{P}_{\hat{\theta}}(Y=1 \mid X) = \sigma \left( \hat{\theta}_0 + \hat{\theta}_1 X \right) = \frac{1}{1 + e^{-(\hat{\theta}_0 + \hat{\theta}_1 X)}} $$

In [4]:
X = games[["GOAL_DIFF"]]
Y = games["WON"]

model = lm.LogisticRegression()
model.fit(X, Y)

print("Slope:", model.coef_[0][0])
print("Intercept:", model.intercept_[0])
Slope: 11.821711344721841
Intercept: -0.022895093635768988

Now, rather than predict a numeric output, we predict the probability of a datapoint belonging to Class 1. We do this using the .predict_proba method.

In [5]:
# Preview the first 10 rows
model.predict_proba(X)[:10]
Out[5]:
array([[0.64615008, 0.35384992],
       [0.35350779, 0.64649221],
       [0.5932812 , 0.4067188 ],
       [0.38656007, 0.61343993],
       [0.38376056, 0.61623944],
       [0.59042552, 0.40957448],
       [0.70312773, 0.29687227],
       [0.63252173, 0.36747827],
       [0.31156713, 0.68843287],
       [0.61588544, 0.38411456]])

By default, .predict_proba returns a 2D array.

One column contains the predicted probability that the datapoint belongs to Class 0, and the other contains the predicted probability that it belongs to Class 1 (notice that all rows sum to a total probability of 1).
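As a quick check (a small sketch, not part of the original notebook), we can confirm that each row of probabilities sums to 1:

# Each row of predict_proba should sum to 1 across the two classes.
np.allclose(model.predict_proba(X).sum(axis=1), 1.0)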

To check which is which, we can use the .classes_ attribute.

In [6]:
model.classes_
Out[6]:
array([0, 1])

This tells us that the first column contains the probabilities of belonging to Class 0 (losing the game), and the second column contains the probabilities of belonging to Class 1 (winning). Let's grab just the probabilities of Class 1.

We then apply a decision rule: Predict Class 1 if the predicted probability of belonging to Class 1 is 0.5 or higher. Otherwise, predict Class 0.

  • Remember that 0.5 is a common threshold, but we are not required to always use 0.5.
In [7]:
# Obtain P(Y=1|x) from the output.
p = model.predict_proba(X)[:, 1]

# Apply decision rule: predict Class 1 if P(Y=1|x) >= 0.5.
(p >= 0.5).astype(int)
Out[7]:
array([0, 1, 0, ..., 1, 0, 0])

By default, the .predict method of LogisticRegression applies a 0.5 threshold to classify data.

In [8]:
# .predict will automatically apply a 0.5 threshold for a logistic regression model.
classes = model.predict(X)

classes
Out[8]:
array([0, 1, 0, ..., 1, 0, 0])

The point where the sigmoid function outputs 0.5 is the decision boundary.

  • This is the point where the model is indifferent between predicting Class 0 and Class 1.

  • This is also the point where $\theta_0 + \theta_1 x = 0$.

For this one dimensional case we can solve for the $x$ value of the decision boundary:

$$ x = - \frac{\theta_0}{\theta_1} = - \frac{\text{intercept}}{\text{slope}} $$
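Plugging in the fitted intercept ($\approx -0.0229$) and slope ($\approx 11.82$) printed above:

$$ x = - \frac{-0.0229}{11.82} \approx 0.0019 $$

which matches the value computed below.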

Let's visualize our predictions.

In [9]:
-model.intercept_[0]/model.coef_[0][0]
Out[9]:
0.0019366987543636136
In [10]:
games["Predicted Class"] = pd.Categorical(classes)

test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]

fig = px.scatter(games, x="GOAL_DIFF", y="WON", color="Predicted Class", opacity=0.1)

# Add the logistic regression model predictions
fig.add_trace(go.Scatter(x=test_points["GOAL_DIFF"], y=test_points["Predicted Prob"], 
                         mode="lines", name="Logistic Regression Model", 
                         line_color="black", line_width=5, line_dash="dash"))
fig.add_vline(x = -model.intercept_[0]/model.coef_[0][0], line_dash="dash", 
              line_color="black",
              annotation_text="Decision Boundary", 
              annotation_position="right")

Any time the predicted probability $p$ is less than 0.5, the model predicts Class 0. Otherwise, it predicts Class 1.

A decision boundary describes the line that splits the data into classes based on the features.

For a model with one feature, the decision boundary is a single point that separates the two classes. In general, the decision boundary lives in the space of the features, so its plot has as many dimensions as there are features.

  • We visualize this using a 1D plot that places every data point according to its feature value alone.

  • The decision boundary is not defined in terms of the predictions, so we remove that axis from our plot.

Notice that all data points to the right of our decision boundary are classified as Class 1, while all data points to the left are classified as Class 0.

In [11]:
fig = px.scatter(games, x="GOAL_DIFF", y=np.zeros(len(games)),
                 symbol="WL", symbol_sequence=[ "circle-open", "cross"], 
                 color="Predicted Class", height=300, opacity=0.1)
# fig.update_traces(marker_symbol='line-ns-open')
fig.update_traces(marker_size=8)
fig.update_layout(
    yaxis=dict(showticklabels=False, showgrid=False, zeroline=False, title=""),
)

decision_boundary =  -model.intercept_[0]/model.coef_[0][0]
fig.add_vline(x = decision_boundary, line_dash="dash", 
              line_color="black",
              annotation_text="Decision Boundary", 
              annotation_position="top right")

Two Features¶

Note: We won't go over this section together during the lecture, since the slides cover the same material.

We can repeat this process with a model that has two features: "AST" and "GOAL_DIFF". Now, the decision boundary is expressed in terms of both features.

How do we find the decision boundary in this case? We calculate the equation for the line that gives us all the points for which the model output is equal to the threshold.

  • Recall that the linear combination $x^T \hat{\theta}$ in logistic regression is the estimated log odds that a data point with features $x$ belongs to Class 1.

  • So, we can equivalently express our decision boundary as a probability threshold or a log odds threshold. The log odds are linear in the features, so our decision boundary is a line in the feature space!

$$\text{Probability threshold} = T = \frac{1}{1+e^{-\theta_0 -\theta_1\times\text{GOAL\_DIFF}-\theta_2\times\text{AST}}}$$

$$\Longrightarrow$$

$$\text{Log odds threshold} = \log \frac{T}{1-T} = \theta_0 + \theta_1\times\text{GOAL\_DIFF} + \theta_2\times\text{AST}$$

A probability threshold of 0.5 corresponds to a log odds threshold of $\log \frac{0.5}{1-0.5} = \log (1) = 0$.
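Setting the threshold to $T = 0.5$ (a log odds threshold of 0) and solving for AST in terms of GOAL_DIFF gives the boundary line that the code below constructs:

$$ \text{AST} = \frac{-(\theta_0 + \theta_1\times\text{GOAL\_DIFF})}{\theta_2} $$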

In [12]:
X_two_feature = games[["GOAL_DIFF", "AST"]]
Y = games["WON"]

two_feature_model = lm.LogisticRegression()
two_feature_model.fit(X_two_feature, Y)

# This function plots the decision boundary such that AST is a function of GOAL_DIFF.
theta0 = two_feature_model.intercept_
theta1, theta2 = two_feature_model.coef_[0]
print(theta0, theta1, theta2)
[-2.1118332] 10.785521824889493 0.09027541671741883

Make predictions using the new model:

In [13]:
games["Predicted Class"] = two_feature_model.predict(X_two_feature)
games.head()
Out[13]:
TEAM_NAME TEAM_NAME_OPP MATCHUP WON WL AST GOAL_DIFF Predicted Class
GAME_ID
21700001 Boston Celtics Cleveland Cavaliers BOS @ CLE 0 L 24 -0.049 0
21700002 Golden State Warriors Houston Rockets GSW vs. HOU 0 L 34 0.053 1
21700003 Charlotte Hornets Detroit Pistons CHA @ DET 0 L 16 -0.030 0
21700004 Indiana Pacers Brooklyn Nets IND vs. BKN 1 W 29 0.041 1
21700005 Orlando Magic Miami Heat ORL vs. MIA 1 W 22 0.042 1

In the following, we compute the decision boundary for this model:

In [14]:
# Construct the decision boundary
decision_boundary = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})

# Compute the y-values of the decision boundary (AST) using a grid of x-values (GOAL_DIFF).
# The decision boundary is defined by the equation:
# 0 = theta0 + theta1 * GOAL_DIFF + theta2 * AST
decision_boundary["AST"] = (theta0 + theta1*decision_boundary["GOAL_DIFF"])/(-theta2)

We can plot the decision boundary alongside the true class labels, and the predicted class labels.

Notice that there are a lot of misclassifications!

In [15]:
games['Predicted Class'] = pd.Categorical(games['Predicted Class'])
fig = px.scatter(games, x="GOAL_DIFF", y="AST", symbol="WL", 
                 hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
                 color="Predicted Class", 
                 symbol_sequence=[ "circle-open", "cross"],
                 opacity=0.7,
                 height=600)
fig.update_traces(marker=dict(size=8))
fig.update_layout(xaxis_range=[-0.3, 0.3], yaxis_range=[5, 50])
# Add the decision boundary to the plot
fig.add_scatter(x=decision_boundary["GOAL_DIFF"], y=decision_boundary["AST"],
                mode="lines", line_color="black", line_dash="dash", 
                name="Decision Boundary")

Adding the probabilities to the plot, with lighter shades corresponding to estimated probabilities closer to 0:

In [16]:
goal_diff, ast = np.meshgrid(np.linspace(-0.3, 0.3, 50), np.linspace(5, 50, 50))
pred_grid = pd.DataFrame({"GOAL_DIFF": np.ravel(goal_diff), "AST": np.ravel(ast)})
pred_grid['Probability'] = two_feature_model.predict_proba(pred_grid)[:, 1]
# fig = go.Figure()
fig.add_contour(x=pred_grid['GOAL_DIFF'], y=pred_grid['AST'], z=pred_grid['Probability'],
                showscale=False, opacity=0.4, colorscale="Matter")

Linear Separability¶

Note: We won't go over this section together during the lecture, since the slides cover the same material.

A linearly separable dataset is one that can be perfectly separated into two classes by a hyperplane in the feature space.

  • A hyperplane is a decision boundary extended to arbitrarily many dimensions. For example, a model with three features would have a flat plane (a 2D hyperplane) in 3D feature space as its decision boundary.
In [17]:
import seaborn as sns
iris = sns.load_dataset("iris")

This dataset is linearly separable:

In [18]:
fig = px.scatter(iris[iris["species"] != "virginica"], 
                 x = "petal_length",
                 y = "petal_width", 
                 color="species", 
                 symbol="species", symbol_sequence=[ "circle", "cross"],
                 render_mode="svg")
fig.update_traces(marker=dict(size=12))
fig

And this dataset is not.

In [19]:
fig = px.scatter(iris[iris["species"] != "setosa"], 
                 x = "petal_length",
                 y = "petal_width", 
                 color="species", 
                 symbol="species", symbol_sequence=[ "circle", "cross"],
                 render_mode="svg")
fig.update_traces(marker=dict(size=12))
fig

When our data is linearly separable, we run the risk of diverging weights as the model attempts to reduce cross-entropy loss to 0.

To see why, consider the following artificially generated "toy" dataset.

In [20]:
toy_df = pd.DataFrame({"x": [-1, 1], "y": [0, 1], "label": pd.Categorical([0, 1])})
fig = px.scatter(toy_df, x="x", y="y", 
                 color="label", symbol="label", 
                 symbol_sequence=[ "circle", "cross"],
                 render_mode="svg")
fig.update_traces(marker=dict(size=12))

Let's look at the mean cross-entropy loss surface for this toy dataset and a single-feature model $\hat{y} = \sigma(\theta_1 x)$.

For this situation, our logistic regression model takes the form:

$$ \Large \hat{P}_{\theta}(Y = 1 | x) = \sigma(\theta_1 x) = \frac{1}{1 + e^{-\theta_1 x}} $$

With mean cross-entropy loss:

\begin{align} \hat{\theta}_1 &= \underset{\theta_1}{\operatorname{argmin}} - \frac{1}{n} \sum_{i=1}^n \left( y_i \log (\sigma(\theta_1 x_i)) + (1 - y_i) \log (1 - \sigma(\theta_1 x_i)) \right) \\ &= \underset{\theta_1}{\operatorname{argmin}} -\frac{1}{2} \left[ \log (1 - \sigma(-\theta_1)) + \log (\sigma(\theta_1)) \right] \\ &= \underset{\theta_1}{\operatorname{argmin}} - \log (\sigma(\theta_1)) \end{align}

In [21]:
def toy_model(theta1, x):
    return 1/(1 + np.exp(-theta1 * x))

def mean_cross_entropy_loss_toy(theta1):
    # Here we use 1 - sigma(z) = sigma(-z) to improve numerical stability.
    return - np.mean(toy_df['y'] * np.log(toy_model(theta1, toy_df['x'])) + \
                     (1-toy_df['y']) * np.log(toy_model(theta1, -toy_df['x'])))
In [22]:
thetas = np.linspace(-30, 30, 100)
fig = px.line(x=thetas, y = [mean_cross_entropy_loss_toy(theta) for theta in thetas], 
              render_mode="svg")
fig.update_layout(xaxis_title="Theta", yaxis_title="Mean CE Loss",
                  title="Mean Cross Entropy Loss for Toy Example")

Let's switch the y-axis to log scale to better visualize the loss surface for larger $\theta$.

In [23]:
fig = px.line(x=thetas, y = [mean_cross_entropy_loss_toy(theta) for theta in thetas],
              log_y=True, render_mode="svg")
fig.update_layout(xaxis_title="Theta", yaxis_title="Log Scale Mean CE Loss",
                  title="Log Scale Mean Cross Entropy Loss for Toy Example")

We can keep decreasing the loss if we increase the value of $\theta$.

If left unchecked, the logistic regression model will attempt to use infinite values as the "optimal" model parameters. We describe this phenomenon as the model weights "diverging".

We can use regularization to restrict how large the model parameters can be.

In [24]:
def regularized_loss_toy(theta1, reg):
    return mean_cross_entropy_loss_toy(theta1) + reg * theta1**2
In [25]:
reg = 0.01 # Small amount of regularization
fig = px.line(x=thetas, y = [regularized_loss_toy(theta, reg) for theta in thetas],
              render_mode="svg")
fig.update_layout(xaxis_title="Theta", yaxis_title="Mean CE Loss",
                  title=f"Mean Cross Entropy Loss for Toy Example (Regularization = {reg})")

Much better!

〰️ Regularization in sklearn¶

This is the only section we will go over together during lecture!

By default, sklearn's LogisticRegression applies L2 regularization for us (with C=1.0).

Here, a larger C means less regularization, since $\lambda=\frac{1}{C}$.

In [26]:
# LogisticRegression objects behave a lot like LinearRegression objects.
# L2 regularization is applied by default, where lambda = 1 / C.
# Bigger C means less regularization.
toy_model = lm.LogisticRegression(C=10)

# We fit to two data points: (-1, 0) and (1, 1).
toy_model.fit([[-1], [1]], [0,1])

# Generate estimated probabilities across a range of x-values.
xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

fig = px.scatter(toy_df, x="x", y="y", 
         color="label", symbol="label", 
         symbol_sequence=["circle", "cross"],
         title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
         render_mode="svg")
fig.update_traces(marker=dict(size=15))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="LR Model with C=10", 
                line_color="black", opacity=0.5)

When we reduce the amount of regularization, notice that the fitted slope is larger, so the curve fits the two data points more closely.

In [27]:
# Fit exactly the same model, but reduce the regularization strength by 
# a factor of 100.
toy_model = lm.LogisticRegression(C=1000)
toy_model.fit([[-1], [1]], [0,1])

xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

fig = px.scatter(toy_df, x="x", y="y", 
                 color="label", symbol="label", 
                 symbol_sequence=[ "circle", "cross"],
                 title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
                 render_mode="svg")
fig.update_traces(marker=dict(size=15))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="LR model with C=1000", 
                line_color="black", opacity=0.5)

🎯 Performance Metrics¶

Note: We won't cover this section together during lecture, since the same material is covered in the slides.

Let's return to our games data. We'll compute the accuracy of our model on this data.

In [28]:
def accuracy(X, Y):
    return np.mean(model.predict(X) == Y)

print(model.predict(X)[:5])
print(Y[:5])
accuracy(X, Y)
[0 1 0 1 1]
GAME_ID
21700001    0
21700002    0
21700003    0
21700004    1
21700005    1
Name: WON, dtype: int64
Out[28]:
0.7943089430894309

As per usual, scikit-learn can do this for us. The .score method of a LogisticRegression classifier provides the accuracy.

In [29]:
model.score(X, Y)
Out[29]:
0.7943089430894309

Important Note: model.predict and model.score use a threshold of 0.5. To use a different threshold, you must use model.predict_proba and work with probabilities directly.
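As an illustration (a small sketch, not part of the original demo), a custom threshold can be applied to the fitted model and X from above; the threshold value 0.7 here is just an example:

# Apply a custom threshold T instead of the default 0.5.
T = 0.7
custom_predictions = (model.predict_proba(X)[:, 1] >= T).astype(int)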

Confusion matrix¶

scikit-learn has a built-in confusion_matrix function in sklearn.metrics.

In [30]:
from sklearn.metrics import confusion_matrix

# Be careful – confusion_matrix takes in y_true as the first parameter and y_pred as the second.
# Don't mix these up!
cm = confusion_matrix(Y, model.predict(X))
cm
Out[30]:
array([[511, 114],
       [139, 466]])
In [31]:
fig = px.imshow(cm, x=["0", "1"], y=["0", "1"],
          labels=dict(x="Predicted", y="Actual"), 
          text_auto=True, 
          color_continuous_scale="Blues", 
          width=400, height=400)
fig.update_xaxes(side="top")

Precision and Recall¶

We can also compute the number of TP, TN, FP, and FN for our classifier, and then its precision and recall.

In [32]:
Y_hat = model.predict(X)
tp = np.sum((Y_hat == 1) & (Y == 1))
tn = np.sum((Y_hat == 0) & (Y == 0))

fp = np.sum((Y_hat == 1) & (Y == 0))
fn = np.sum((Y_hat == 0) & (Y == 1))


print("True Positives: ", tp)
print("True Negatives: ", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
True Positives:  466
True Negatives:  511
False Positives: 114
False Negatives: 139

These numbers match what we see in the confusion matrix above.

Precision and Recall¶

Precision -- How precise are my positive predictions? In other words, what fraction of the points the model predicted as positive are actually positive?

In [33]:
precision = tp / (tp + fp)
precision
Out[33]:
0.803448275862069

Recall -- What proportion of actual positives did my model recall in its predictions? In other words, what proportion of actual positive cases were correctly identified by the model?

In [34]:
recall = tp / (tp + fn)
recall
Out[34]:
0.7702479338842976

True and False Positive Rates¶

The TP, TN, FP, and FN we just calculated also allow us to compute the true and false positive rates (TPR and FPR). Recall that TPR is the same as recall.

In [35]:
fpr = fp/(fp + tn)
fpr
Out[35]:
0.1824
In [36]:
tpr = tp/(tp + fn)
tpr
Out[36]:
0.7702479338842976

It's important to remember that these values are all for the threshold of $T = 0.5$, which is scikit-learn's default.

🎛️ Adjusting the Classification Threshold¶

Note: We won't go through this section of the demo together during lecture, since the slides cover the same material.

Before, we used a threshold of 0.5 in our decision rule: if the predicted probability was 0.5 or higher, we predicted Class 1; otherwise, we predicted Class 0.

In [37]:
X = games[["GOAL_DIFF"]]
Y = games["WON"]
model = lm.LogisticRegression()
model.fit(X, Y)
print("Slope:", model.coef_[0][0])
print("Intercept:", model.intercept_[0])
Slope: 11.821711344721841
Intercept: -0.022895093635768988
In [38]:
def plot_predictions(threshold = 0.5):
    games["Predicted Class"] = model.predict_proba(X)[:, 1] >= threshold
    # Needed for plotting
    games["Predicted Class"] = pd.Categorical(games["Predicted Class"])
    fig = px.scatter(games, 
            x="GOAL_DIFF", y="WON", color="Predicted Class", 
            title=f"Logistic Regression Predictions (Threshold = {threshold})")
    # Add the logistic regression model predictions
    # Make the data points for the LR model curve
    test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
    test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]
    fig.add_trace(go.Scatter(x=test_points["GOAL_DIFF"], y=test_points["Predicted Prob"], 
                            mode="lines", name="Logistic Regression Model", 
                            line_color="black", line_width=5, line_dash="dash"))
    decision_boundary = (-np.log(1/threshold - 1) - model.intercept_[0])/model.coef_[0][0]
    fig.add_vline(x = decision_boundary, line_dash="dash", line_color="black",
                  annotation_text="Decision Boundary", annotation_position="right")
    return fig

plot_predictions(0.5)

What happens if we change the threshold? Below, we apply a threshold of $T=0.25$.

In [39]:
plot_predictions(0.25)

When we lower the threshold, we require a lower predicted probability before we predict Class 1. We can think of this as telling our model that it needs to be less "confident" that a data point is Class 1 before making a positive prediction. The total number of data points predicted to be Class 1 either stays the same or increases.

The converse happens if we raise the threshold. Consider setting $T=0.75$. Now, we require a higher predicted probability before we predict Class 1. The total number of data points predicted to be Class 1 decreases.

In [40]:
plot_predictions(0.75)

Thresholds and Performance Metrics¶

Note: We won't cover this section together during lecture, since the same material is covered in the slides.

How does changing the threshold impact our performance metrics?

Let's run an experiment: we'll test out several different possible thresholds.

  • For each threshold $T$, we'll make a decision rule where we classify any point with a predicted probability equal to or greater than $T$ as being in Class 1.

  • Otherwise, we'll predict Class 0.

  • We'll then compute the overall accuracy of the classifier when using that threshold.

In [41]:
# Define performance metrics dependent on the threshold value.
def predict_threshold(model, X, T): 
    prob_one = model.predict_proba(X)[:, 1]
    return (prob_one >= T).astype(int)

def accuracy_threshold(X, Y, T):
    return np.mean(predict_threshold(model, X, T) == Y)

def precision_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    denominator = np.sum(Y_hat == 1)
    if denominator == 0:
        denominator = np.nan
    return np.sum((Y_hat == 1) & (Y == 1)) / denominator
    
def recall_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)

def tpr_threshold(X, Y, T): # Same as recall
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)

def fpr_threshold(X, Y, T):
    Y_hat = predict_threshold(model, X, T)
    return np.sum((Y_hat == 1) & (Y == 0)) / np.sum(Y == 0)
In [42]:
metrics = pd.DataFrame()
metrics["Threshold"] = np.linspace(0, 1, 1000)
metrics["Accuracy"] = [accuracy_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["Precision"] = [precision_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["Recall"] = [recall_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics.head()
Out[42]:
Threshold Accuracy Precision Recall
0 0.000000 0.49187 0.49187 1.0
1 0.001001 0.49187 0.49187 1.0
2 0.002002 0.49187 0.49187 1.0
3 0.003003 0.49187 0.49187 1.0
4 0.004004 0.49187 0.49187 1.0
In [43]:
fig = px.line(metrics, 
        x="Threshold", y="Accuracy",
        title="Accuracy vs. Threshold",
        render_mode="svg", width=600, height=600)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

If we look at the threshold that maximizes accuracy, we find it is close to $T=0.49$.

In [44]:
# The threshold that maximizes accuracy.
metrics.sort_values("Accuracy", ascending=False).head()
Out[44]:
Threshold Accuracy Precision Recall
488 0.488488 0.798374 0.793103 0.798347
489 0.489489 0.798374 0.793103 0.798347
490 0.490490 0.798374 0.793103 0.798347
485 0.485485 0.797561 0.788026 0.804959
486 0.486486 0.797561 0.788026 0.804959

It turns out that setting $T=0.5$ does not always result in the best performance! Part of the model design process for classification includes choosing an appropriate threshold value.

Precision-Recall Curves¶

In the lecture, we noted that there is a tradeoff between precision and recall.

Precision $=\frac{TP}{\text{Positive Predictions}}=\frac{TP}{TP+FP}$ increases as the number of false positives decreases, which occurs as the threshold is raised, since raising the threshold tends to reduce the number of positive predictions.

Recall $=\frac{TP}{\text{Actual Class 1s}}=\frac{TP}{TP+FN}$ increases as the number of false negatives decreases, which occurs as the threshold is lowered, since lowering the threshold tends to decrease the number of negative predictions.

We want to keep both precision and recall high. To do so, we'll need to strategically choose a threshold value.

In [45]:
fig = px.line(metrics, 
        x="Threshold", y=["Accuracy", "Precision", "Recall"],
        title="Performance Metrics vs. Threshold",
        render_mode="svg", height=600, width=600)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

A precision-recall curve tests out many possible thresholds. Each point on the curve represents the precision and recall of the classifier for a particular choice of threshold.

We choose a threshold value that keeps both precision and recall high (usually near the upper-right "corner" of the curve).

In [46]:
fig = px.line(metrics, x="Recall", y="Precision",
        title="Precision vs. Recall",
        width=600, height=600,
        render_mode="svg")
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

One way to balance precision and recall is to compute the F1 score. The F1 score is the harmonic mean of precision and recall:

$$F1 = 2 \times \frac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$$

In [47]:
metrics["F1"] = (2 * metrics["Precision"] * metrics["Recall"] 
                     / (metrics["Precision"] + metrics["Recall"]))
ind = metrics['F1'].idxmax()
metrics.loc[ind,:]
Out[47]:
Threshold    0.482482
Accuracy     0.796748
Precision    0.784912
Recall       0.808264
F1           0.796417
Name: 482, dtype: float64
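For comparison (a small sketch, not part of the original notebook), scikit-learn's f1_score computes the F1 score of the default-threshold (0.5) predictions:

from sklearn.metrics import f1_score

# F1 score at the default 0.5 threshold, for comparison with the maximum above.
f1_score(Y, model.predict(X))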
In [48]:
fig = px.line(metrics, x="Threshold", y="F1",
              title="Finding F1 Score Maximum",
              render_mode="svg",
              height=600, width=600)

fig.add_scatter(x=[metrics.loc[ind, 'Threshold']], y=[metrics.loc[ind, 'F1']], 
                mode='markers', marker=dict(size=10, color='red'),
                name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}",)

fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()
In [49]:
fig = px.line(metrics, x="Recall", y="Precision",
              title="Precision vs. Recall", width=600, height=600,
              render_mode="svg")
fig.add_scatter(x=[metrics.loc[ind, 'Recall']], y=[metrics.loc[ind, 'Precision']], 
                mode='markers', marker=dict(size=10, color='red'),
                name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}")
fig.update_layout(legend=dict(x=.5, y=.1))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

ROC Curves¶

We can repeat a similar experiment for the FPR and TPR. Remember that we want to keep FPR low and TPR high.

In [50]:
metrics["TPR"] = [tpr_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["FPR"] = [fpr_threshold(X, Y, t) for t in metrics["Threshold"]]
In [51]:
fig = px.line(metrics, x="Threshold", y=["TPR", "FPR", "Accuracy"],
        render_mode="svg", width=600, height=600)
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

A ROC curve tests many possible decision rule thresholds. For each possible threshold, it plots the corresponding TPR and FPR of the classifier.

"ROC" stands for "Receiver Operating Characteristic". It comes from the field of signal processing.

In [52]:
fig = px.line(metrics, x="FPR", y="TPR", title="ROC Curve", 
        width=600, height=600,
        render_mode="svg")
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()

Ideally, a perfect classifier would have an FPR of 0 and a TPR of 1. The area under the perfect classifier's ROC curve is 1.

We often use the area under the ROC curve (abbreviated "AUC") as an indicator of model performance. The closer the AUC is to 1, the better.
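As an aside (a small sketch, not part of the original demo), scikit-learn can compute the AUC directly from the predicted probabilities:

from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the predicted P(Y=1|x) for each game.
roc_auc_score(Y, model.predict_proba(X)[:, 1])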

In [53]:
fig = px.line(metrics, x="FPR", y="TPR", title="ROC Curve", 
              width=600, height=600,
              render_mode="svg")
fig.add_scatter(x=[0,0,1], y=[0,1,1], mode='lines', 
                line_dash='dash', line_color='black',
                name="Perfect Classifier")
# move the legend inside the plot
fig.update_layout(legend=dict(x=.5, y=.1))
fig.update_layout(
  xaxis_title=dict(font=dict(size=22)),
  yaxis_title=dict(font=dict(size=22))
)
fig.show()