import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Use Plotly Express's "Vivid" qualitative palette as the colorway of the "plotly" template.
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
# Default width applied to every Plotly Express figure created below.
px.defaults.width = 800

from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lm

# Load the box-score data and build one row per game.
basketball = pd.read_csv("data/nba.csv")

# Put the two team rows of each game side by side; the second team's columns
# get the "_OPP" suffix.
# NOTE(review): GroupBy.first()/last() return the first/last non-null entry of
# each column, so this presumably assumes exactly two complete rows per
# GAME_ID -- confirm against the data.
first_team = basketball.groupby("GAME_ID").first()
second_team = basketball.groupby("GAME_ID").last()
games = first_team.merge(second_team, left_index = True, right_index = True, suffixes = ["", "_OPP"])

# Feature: FG_PCT of the first team minus FG_PCT of its opponent.
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
# Label: 1 when the first team's WL entry is "W", otherwise 0.
games['WON'] = (games['WL'] == "W").astype(int)

# Keep only the columns used below. The explicit .copy() makes `games` an
# independent frame, so the column assignments made later in this file
# (e.g. "JitterWON", "Predicted Class") do not raise SettingWithCopyWarning
# on pandas versions without Copy-on-Write.
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'AST', 'GOAL_DIFF']].copy()
games

# Add a small vertical jitter to the 0/1 outcome so overlapping points can be
# told apart in the scatter plot. The fixed seed keeps the jitter reproducible.
np.random.seed(42)
jitter = np.random.uniform(low=-0.1, high=0.1, size=len(games))
games["JitterWON"] = games["WON"] + jitter
px.scatter(games, x="GOAL_DIFF", y="JitterWON", color="WL")

# One-feature logistic regression: predict WON from GOAL_DIFF.
Y = games["WON"]
X = games[["GOAL_DIFF"]]

# fit() returns the fitted estimator, so construction and fitting can be chained.
model = lm.LogisticRegression().fit(X, Y)
for label, value in (("Slope:", model.coef_[0][0]),
                     ("Intercept:", model.intercept_[0])):
    print(label, value)

Slope: 11.822010640307564
Intercept: -0.022954660860397533

# Preview the first 10 rows
# Each row holds the predicted probabilities for one game, one column per class.
model.predict_proba(X)[:10]

array([[0.64616706, 0.35383294],
       [0.35351778, 0.64648222],
       [0.59329774, 0.40670226],
       [0.38657128, 0.61342872],
       [0.38377167, 0.61622833],
       [0.59044202, 0.40955798],
       [0.7031446 , 0.2968554 ],
       [0.63253864, 0.36746136],
       [0.31157548, 0.68842452],
       [0.61590222, 0.38409778]])

# Class labels known to the model; the columns of predict_proba follow this order.
model.classes_

array([0, 1])

# Column 1 of predict_proba holds P(Y=1|x).
p = model.predict_proba(X)[:, 1]

# Decision rule: Class 1 whenever P(Y=1|x) is at least 0.5, Class 0 otherwise.
np.where(p >= 0.5, 1, 0)

array([0, 1, 0, ..., 1, 0, 0])

# .predict will automatically apply a 0.5 threshold for a logistic regression model.
classes = model.predict(X)

# Display the predicted class labels.
classes

array([0, 1, 0, ..., 1, 0, 0])

 # GOAL_DIFF at which intercept + slope * x = 0, i.e. where the predicted probability is exactly 0.5.
 -model.intercept_[0]/model.coef_[0][0]

np.float64(0.0019416883945386416)

# Store the 0.5-threshold predictions as a categorical column; it drives the
# point colors in the plot below.
games["Predicted Class"] = pd.Categorical(classes)

# Evaluate the fitted probability curve on an evenly spaced GOAL_DIFF grid.
test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]

fig = px.scatter(games, x="GOAL_DIFF", y="JitterWON", color="Predicted Class")
# Overlay the logistic regression curve.
curve = go.Scatter(
    x=test_points["GOAL_DIFF"],
    y=test_points["Predicted Prob"],
    mode="lines",
    name="Logistic Regression Model",
    line=dict(color="black", width=5, dash="dash"),
)
fig.add_trace(curve)
# Mark the GOAL_DIFF value where intercept + slope * x = 0.
fig.add_vline(
    x=-model.intercept_[0]/model.coef_[0][0],
    line_dash="dash",
    line_color="black",
    annotation_text="Decision Boundary",
    annotation_position="right",
)

# Strip plot: every game sits on the GOAL_DIFF axis (y fixed at 0). The marker
# symbol encodes the WL column and the color encodes the predicted class.
fig = px.scatter(
    games,
    x="GOAL_DIFF",
    y=np.zeros(len(games)),
    color="Predicted Class",
    symbol="WL",
    symbol_sequence=[ "circle-open", "cross"],
    opacity=0.7,
    height=300,
)
fig.update_traces(marker=dict(size=8))
# The y axis carries no information here, so hide its ticks, grid and title.
fig.update_layout(
    yaxis=dict(title="", zeroline=False, showgrid=False, showticklabels=False),
)

# GOAL_DIFF value where intercept + slope * x = 0.
decision_boundary =  -model.intercept_[0]/model.coef_[0][0]
fig.add_vline(
    x=decision_boundary,
    line_color="black",
    line_dash="dash",
    annotation_text="Decision Boundary",
    annotation_position="top right",
)

# Second model: predict WON from two features, GOAL_DIFF and AST.
Y = games["WON"]
X_two_feature = games[["GOAL_DIFF", "AST"]]

two_feature_model = lm.LogisticRegression().fit(X_two_feature, Y)

# Fitted parameters, used below to draw the decision boundary with AST
# expressed as a function of GOAL_DIFF.
theta0 = two_feature_model.intercept_  # kept as the length-1 array sklearn returns
theta1, theta2 = two_feature_model.coef_.ravel()
print(theta0, theta1, theta2)

[-2.11190144] 10.785697735287494 0.09027834044673858

# Overwrite the predicted-class column with the two-feature model's predictions.
games["Predicted Class"] = two_feature_model.predict(X_two_feature)
games.head()

# On the decision boundary theta0 + theta1 * GOAL_DIFF + theta2 * AST = 0;
# solve that equation for AST over a grid of GOAL_DIFF values.
boundary_grid = np.linspace(-0.3, 0.3, 100)
decision_boundary = pd.DataFrame({"GOAL_DIFF": boundary_grid})
decision_boundary["AST"] = (theta0 + theta1*decision_boundary["GOAL_DIFF"])/(-theta2)

# Categorical column used for the point colors.
games['Predicted Class'] = pd.Categorical(games['Predicted Class'])
fig = px.scatter(
    games,
    x="GOAL_DIFF",
    y="AST",
    color="Predicted Class",
    symbol="WL",
    symbol_sequence=[ "circle-open", "cross"],
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
    opacity=0.7,
    height=600,
)
fig.update_traces(marker_size=8)
fig.update_layout(xaxis=dict(range=[-0.3, 0.3]), yaxis=dict(range=[5, 50]))
# Draw the decision boundary on top of the points.
fig.add_scatter(
    x=decision_boundary["GOAL_DIFF"],
    y=decision_boundary["AST"],
    mode="lines",
    name="Decision Boundary",
    line=dict(color="black", dash="dash"),
)

# Evaluate P(WON = 1) on a 50 x 50 grid spanning the plotted GOAL_DIFF / AST
# window, then add it to the existing figure as a translucent contour layer.
goal_diff, ast = np.meshgrid(np.linspace(-0.3, 0.3, 50), np.linspace(5, 50, 50))
pred_grid = pd.DataFrame({"GOAL_DIFF": goal_diff.ravel(), "AST": ast.ravel()})
pred_grid['Probability'] = two_feature_model.predict_proba(pred_grid)[:, 1]
fig.add_contour(
    x=pred_grid['GOAL_DIFF'],
    y=pred_grid['AST'],
    z=pred_grid['Probability'],
    colorscale="Matter",
    opacity=0.4,
    showscale=False,
)

import seaborn as sns
iris = sns.load_dataset("iris")

# Scatter petal width against petal length for every species except virginica.
without_virginica = iris[iris["species"] != "virginica"]
fig = px.scatter(
    without_virginica,
    x="petal_length",
    y="petal_width",
    color="species",
    symbol="species",
    symbol_sequence=[ "circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)
fig

# Same scatter, this time for every species except setosa.
without_setosa = iris[iris["species"] != "setosa"]
fig = px.scatter(
    without_setosa,
    x="petal_length",
    y="petal_width",
    color="species",
    symbol="species",
    symbol_sequence=[ "circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)
fig

# Two-point toy dataset: x = -1 has y = 0 and x = 1 has y = 1.
# "label" repeats the class as a categorical for coloring and symbols.
toy_df = pd.DataFrame({"x": [-1, 1], "y": [0, 1]})
toy_df["label"] = pd.Categorical([0, 1])
fig = px.scatter(
    toy_df,
    x="x",
    y="y",
    color="label",
    symbol="label",
    symbol_sequence=[ "circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)

def toy_model(theta1, x):
    """Return the logistic (sigmoid) function evaluated at ``theta1 * x``.

    This is a logistic regression model with a single weight and no
    intercept. ``x`` may be a scalar or anything NumPy can broadcast.
    """
    z = theta1 * x
    return 1.0 / (1.0 + np.exp(-z))

def mean_cross_entropy_loss_toy(theta1):
    """Return the mean cross-entropy loss of the one-weight logistic model on ``toy_df``.

    The model is ``sigma(theta1 * x)`` with no intercept. The loss is
    averaged over the rows of the module-level ``toy_df`` (columns ``x``
    and ``y``), so it is a mean, as the function name and the plot labels
    say.

    The sigmoid is not evaluated explicitly. The identities
    ``-log(sigma(z)) = log(1 + exp(-z))`` and
    ``-log(1 - sigma(z)) = log(1 + exp(z))`` are computed with
    ``np.logaddexp``, which stays accurate for large ``|z|``. This also
    keeps the function independent of the name ``toy_model``, which is
    rebound to a scikit-learn estimator later in this file.
    """
    z = theta1 * toy_df['x']
    y = toy_df['y']
    per_point_loss = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
    return np.mean(per_point_loss)

# Evaluate the toy loss once over a range of weights and plot it twice:
# first on a linear y axis, then on a log y axis.
thetas = np.linspace(-30, 30, 100)
toy_losses = [mean_cross_entropy_loss_toy(theta) for theta in thetas]

fig = px.line(x=thetas, y=toy_losses, render_mode="svg")
fig.update_layout(
    title="Mean Cross Entropy Loss for Toy Example",
    xaxis_title="Theta",
    yaxis_title="Mean CE Loss",
)

fig = px.line(x=thetas, y=toy_losses, log_y=True, render_mode="svg")
fig.update_layout(
    title="Log Scale Mean Cross Entropy Loss for Toy Example",
    xaxis_title="Theta",
    yaxis_title="Log Scale Mean CE Loss",
)

def regularized_loss_toy(theta1, reg):
    """Return the toy cross-entropy loss plus the penalty ``reg * theta1**2``."""
    penalty = reg * theta1**2
    return mean_cross_entropy_loss_toy(theta1) + penalty

# Plot the regularized toy loss for a small regularization weight.
reg = 0.01
regularized_losses = [regularized_loss_toy(theta, reg) for theta in thetas]
fig = px.line(x=thetas, y=regularized_losses, render_mode="svg")
fig.update_layout(
    title=f"Mean Cross Entropy Loss for Toy Example (Regularization = {reg})",
    xaxis_title="Theta",
    yaxis_title="Mean CE Loss",
)

# Fit scikit-learn's logistic regression to the two toy points with C=10
# (C is the inverse of the regularization strength).
# NOTE: this rebinds the name `toy_model`, which earlier in the file refers to
# the sigmoid helper function.
toy_model = lm.LogisticRegression(C=10)
toy_model.fit([[-1], [1]], [0,1])

# Predicted P(y = 1) on a dense grid of x values, for drawing the fitted curve.
xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

# The title reports the parameters of the toy model fitted above, not those of
# the basketball `model`.
fig = px.scatter(toy_df, x="x", y="y", 
                 color="label", symbol="label", 
                 symbol_sequence=[ "circle", "cross"],
                 title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
                 render_mode="svg")
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="Logistic Regression Model", 
                line_color="black", line_width=5, line_dash="dash")

# Refit the toy model with much weaker regularization (C=1000; C is the inverse
# of the regularization strength).
toy_model = lm.LogisticRegression(C=1000)
toy_model.fit([[-1], [1]], [0,1])

# Predicted P(y = 1) on a dense grid of x values, for drawing the fitted curve.
xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

# The title reports the parameters of the toy model fitted above, not those of
# the basketball `model`.
fig = px.scatter(toy_df, x="x", y="y", 
                 color="label", symbol="label", 
                 symbol_sequence=[ "circle", "cross"],
                 title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
                 render_mode="svg")
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="Logistic Regression Model", 
                line_color="black", line_width=5, line_dash="dash")

def accuracy(X, Y):
    """Return the fraction of rows in ``X`` for which the global ``model`` predicts ``Y``."""
    is_correct = model.predict(X) == Y
    return np.mean(is_correct)

# Accuracy of `model` on the data it was fitted to.
accuracy(X, Y)

np.float64(0.7943089430894309)

# The estimator's built-in score; the recorded output matches accuracy(X, Y).
model.score(X, Y)

0.7943089430894309

from sklearn.metrics import confusion_matrix

# confusion_matrix expects the true labels first and the predictions second.
# Passing both by keyword makes it impossible to mix them up.
cm = confusion_matrix(y_true=Y, y_pred=model.predict(X))
cm

array([[511, 114],
       [139, 466]])

# Heatmap of the confusion matrix: rows are the actual class, columns the
# predicted class, with the counts written into the cells.
class_labels = ["0", "1"]
fig = px.imshow(
    cm,
    x=class_labels,
    y=list(class_labels),
    labels={"x": "Predicted", "y": "Actual"},
    color_continuous_scale="Blues",
    text_auto=True,
    height=400,
    width=400,
)
fig.update_xaxes(side="top")

# Count the four confusion-matrix outcomes directly from boolean masks.
Y_hat = model.predict(X)
predicted_pos, predicted_neg = (Y_hat == 1), (Y_hat == 0)
actual_pos, actual_neg = (Y == 1), (Y == 0)

tp = np.sum(predicted_pos & actual_pos)
tn = np.sum(predicted_neg & actual_neg)
fp = np.sum(predicted_pos & actual_neg)
fn = np.sum(predicted_neg & actual_pos)

for label, count in (("True Positives: ", tp),
                     ("True Negatives: ", tn),
                     ("False Positives:", fp),
                     ("False Negatives:", fn)):
    print(label, count)

True Positives:  466
True Negatives:  511
False Positives: 114
False Negatives: 139

# Precision: the share of predicted positives that are actually positive.
precision = tp / (tp + fp)
precision

np.float64(0.803448275862069)

# Recall: the share of actual positives that were predicted positive.
recall = tp / (tp + fn)
recall

np.float64(0.7702479338842976)

# False positive rate: the share of actual negatives that were predicted positive.
fpr = fp/(fp + tn)
fpr

np.float64(0.1824)

# True positive rate: same formula as recall above.
tpr = tp/(tp + fn)
tpr

np.float64(0.7702479338842976)

# Refit the one-feature model (WON from GOAL_DIFF) used by the threshold
# experiments below.
Y = games["WON"]
X = games[["GOAL_DIFF"]]
# fit() returns the fitted estimator, so construction and fitting can be chained.
model = lm.LogisticRegression().fit(X, Y)
for label, value in (("Slope:", model.coef_[0][0]),
                     ("Intercept:", model.intercept_[0])):
    print(label, value)

Slope: 11.822010640307564
Intercept: -0.022954660860397533

def plot_predictions(threshold = 0.5):
    """Plot the games colored by the class predicted at ``threshold``.

    A game is predicted positive when the global ``model`` gives
    ``P(Y=1|x) >= threshold``. The figure also shows the fitted probability
    curve and, when one exists, a vertical line at the decision boundary.

    Side effect: overwrites ``games["Predicted Class"]`` with the
    thresholded predictions (as a categorical).

    Parameters:
        threshold: Classification threshold on ``P(Y=1|x)``. For values
            outside the open interval (0, 1) there is no finite decision
            boundary, so the boundary line is omitted.

    Returns:
        The Plotly figure.
    """
    predicted = model.predict_proba(X)[:, 1] >= threshold
    # Stored as a categorical; needed for plotting.
    games["Predicted Class"] = pd.Categorical(predicted)
    fig = px.scatter(games, 
            x="GOAL_DIFF", y="JitterWON", color="Predicted Class", 
            title=f"Logistic Regression Predictions (Threshold = {threshold})")
    # Add the logistic regression model predictions
    # Make the data points for the LR model curve
    test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
    test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]
    fig.add_trace(go.Scatter(x=test_points["GOAL_DIFF"], y=test_points["Predicted Prob"], 
                            mode="lines", name="Logistic Regression Model", 
                            line_color="black", line_width=5, line_dash="dash"))
    if 0 < threshold < 1:
        # sigma(intercept + slope * x) = threshold
        #   <=> intercept + slope * x = -log(1/threshold - 1).
        # Only computed for 0 < threshold < 1: threshold = 0 would divide by
        # zero and threshold = 1 would take log(0).
        decision_boundary = (-np.log(1/threshold - 1) - model.intercept_[0])/model.coef_[0][0]
        fig.add_vline(x = decision_boundary, line_dash="dash", line_color="black",
                      annotation_text="Decision Boundary", annotation_position="right")
    return fig

# Default rule: predict the positive class when P(Y=1|x) >= 0.5.
plot_predictions(0.5)

# Lower threshold: more games are predicted positive.
plot_predictions(0.25)

# Higher threshold: fewer games are predicted positive.
plot_predictions(0.75)

# Define performance metrics dependent on the threshold value.
def predict_threshold(model, X, T): 
    """Return 0/1 predictions: 1 where ``model`` gives ``P(Y=1|x) >= T``, else 0.

    ``model`` must provide ``predict_proba``; column 1 of its output is
    taken as the probability of the positive class.
    """
    class_probabilities = model.predict_proba(X)
    is_positive = class_probabilities[:, 1] >= T
    return is_positive.astype(int)

def accuracy_threshold(X, Y, T):
    """Return the accuracy of the global ``model`` on ``(X, Y)`` at threshold ``T``."""
    is_correct = predict_threshold(model, X, T) == Y
    return np.mean(is_correct)

def precision_threshold(X, Y, T):
    """Return the precision of the global ``model`` on ``(X, Y)`` at threshold ``T``.

    Yields NaN when nothing is predicted positive, because precision is
    undefined in that case.
    """
    predicted_positive = predict_threshold(model, X, T) == 1
    n_predicted_positive = np.sum(predicted_positive)
    true_positives = np.sum(predicted_positive & (Y == 1))
    if n_predicted_positive == 0:
        # No positive predictions: divide by NaN so the result is NaN.
        return true_positives / np.nan
    return true_positives / n_predicted_positive
    
def recall_threshold(X, Y, T):
    """Return the recall of the global ``model`` on ``(X, Y)`` at threshold ``T``."""
    actual_positive = Y == 1
    predicted_positive = predict_threshold(model, X, T) == 1
    true_positives = np.sum(predicted_positive & actual_positive)
    return true_positives / np.sum(actual_positive)

def tpr_threshold(X, Y, T): # Same as recall
    """Return the true positive rate of the global ``model`` at threshold ``T``.

    The true positive rate has the same formula as recall, so this
    delegates to ``recall_threshold``.
    """
    return recall_threshold(X, Y, T)

def fpr_threshold(X, Y, T):
    """Return the false positive rate of the global ``model`` on ``(X, Y)`` at threshold ``T``."""
    actual_negative = Y == 0
    predicted_positive = predict_threshold(model, X, T) == 1
    false_positives = np.sum(predicted_positive & actual_negative)
    return false_positives / np.sum(actual_negative)

# Evaluate accuracy, precision and recall at 1000 evenly spaced thresholds in [0, 1].
metrics = pd.DataFrame({"Threshold": np.linspace(0, 1, 1000)})
for column, metric_fn in (("Accuracy", accuracy_threshold),
                          ("Precision", precision_threshold),
                          ("Recall", recall_threshold)):
    metrics[column] = [metric_fn(X, Y, t) for t in metrics["Threshold"]]
metrics.head()

# Accuracy as a function of the classification threshold.
px.line(metrics, 
        x="Threshold", y="Accuracy",
        title="Accuracy vs. Threshold",
        render_mode="svg")

# The threshold that maximizes accuracy.
metrics.sort_values("Accuracy", ascending=False).head()

# All three metrics on one chart, to show how they trade off as the threshold moves.
px.line(metrics, 
        x="Threshold", y=["Accuracy", "Precision", "Recall"],
        title="Performance Metrics vs. Threshold",
        render_mode="svg")

# Precision-recall curve: each point on the line corresponds to one threshold.
px.line(metrics, x="Recall", y="Precision",
        title="Precision vs. Recall",
        width=600, height=600,
        render_mode="svg")

# F1 score at each threshold: 2 * precision * recall / (precision + recall).
precision_col = metrics["Precision"]
recall_col = metrics["Recall"]
f1_numerator = 2 * precision_col * recall_col
metrics["F1"] = f1_numerator / (precision_col + recall_col)

fig = px.line(metrics, x="Threshold", y="F1",
              title="Finding F1 Score Maximum",
              render_mode="svg")
# Mark the threshold with the highest F1 score.
ind = metrics['F1'].idxmax()
best_threshold = metrics.loc[ind, 'Threshold']
fig.add_scatter(
    x=[best_threshold],
    y=[metrics.loc[ind, 'F1']],
    mode='markers',
    marker=dict(size=10, color='red'),
    name=f"F1 Max {best_threshold:.5f}",
)

# Precision-recall curve with the F1-maximizing threshold (row `ind`) highlighted.
fig = px.line(
    metrics,
    x="Recall",
    y="Precision",
    title="Precision vs. Recall",
    width=600,
    height=600,
    render_mode="svg",
)
f1_best_row = metrics.loc[ind]
fig.add_scatter(
    x=[f1_best_row['Recall']],
    y=[f1_best_row['Precision']],
    mode='markers',
    marker=dict(size=10, color='red'),
    name=f"F1 Max {f1_best_row['Threshold']:.5f}",
)
# Place the legend inside the plotting area.
fig.update_layout(legend=dict(x=.5, y=.1))

# Add the true and false positive rates at every threshold.
for rate_name, rate_fn in (("TPR", tpr_threshold), ("FPR", fpr_threshold)):
    metrics[rate_name] = [rate_fn(X, Y, t) for t in metrics["Threshold"]]

# Both rates, together with accuracy, as functions of the threshold.
px.line(
    metrics,
    x="Threshold",
    y=["TPR", "FPR", "Accuracy"],
    render_mode="svg",
)

# ROC curve: TPR against FPR, one point per threshold.
px.line(
    metrics,
    x="FPR",
    y="TPR",
    title="ROC Curve",
    width=600,
    height=600,
    render_mode="svg",
)

# ROC curve again, with a reference path through (0, 0), (0, 1) and (1, 1)
# labelled "Perfect Classifier".
fig = px.line(
    metrics,
    x="FPR",
    y="TPR",
    title="ROC Curve",
    width=600,
    height=600,
    render_mode="svg",
)
fig.add_scatter(
    x=[0,0,1],
    y=[0,1,1],
    mode='lines',
    name="Perfect Classifier",
    line=dict(dash='dash', color='black'),
)
# Move the legend inside the plot.
fig.update_layout(legend=dict(x=.5, y=.1))

	TEAM_NAME	TEAM_NAME_OPP	MATCHUP	WON	WL	AST	GOAL_DIFF
GAME_ID
21700001	Boston Celtics	Cleveland Cavaliers	BOS @ CLE	0	L	24	-0.049
21700002	Golden State Warriors	Houston Rockets	GSW vs. HOU	0	L	34	0.053
21700003	Charlotte Hornets	Detroit Pistons	CHA @ DET	0	L	16	-0.030
21700004	Indiana Pacers	Brooklyn Nets	IND vs. BKN	1	W	29	0.041
21700005	Orlando Magic	Miami Heat	ORL vs. MIA	1	W	22	0.042
...	...	...	...	...	...	...	...
21701226	New Orleans Pelicans	San Antonio Spurs	NOP vs. SAS	1	W	30	0.189
21701227	Oklahoma City Thunder	Memphis Grizzlies	OKC vs. MEM	1	W	32	0.069
21701228	LA Clippers	Los Angeles Lakers	LAC vs. LAL	0	L	27	0.017
21701229	Utah Jazz	Portland Trail Blazers	UTA @ POR	0	L	18	-0.090
21701230	Houston Rockets	Sacramento Kings	HOU @ SAC	0	L	11	-0.097

	TEAM_NAME	TEAM_NAME_OPP	MATCHUP	WON	WL	AST	GOAL_DIFF	JitterWON	Predicted Class
GAME_ID
21700001	Boston Celtics	Cleveland Cavaliers	BOS @ CLE	0	L	24	-0.049	-0.025092	0
21700002	Golden State Warriors	Houston Rockets	GSW vs. HOU	0	L	34	0.053	0.090143	1
21700003	Charlotte Hornets	Detroit Pistons	CHA @ DET	0	L	16	-0.030	0.046399	0
21700004	Indiana Pacers	Brooklyn Nets	IND vs. BKN	1	W	29	0.041	1.019732	1
21700005	Orlando Magic	Miami Heat	ORL vs. MIA	1	W	22	0.042	0.931204	1

	Threshold	Accuracy	Precision	Recall
0	0.000000	0.49187	0.49187	1.0
1	0.001001	0.49187	0.49187	1.0
2	0.002002	0.49187	0.49187	1.0
3	0.003003	0.49187	0.49187	1.0
4	0.004004	0.49187	0.49187	1.0

	Threshold	Accuracy	Precision	Recall
488	0.488488	0.798374	0.793103	0.798347
489	0.489489	0.798374	0.793103	0.798347
490	0.490490	0.798374	0.793103	0.798347
485	0.485485	0.797561	0.788026	0.804959
486	0.486486	0.797561	0.788026	0.804959

Lecture 23 – Data 100, Fall 2024

Decision Boundaries

Two Features

Linear Separability

Regularization in scikit-learn

Performance Metrics

Confusion Matrix

Precision and Recall

Precision and Recall

Precision and Recall

True and False Positive Rates

Adjusting the Classification Threshold

Thresholds and Performance Metrics

Precision-Recall Curves

ROC Curves