import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
px.defaults.width = 800
from scipy.optimize import minimize
import sklearn.linear_model as lm
from sklearn.metrics import r2_score
In this lecture, we will work with the basketball dataset, which contains information about basketball games played in the NBA. In the cell below, we perform data cleaning to transform the data into a useful form, which we store as the DataFrame games.
Our goal in this portion of the lecture is to predict whether or not a team wins their game ("WON") given their "GOAL_DIFF". The variable "GOAL_DIFF" represents the difference in successful field goal rates between the two teams competing in a game. A positive value of "GOAL_DIFF" means that a team made a greater proportion of its field goal attempts than its opponent did; a negative value means the opponent was more successful. For example, a "GOAL_DIFF" of 0.053 means the team's field goal percentage was 5.3 percentage points higher than its opponent's.
basketball = pd.read_csv("data/nba.csv")
basketball.head()
| | SEASON_ID | TEAM_ID | TEAM_ABBREVIATION | TEAM_NAME | GAME_ID | GAME_DATE | MATCHUP | WL | MIN | FGM | ... | DREB | REB | AST | STL | BLK | TOV | PF | PTS | PLUS_MINUS | VIDEO_AVAILABLE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 22017 | 1610612744 | GSW | Golden State Warriors | 21700002 | 2017-10-17 | GSW vs. HOU | L | 240 | 43 | ... | 35 | 41 | 34 | 5 | 9 | 17 | 25 | 121 | -1 | 1 |
| 1 | 22017 | 1610612745 | HOU | Houston Rockets | 21700002 | 2017-10-17 | HOU @ GSW | W | 240 | 47 | ... | 33 | 43 | 28 | 9 | 5 | 13 | 16 | 122 | 1 | 1 |
| 2 | 22017 | 1610612738 | BOS | Boston Celtics | 21700001 | 2017-10-17 | BOS @ CLE | L | 240 | 36 | ... | 37 | 46 | 24 | 11 | 4 | 12 | 24 | 99 | -3 | 1 |
| 3 | 22017 | 1610612739 | CLE | Cleveland Cavaliers | 21700001 | 2017-10-17 | CLE vs. BOS | W | 240 | 38 | ... | 41 | 50 | 19 | 3 | 4 | 17 | 25 | 102 | 3 | 1 |
| 4 | 22017 | 1610612750 | MIN | Minnesota Timberwolves | 21700011 | 2017-10-18 | MIN @ SAS | L | 240 | 37 | ... | 31 | 42 | 23 | 7 | 4 | 13 | 16 | 99 | -8 | 1 |

5 rows × 29 columns
basketball = pd.read_csv("data/nba.csv")
# Each game appears twice in the raw data, once per team. Grouping by GAME_ID
# and taking the first/last row gives us the two teams in each game.
first_team = basketball.groupby("GAME_ID").first()
second_team = basketball.groupby("GAME_ID").last()
games = first_team.merge(second_team, left_index=True, right_index=True, suffixes=["", "_OPP"])
# GOAL_DIFF is the difference in field goal success rates between the two teams
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
# Encode the win/loss label as 1/0
games['WON'] = (games['WL'] == "W").astype(int)
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'GOAL_DIFF']]
games
GAME_ID | TEAM_NAME | TEAM_NAME_OPP | MATCHUP | WON | WL | GOAL_DIFF
---|---|---|---|---|---|---
21700001 | Boston Celtics | Cleveland Cavaliers | BOS @ CLE | 0 | L | -0.049 |
21700002 | Golden State Warriors | Houston Rockets | GSW vs. HOU | 0 | L | 0.053 |
21700003 | Charlotte Hornets | Detroit Pistons | CHA @ DET | 0 | L | -0.030 |
21700004 | Indiana Pacers | Brooklyn Nets | IND vs. BKN | 1 | W | 0.041 |
21700005 | Orlando Magic | Miami Heat | ORL vs. MIA | 1 | W | 0.042 |
... | ... | ... | ... | ... | ... | ... |
21701226 | New Orleans Pelicans | San Antonio Spurs | NOP vs. SAS | 1 | W | 0.189 |
21701227 | Oklahoma City Thunder | Memphis Grizzlies | OKC vs. MEM | 1 | W | 0.069 |
21701228 | LA Clippers | Los Angeles Lakers | LAC vs. LAL | 0 | L | 0.017 |
21701229 | Utah Jazz | Portland Trail Blazers | UTA @ POR | 0 | L | -0.090 |
21701230 | Houston Rockets | Sacramento Kings | HOU @ SAC | 0 | L | -0.097 |
1230 rows × 6 columns
Logistic Regression
If we visualize our data, we see something very different from the scatter plots we worked with in linear regression. Because a team can only win or lose a game, the only possible values of "WON" are 1 (if the team won the game) or 0 (if the team lost).
px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
Because the only possible values of "WON" are 0 or 1, the visualization above shows significant overplotting. We can use a strip plot to jitter the $y$ data by adding a small amount of random noise. This makes the plot easier to interpret and gives us distinct colors for the two classes.
px.strip(games, x="GOAL_DIFF", y="WL", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
category_orders={"WL": ["W", "L"]})
# sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", order=[1, 0], hue="WON");
However, we can't plot lines on top of this plot because the $y$ values are not continuous. Instead, we will jitter our labels directly to help with future visualizations.
np.random.seed(42)
games["JitterWON"] = games["WON"] + np.random.uniform(-0.1, 0.1, len(games))
px.scatter(games,
x="GOAL_DIFF", y="JitterWON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
Least squares linear regression, which is intended to output continuous numeric predictions, works poorly here (perhaps unsurprisingly).
# Fitting a linear regression model to the data
X = games[["GOAL_DIFF"]]
Y = games["WON"]
least_squares_model = lm.LinearRegression()
least_squares_model.fit(X, Y)
# Make some predictions for a range of GOAL_DIFF values
pred = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3)})
pred["LS_Pred"] = least_squares_model.predict(pred)
# Visualize the model
fig = px.scatter(games,
x="GOAL_DIFF", y="JitterWON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
fig.add_trace(go.Scatter(x=pred["GOAL_DIFF"], y=pred["LS_Pred"],
mode="lines", name="Least Squares Fit"))
Least squares linear regression clearly won't work here; we need a new approach to modeling.
Back in Data 8, you built up your understanding of linear regression by first considering the graph of averages. We construct a graph of averages by binning all $x$ data into bins of similar values, then computing the average value of $y$ for each bin. This gives us a rough indication of the relationship between $x$ and $y$.
Let's try this on our dataset.
# Break our GOAL_DIFF values into 20 bins
bins, cuts = pd.cut(games["GOAL_DIFF"], 20, retbins=True)
# bins contains the interval (lower, upper] assigned to each row
# cuts contains the 21 unique bin edges
# Let's look at a few of the bins
games.join(bins, rsuffix="_bins").head()
GAME_ID | TEAM_NAME | TEAM_NAME_OPP | MATCHUP | WON | WL | GOAL_DIFF | JitterWON | GOAL_DIFF_bins
---|---|---|---|---|---|---|---|---
21700001 | Boston Celtics | Cleveland Cavaliers | BOS @ CLE | 0 | L | -0.049 | -0.025092 | (-0.0648, -0.0382] |
21700002 | Golden State Warriors | Houston Rockets | GSW vs. HOU | 0 | L | 0.053 | 0.090143 | (0.0416, 0.0682] |
21700003 | Charlotte Hornets | Detroit Pistons | CHA @ DET | 0 | L | -0.030 | 0.046399 | (-0.0382, -0.0116] |
21700004 | Indiana Pacers | Brooklyn Nets | IND vs. BKN | 1 | W | 0.041 | 1.019732 | (0.015, 0.0416] |
21700005 | Orlando Magic | Miami Heat | ORL vs. MIA | 1 | W | 0.042 | 0.931204 | (0.0416, 0.0682] |
We can visualize the bins:
fig = px.scatter(games,
x="GOAL_DIFF", y="JitterWON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
for cut in cuts:
    fig.add_vline(x=cut, line_dash="dash", line_color="black")
fig.show()
For each bin, we can compute the win rate within that bin. We do this by grouping the data according to the center of the bin that each row was assigned to.
# Compute the bin center for every game (dot in the above plot)
games['bin_center'] = bins.apply(lambda x: (x.left + x.right)/2).astype(float)
# Now all the games that are in the same bin will have the same bin_center.
# We can group by bin center and compute the average of the label to
# get the win rate for each bin.
win_rates_by_bin = (
games[["bin_center", "WON"]]
.groupby("bin_center")
.mean()
.rename(columns={"WON": "Win Rate"})
)
win_rates_by_bin
bin_center | Win Rate
---|---
-0.2380 | 0.000000 |
-0.2110 | 0.000000 |
-0.1845 | 0.000000 |
-0.1580 | 0.000000 |
-0.1315 | 0.000000 |
-0.1047 | 0.033898 |
-0.0781 | 0.083333 |
-0.0515 | 0.148438 |
-0.0249 | 0.363636 |
0.0017 | 0.505747 |
0.0283 | 0.705128 |
0.0549 | 0.792793 |
0.0815 | 0.907407 |
0.1079 | 0.984615 |
0.1345 | 1.000000 |
0.1615 | 1.000000 |
0.1880 | 1.000000 |
0.2410 | 1.000000 |
0.2675 | 1.000000 |
# Visualize the graph of averages (win rate per bin)
fig = px.scatter(games,
x="GOAL_DIFF", y="JitterWON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['Win Rate'],
mode="markers+lines", name="Win Rate by Bin"))
for cut in cuts:
    fig.add_vline(x=cut, line_dash="dash", line_color="black")
fig.show()
Our graph of averages has revealed an S-shaped curve. This doesn't look like anything we've encountered before!
The relationship between $x$ ("GOAL_DIFF") and $y$ ("WON") is clearly non-linear. To apply the linear modeling techniques we've explored previously, we will need to linearize the relationship. Recall our previous approach to transforming variables:
- Apply transformations to linearize the relationship between our variables of interest.
- Fit a linear regression model to the transformed data.
- "Un-transform" our data to recover the underlying relationship between the original variables.
First, we need to decide what transformations to perform. The S-shaped curve above doesn't resemble anything in our bulge diagram, so we must find a new transformation to linearize the relationship.
To determine what this transformation should be, we make a few observations:
- All predictions on our curve are between 0 and 1.
- To compute the average for each bin, we calculated: $\frac{\#\:Y=1\:\text{in bin}}{\#\:\text{datapoints in bin}} = P(Y=1 | \text{bin})$.
Together, these observations indicate that our graph of averages is actually modeling the probability of a data point having $Y = 1$! Rather than predicting any continuous numeric output, we now want to predict the probability of a datapoint belonging to Class 1 (i.e., the team winning the game).
To determine if it is "likely" that a data point belongs to Class 1, we can consider the odds:
$$\text{odds} = \frac{P(Y=1|x)}{P(Y=0|x)} = \frac{P(Y=1|x)}{1-P(Y=1|x)} = \frac{p}{1-p}$$
Here, we use $p$ to denote the probability of a datapoint belonging to Class 1. A large odds value means that it is more likely than not that the data point belongs to Class 1.
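For example, a datapoint with $p = 0.75$ has odds $\frac{0.75}{1-0.75} = 3$: Class 1 is three times as likely as Class 0. A datapoint with $p = 0.5$ has odds of exactly 1, meaning the two classes are equally likely.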
win_rates_by_bin.index.name = "GOAL_DIFF"  # rename bin_center so plots are labeled by the feature
win_rates_by_bin["odds"] = win_rates_by_bin["Win Rate"] / (1 - win_rates_by_bin["Win Rate"])
win_rates_by_bin
GOAL_DIFF | Win Rate | odds
---|---|---
-0.2380 | 0.000000 | 0.000000 |
-0.2110 | 0.000000 | 0.000000 |
-0.1845 | 0.000000 | 0.000000 |
-0.1580 | 0.000000 | 0.000000 |
-0.1315 | 0.000000 | 0.000000 |
-0.1047 | 0.033898 | 0.035088 |
-0.0781 | 0.083333 | 0.090909 |
-0.0515 | 0.148438 | 0.174312 |
-0.0249 | 0.363636 | 0.571429 |
0.0017 | 0.505747 | 1.023256 |
0.0283 | 0.705128 | 2.391304 |
0.0549 | 0.792793 | 3.826087 |
0.0815 | 0.907407 | 9.800000 |
0.1079 | 0.984615 | 64.000000 |
0.1345 | 1.000000 | inf |
0.1615 | 1.000000 | inf |
0.1880 | 1.000000 | inf |
0.2410 | 1.000000 | inf |
0.2675 | 1.000000 | inf |
px.line(win_rates_by_bin, y="odds")
Interesting: the relationship between the odds and our "GOAL_DIFF" feature appears exponential. Let's "undo" this exponential trend by taking the logarithm. As a reminder, you should always assume that any logarithm in Data 100 is base $e$ (the natural log) unless told otherwise.
win_rates_by_bin["log(odds)"] = np.log(win_rates_by_bin["odds"])
px.line(win_rates_by_bin, y="log(odds)")
/opt/homebrew/Caskroom/mambaforge/base/envs/py_3_11/lib/python3.11/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log

(The warning comes from the bins with a 0% win rate: their odds are 0, and $\log(0) = -\infty$.)
Putting all of our plots together:
from plotly.subplots import make_subplots
fig = make_subplots(rows=1, cols=3, subplot_titles=("Win Rate", "Odds", "Log(Odds)"))
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['Win Rate'], mode="markers+lines"), row=1, col=1)
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['odds'], mode="markers+lines"), row=1, col=2)
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['log(odds)'], mode="markers+lines"), row=1, col=3)
fig.update_layout(showlegend=False)
We've found a linear relationship! It turns out that the log-odds share a linear relationship with our "GOAL_DIFF" feature. This means that we can express the log-odds as a linear combination of the features:

$$\log{\left(\frac{p}{1-p}\right)}=x^T \theta$$
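As a quick check of this claim, we can fit a simple linear regression to the log-odds values (a sketch, not part of the lecture's pipeline; the bins with 0% or 100% win rates are dropped, since their log-odds are infinite):

# Fit a line to log-odds vs. GOAL_DIFF on the bins with finite log-odds
finite = win_rates_by_bin.replace([np.inf, -np.inf], np.nan).dropna()
logodds_fit = lm.LinearRegression()
logodds_fit.fit(finite.index.to_frame(), finite["log(odds)"])
# A high R^2 here supports the claimed linear relationship
print(r2_score(finite["log(odds)"], logodds_fit.predict(finite.index.to_frame())))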
Now, we can "undo" our transformations to recover the underlying relationship between the "GOAL_DIFF"
and the probability of the datapoint belonging to Class 1. In the work below, $x^T$ represents the row vector containing all features for the $i$th datapoint. We can express our linear regression fit as a linear combination of the model parameters, $x^T \theta$.
$$\begin{align} x^T \theta &= \log{\frac{p}{1-p}}\\ e^{x^T \theta} &= \frac{p}{1-p}\\ (1-p)e^{x^T \theta} &= p\\ e^{x^T \theta}- p e^{x^T \theta} &= p\\ e^{x^T \theta} &= p(1 + e^{x^T \theta})\\ p &= \frac{e^{x^T \theta}}{1+e^{x^T \theta}}\\ p &= \frac{1}{1+e^{-x^T \theta}}\\ \end{align}$$
Hence, the model's output, the predicted probability that the $i$th datapoint belongs to Class 1, is given by:
$$\hat{y} = P(Y=1|x_i) = \frac{1}{1+e^{-x_i^T \theta}}$$
The expression $\frac{1}{1+e^{-z}}$ is known as the sigmoid or logistic function and is typically represented as $\sigma(z)$. We can therefore equivalently express our model as $\hat{y} = P(Y=1|x) = \sigma(x^T \theta)$. This model is known as the logistic regression model.
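Because the sigmoid is the inverse of the log-odds (logit) transformation, applying it to the log-odds we computed earlier should recover the binned win rates. A quick sanity check (a sketch, not part of the lecture's pipeline):

# sigmoid(log-odds) should reproduce the original win rates; the bins with
# log-odds of -inf or +inf map back to win rates of exactly 0 and 1.
recovered = 1 / (1 + np.exp(-win_rates_by_bin["log(odds)"]))
print(np.allclose(recovered, win_rates_by_bin["Win Rate"]))  # True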
When we fit a logistic regression model to our data, we produce the S-shaped curve generated by the graph of averages!
# C is the inverse of the regularization strength; a large value like C=20
# applies only very weak regularization
logistic_model = lm.LogisticRegression(C=20)
logistic_model.fit(X, Y)
pred["Logistic_Pred"] = logistic_model.predict_proba(pred[["GOAL_DIFF"]])[:, 1]
# Visualize the model
fig = px.scatter(games,
x="GOAL_DIFF", y="JitterWON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
# Add the binned predictions
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['Win Rate'],
mode="markers+lines", name="Win Rate by Bin"))
# Add the logistic regression model predictions
fig.add_trace(go.Scatter(x=pred["GOAL_DIFF"], y=pred["Logistic_Pred"],
mode="lines", name="Logistic Regression Model",
line_color="black", line_width=5, line_dash="dash"))
# for cut in cuts:
# fig.add_vline(x=cut, line_dash="dash", line_color="black")
fig.show()
Cross-Entropy Loss
To select the parameters of our logistic regression model, we need to choose a loss function. We might be inclined to use our familiar mean squared error; it turns out this is a bad idea.
In the cell below, we artificially generate a "toy" dataset to play with as we explore the loss of a logistic regression model. We'll try to use the "x" feature to predict the "y" target.
toy_df = pd.DataFrame({
"x": [-4, -2, -0.5, 1, 3, 5],
"y": [0, 0, 1, 0, 1, 1]
})
toy_df["str_y"] = toy_df["y"].astype(str)
toy_df.sort_values("x")
| | x | y | str_y |
|---|---|---|---|
| 0 | -4.0 | 0 | 0 |
| 1 | -2.0 | 0 | 0 |
| 2 | -0.5 | 1 | 1 |
| 3 | 1.0 | 0 | 0 |
| 4 | 3.0 | 1 | 1 |
| 5 | 5.0 | 1 | 1 |
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
fig.update_traces(marker_size=20)
Let's plot the loss surface for this toy data using MSE with the model $\hat{y} = \sigma(\theta x)$. We don't include an intercept term, so $\theta$ and $x$ are both scalars.
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def mse_on_toy_data(theta):
    # MSE of the model sigmoid(theta * x) on the toy data
    p_hat = sigmoid(toy_df['x'] * theta)
    return np.mean((toy_df['y'] - p_hat)**2)
theta_loss = pd.DataFrame({"theta": np.linspace(-10, 10, 100)})
theta_loss["MSE"] = theta_loss["theta"].apply(mse_on_toy_data)
px.line(theta_loss, x="theta", y="MSE", width=800,
title="MSE on Toy Classification Data")
This loss surface is not convex! It has both local and global minima, so depending on where we start our optimization search, we'll end up with different values for the optimizing $\theta$. Let's explore with scipy.optimize.minimize.
# Set the initial guess as theta = 0
best_theta = minimize(mse_on_toy_data, x0 = 0)["x"][0]
best_theta
np.float64(0.5446601825581691)
This "optimized" value of $\theta$ produces the following model when we apply it to our model $\hat{y} = \sigma(\theta x )$.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_theta),
mode="lines", line_color="black",
name=f"LR Model: theta = {best_theta:.2f}"))
fig.update_traces(marker_size=20)
Let's try a different initial guess for the minimizing parameter value.
# Set the initial guess as theta = -5
best_theta_2 = minimize(mse_on_toy_data, x0 = -5)["x"][0]
best_theta_2
np.float64(-10.343653061026611)
Uh oh, looks like the optimizer got stuck at a local minimum of the loss surface. If we use this guess for the optimal $\theta$ in our logistic regression model, we see strange behavior.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_theta_2),
mode="lines", line_color="black",
name=f"LR Model: theta = {best_theta_2:.2f}"))
fig.update_traces(marker_size=20)
To see what went wrong, let's plot these two "optimized" guesses for $\hat{\theta}$ on the original loss surface. They correspond to the global and a local minimum of the loss surface.
fig = px.line(theta_loss, x="theta", y="MSE", width=800,
title="MSE on Toy Classification Data")
fig.add_scatter(x=[best_theta], y=[mse_on_toy_data(best_theta)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_1: {best_theta:.2f}")
fig.add_scatter(x=[best_theta_2], y=[mse_on_toy_data(best_theta_2)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_2: {best_theta_2:.2f}")
We've now seen that the non-convexity of MSE for logistic regression makes $\hat{\theta}$ difficult to optimize.

Beyond this issue, squared loss isn't well-suited to a probability task. Since $\hat{p}_i$ is between 0 and 1 and $y_i$ is either 0 or 1, the squared loss for a single point, $(y_i - \hat{p}_i)^2$, is bounded between 0 and 1.

What this means in practice: even if our prediction is terrible, the squared loss is never very large. Consider the "worst-case scenario," where the true class $y_i$ of datapoint $i$ is 0 and the model predicts a probability $\hat{p}_i=1$ that this datapoint belongs to Class 1. Even though our model has made the worst possible prediction, the squared loss is only $(0-1)^2=1$!
p_hat_loss = pd.DataFrame({"p_hat": np.arange(0.001, 0.999, 0.01)})
p_hat_loss["L2 Loss"] = (1 - p_hat_loss["p_hat"])**2
px.line(p_hat_loss, x="p_hat", y="L2 Loss", width=800,
title="Squared Loss for One Individual when y=1")
Motivating Cross-Entropy Loss
Let's look at a new loss, called the negative log loss, for when our true observation is 1. We define the loss on a single datapoint as $l = -\log{\hat{p}}$.
p_hat_loss["Neg Log Loss"] = -np.log(p_hat_loss["p_hat"])
px.line(p_hat_loss.melt(id_vars="p_hat", value_name="Loss"),
x="p_hat", y="Loss", color="variable", width=800,
title="Loss Comparison for One Observation when y = 1")
We can see that the negative log loss penalizes wrong predictions far more than squared loss does.

How to read this plot: suppose the observation we're trying to predict is actually in Class 1 (we verify these numbers in the sketch below).

- If our model gives an 80% chance of being in Class 1, the loss is relatively small (around 0.22).
- If we predict only a 40% chance of being in Class 1, the loss is larger (around 0.92).
- If we predict only a 5% chance of being in Class 1, the loss is around 3.
- And if we give a 0% chance of being in Class 1, the loss is infinite.
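A quick numerical check of these values (a sketch; we simply evaluate $-\log{\hat{p}}$ directly):

# Evaluate the negative log loss at the predicted probabilities discussed above
for p in [0.8, 0.4, 0.05]:
    print(f"p_hat = {p:.2f} -> loss = {-np.log(p):.2f}")
# p_hat = 0.80 -> loss = 0.22
# p_hat = 0.40 -> loss = 0.92
# p_hat = 0.05 -> loss = 3.00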
What about when the true observation is 0? Consider the single-datapoint loss given by $l=-\log{(1-\hat{p})}$.
p_hat_loss = pd.DataFrame({"p_hat": np.arange(0.001, 0.999, 0.01)})
p_hat_loss["L2 Loss"] = (1 - (1-p_hat_loss["p_hat"]))**2
p_hat_loss["Neg Log Loss"] = -np.log(1 - p_hat_loss["p_hat"])
px.line(p_hat_loss.melt(id_vars="p_hat", value_name="Loss"),
x="p_hat", y="Loss", color="variable", width=800,
title="Loss Comparison for One Observation when y = 0")
Much of the formal derivation is in the slides. The cross-entropy loss for a single observation is:

$$\textrm{loss} = -y \log(\hat{y}) - (1-y)\log(1-\hat{y}) = -\left(y \log(\hat{y}) + (1-y)\log(1-\hat{y})\right)$$

When $y=1$ this reduces to $-\log(\hat{y})$, and when $y=0$ it reduces to $-\log(1-\hat{y})$: exactly the two negative log losses we examined above. For us, since $\hat{y} = \sigma(x^T \theta)$, the expression for average cross-entropy loss is:
$$R(\theta) = -\frac{1}{n} \sum_{i = 1}^n \big(y_i \log (\sigma(\mathbb{X}_i^T \theta)) + (1 - y_i) \log (1 - \sigma(\mathbb{X}_i^T \theta))\big)$$
Let's look at the loss surface for average cross-entropy loss on our toy data from before.
def cross_entropy(y, p_hat):
    return -y * np.log(p_hat) - (1 - y) * np.log(1 - p_hat)

def mean_cross_entropy_on_toy_data(theta):
    p_hat = sigmoid(toy_df["x"] * theta)
    return np.mean(cross_entropy(toy_df["y"], p_hat))
theta_loss["Cross-Entropy"] = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data).dropna()
px.line(theta_loss, x="theta", y="Cross-Entropy", width=800,
title="Cross-Entropy on Toy Classification Data")
/opt/homebrew/Caskroom/mambaforge/base/envs/py_3_11/lib/python3.11/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log
(warning repeated for many values of theta)
Depending on where we run this code, we will get error messages and strange behavior for extreme values of $\theta$. While the above equations are correct, they are not numerically stable: for large $|\theta|$, the sigmoid saturates to exactly 0 or 1 in floating point, and the loss then takes $\log(0) = -\infty$. We need to rewrite the loss function in a more numerically stable way.
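To see the failure concretely, here is a minimal sketch using the sigmoid function defined earlier:

# For even moderately large inputs, the sigmoid rounds to exactly 1.0 in
# float64, so the (1 - y) * log(1 - p_hat) term becomes log(0) = -inf.
p = sigmoid(40.0)
print(p == 1.0)       # True: 1 / (1 + e^-40) rounds to 1.0
print(np.log(1 - p))  # -inf, with a divide-by-zero RuntimeWarning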
The following derivation is out of scope for Data 100, but good to know for life.

The following is a more numerically stable implementation of the cross-entropy loss. Let $z = \mathbb{X}_i^T \theta$ and use the identity $\log(1-\sigma(z)) = -z + \log(\sigma(z))$, which follows from $1 - \sigma(z) = e^{-z}\sigma(z)$:
\begin{align} R(\theta) &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) + (1 - y_i) \log (1 - \sigma(z))\right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) + (1 - y_i) \left( -z + \log (\sigma(z))\right)\right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) - z + \log \left(\sigma(z) \right)+ y_i z - y_i\log \left(\sigma(z) \right) \right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left( \left(y_i - 1 \right)z + \log \left(\sigma(z) \right) \right)\\ \end{align}
We can further optimize this by using the identity $\log(\sigma(z)) = -\log(1 + e^{-z})$ and applying more numerically stable log implementations:
\begin{align} R(\theta) &= -\frac{1}{n} \sum_{i = 1}^n \left( \left( y_i - 1\right)z - \log(1 + e^{-z}) \right)\\ \end{align}
def mean_cross_entropy_on_toy_data(theta):
    y = toy_df["y"]
    z = toy_df["x"] * theta
    # np.log1p(x) computes log(1 + x) in a numerically stable way
    return -np.mean((y - 1) * z - np.log1p(np.exp(-z)))
theta_loss["Cross-Entropy"] = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data).dropna()
px.line(theta_loss.melt(id_vars="theta", value_name="Loss"),
x="theta", y="Loss", color="variable",
title="Cross-Entropy on Toy Classification Data")
The above loss surface is convex, so the optimizer will find the global minimum regardless of where it starts!
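As a sanity check (a sketch): at moderate values of $\theta$, where the naive implementation is still finite, the two implementations should agree.

# Compare the naive and numerically stable implementations at theta = 1
naive = np.mean(cross_entropy(toy_df["y"], sigmoid(toy_df["x"] * 1.0)))
print(np.isclose(naive, mean_cross_entropy_on_toy_data(1.0)))  # True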
best_ce_theta = minimize(mean_cross_entropy_on_toy_data, x0 = -5)["x"][0]
best_ce_theta
np.float64(0.7432351403820119)
fig = px.line(theta_loss.melt(id_vars="theta", value_name="Loss"),
x="theta", y="Loss", color="variable",
title="Cross-Entropy on Toy Classification Data")
fig.add_scatter(x=[best_theta], y=[mse_on_toy_data(best_theta)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_1: {best_theta:.2f}")
fig.add_trace(go.Scatter(x=[best_ce_theta], y=[mean_cross_entropy_on_toy_data(best_ce_theta)],
mode="markers", marker_size=10, marker_color="Blue",
name=f"CE Theta: {best_ce_theta:.2f}"))
Finally, we can see what our new model looks like using the correct loss function.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
    x=xs, y=sigmoid(xs * best_theta),
    mode="lines", line_color="red",
    name="LR + MSE Loss"))
fig.add_trace(go.Scatter(
    x=xs, y=sigmoid(xs * best_ce_theta),
    mode="lines", line_color="blue",
    name="LR + CE Loss"))
fig.update_traces(marker_size=20)