⛹️ Lecture 22, Logistic Regression I – Data 100, Spring 2025¶
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
px.defaults.width = 800
# Set default plotly layout
pio.templates["plotly"].layout.font.size = 22
from scipy.optimize import minimize
import sklearn.linear_model as lm
from sklearn.metrics import r2_score
In this lecture, we will work with the basketball dataset, which contains information about games played in the NBA. Our goal is to predict whether or not a team wins its game ("WON") given its "GOAL_DIFF".
The variable "GOAL_DIFF" is the difference in field goal success rates between the two teams competing in a game. (Field goals are any shots other than free throws.) A positive "GOAL_DIFF" means a team made a higher proportion of its field goal attempts than its opponent did; for example, if a team made 46% of its field goals while its opponent made 43%, its "GOAL_DIFF" would be 0.03. Keep in mind that a team could have a low field goal success rate and still win the game, as long as it took more field goal attempts than its opponent.
Below, we first load the raw data. Then, in the second cell, we clean it into a useful form, which we store as the DataFrame games.
basketball = pd.read_csv("data/nba.csv")
basketball.head()
| | SEASON_ID | TEAM_ID | TEAM_ABBREVIATION | TEAM_NAME | GAME_ID | GAME_DATE | MATCHUP | WL | MIN | FGM | ... | DREB | REB | AST | STL | BLK | TOV | PF | PTS | PLUS_MINUS | VIDEO_AVAILABLE |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 22017 | 1610612744 | GSW | Golden State Warriors | 21700002 | 2017-10-17 | GSW vs. HOU | L | 240 | 43 | ... | 35 | 41 | 34 | 5 | 9 | 17 | 25 | 121 | -1 | 1 |
1 | 22017 | 1610612745 | HOU | Houston Rockets | 21700002 | 2017-10-17 | HOU @ GSW | W | 240 | 47 | ... | 33 | 43 | 28 | 9 | 5 | 13 | 16 | 122 | 1 | 1 |
2 | 22017 | 1610612738 | BOS | Boston Celtics | 21700001 | 2017-10-17 | BOS @ CLE | L | 240 | 36 | ... | 37 | 46 | 24 | 11 | 4 | 12 | 24 | 99 | -3 | 1 |
3 | 22017 | 1610612739 | CLE | Cleveland Cavaliers | 21700001 | 2017-10-17 | CLE vs. BOS | W | 240 | 38 | ... | 41 | 50 | 19 | 3 | 4 | 17 | 25 | 102 | 3 | 1 |
4 | 22017 | 1610612750 | MIN | Minnesota Timberwolves | 21700011 | 2017-10-18 | MIN @ SAS | L | 240 | 37 | ... | 31 | 42 | 23 | 7 | 4 | 13 | 16 | 99 | -8 | 1 |
5 rows × 29 columns
basketball = pd.read_csv("data/nba.csv")
# Extract the team names from each game-team, and organize the data so that
# each row corresponds to a game, not a game-team combo.
first_team = basketball.groupby("GAME_ID").first()
second_team = basketball.groupby("GAME_ID").last()
games = first_team.merge(second_team, left_index = True, right_index = True, suffixes = ["", "_OPP"])
# Compute the field goal success rate
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
games['WON'] = (games['WL'] == "W").astype(int)
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'GOAL_DIFF']]
games
| GAME_ID | TEAM_NAME | TEAM_NAME_OPP | MATCHUP | WON | WL | GOAL_DIFF |
|---|---|---|---|---|---|---|
21700001 | Boston Celtics | Cleveland Cavaliers | BOS @ CLE | 0 | L | -0.049 |
21700002 | Golden State Warriors | Houston Rockets | GSW vs. HOU | 0 | L | 0.053 |
21700003 | Charlotte Hornets | Detroit Pistons | CHA @ DET | 0 | L | -0.030 |
21700004 | Indiana Pacers | Brooklyn Nets | IND vs. BKN | 1 | W | 0.041 |
21700005 | Orlando Magic | Miami Heat | ORL vs. MIA | 1 | W | 0.042 |
... | ... | ... | ... | ... | ... | ... |
21701226 | New Orleans Pelicans | San Antonio Spurs | NOP vs. SAS | 1 | W | 0.189 |
21701227 | Oklahoma City Thunder | Memphis Grizzlies | OKC vs. MEM | 1 | W | 0.069 |
21701228 | LA Clippers | Los Angeles Lakers | LAC vs. LAL | 0 | L | 0.017 |
21701229 | Utah Jazz | Portland Trail Blazers | UTA @ POR | 0 | L | -0.090 |
21701230 | Houston Rockets | Sacramento Kings | HOU @ SAC | 0 | L | -0.097 |
1230 rows × 6 columns
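As a quick sanity check on the reshaping (a minimal sketch, assuming the cell above has run): every GAME_ID should now appear exactly once, and "WON" should agree with the "WL" column.
# Sanity checks on the reshaped data (uses the `games` DataFrame from above)
assert games.index.is_unique                          # one row per GAME_ID
assert (games["WON"] == (games["WL"] == "W")).all()   # label agrees with the W/L column
print(games.shape)                                    # (1230, 6): one row per game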
🪙 Logistic Regression¶
Note: We won't work through this section together in the lecture, since the slides provide the same information.
If we visualize our data, we see a very different pattern from the scatter plots we saw in simple linear regression.
Because a team can only win or lose a game, the only possible values of "WON" are 1 (if the team won the game) and 0 (if the team lost).
px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
Because the only possible values of "WON"
are 0 or 1, the visualization above shows significant overplotting.
We can modify the transparency of the points to better see the data density:
# Increase the transparency of the points
px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
opacity=0.1
)
Ordinary least squares (OLS) regression is intended to output continuous numeric predictions.
Given our binary output data, OLS fits the data poorly.
- However, it is common in the social sciences to fit an OLS model like this due to the nice interpretability of OLS coefficients, even though the fit is not as good as logistic regression. See Linear Probability Models.
# Fitting a linear regression model to the data
X = games[["GOAL_DIFF"]]
Y = games["WON"]
least_squares_model = lm.LinearRegression()
least_squares_model.fit(X, Y)
# Make some predictions for a range of GOAL_DIFF values
pred = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3)})
pred["LS_Pred"] = least_squares_model.predict(pred)
# Visualize the model
fig = px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL",
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
opacity=0.1)
fig.add_trace(go.Scatter(x=pred["GOAL_DIFF"], y=pred["LS_Pred"],
mode="lines", name="Least Squares Fit"))
OLS fits the data poorly. We will need a new approach to modeling binary outcomes.
Instructor Note: Return to Lecture!
🗑️ Binning and Averaging¶
Note: We won't work through this section together in the lecture, since the slides provide the same information.
Back in Data 8, you built up your understanding of linear regression by first considering the graph of averages.
We construct a graph of averages by binning all $x$ data into bins of similar values, then computing the average value of $y$ for each bin. This graph gives us a rough indication of the relationship between $x$ and $y$.
Let's try this on our dataset.
# Break our GOAL_DIFF values into 20 equally-spaced bins from min to max
# `bins` labels each row with the interval (lower, upper] it falls into
# `cuts` contains the bin edges
# retbins=True tells pd.cut to also return the bin edges
bins, cuts = pd.cut(games["GOAL_DIFF"], 20, retbins=True)
print('Edges of the 20 bins:')
print(cuts)
print()
print('First 5 GOAL_DIFF values:')
print(games["GOAL_DIFF"][:5])
print()
print('First 5 observations categorized into bins:')
print(bins[:5])
Edges of the 20 bins:
[-0.251532 -0.2244 -0.1978 -0.1712 -0.1446 -0.118 -0.0914 -0.0648 -0.0382
 -0.0116 0.015 0.0416 0.0682 0.0948 0.1214 0.148 0.1746 0.2012 0.2278
 0.2544 0.281 ]

First 5 GOAL_DIFF values:
GAME_ID
21700001   -0.049
21700002    0.053
21700003   -0.030
21700004    0.041
21700005    0.042
Name: GOAL_DIFF, dtype: float64

First 5 observations categorized into bins:
GAME_ID
21700001    (-0.0648, -0.0382]
21700002      (0.0416, 0.0682]
21700003    (-0.0382, -0.0116]
21700004       (0.015, 0.0416]
21700005      (0.0416, 0.0682]
Name: GOAL_DIFF, dtype: category
Categories (20, interval[float64, right]): [(-0.252, -0.224] < (-0.224, -0.198] < (-0.198, -0.171] < (-0.171, -0.145] ... (0.175, 0.201] < (0.201, 0.228] < (0.228, 0.254] < (0.254, 0.281]]
# Join the bins to the original data
games.join(bins, rsuffix="_bins").head()
| GAME_ID | TEAM_NAME | TEAM_NAME_OPP | MATCHUP | WON | WL | GOAL_DIFF | GOAL_DIFF_bins |
|---|---|---|---|---|---|---|---|
21700001 | Boston Celtics | Cleveland Cavaliers | BOS @ CLE | 0 | L | -0.049 | (-0.0648, -0.0382] |
21700002 | Golden State Warriors | Houston Rockets | GSW vs. HOU | 0 | L | 0.053 | (0.0416, 0.0682] |
21700003 | Charlotte Hornets | Detroit Pistons | CHA @ DET | 0 | L | -0.030 | (-0.0382, -0.0116] |
21700004 | Indiana Pacers | Brooklyn Nets | IND vs. BKN | 1 | W | 0.041 | (0.015, 0.0416] |
21700005 | Orlando Magic | Miami Heat | ORL vs. MIA | 1 | W | 0.042 | (0.0416, 0.0682] |
We can visualize the bins:
fig = px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL", opacity=0.1,
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
for cut in cuts:
fig.add_vline(x=cut, line_dash="dash", line_color="black", opacity=0.5)
fig.show()
For each bin, we can compute the win rate within that bin. We do this by grouping the rows by the center of the bin they are assigned to and averaging their "WON" labels.
# Compute the bin center for every game.
# This way, all the games that are in the same bin will have the same bin_center.
games['bin_center'] = bins.apply(lambda x: (x.left + x.right)/2).astype(float)
# We group by bin center and compute the average of the label to
# get the win rate for each bin.
win_rates_by_bin = (
games[["bin_center", "WON"]]
.groupby("bin_center")
.mean()
.rename(columns={"WON": "Win Rate"})
)
win_rates_by_bin
| bin_center | Win Rate |
|---|---|
-0.2380 | 0.000000 |
-0.2110 | 0.000000 |
-0.1845 | 0.000000 |
-0.1580 | 0.000000 |
-0.1315 | 0.000000 |
-0.1047 | 0.033898 |
-0.0781 | 0.083333 |
-0.0515 | 0.148438 |
-0.0249 | 0.363636 |
0.0017 | 0.505747 |
0.0283 | 0.705128 |
0.0549 | 0.792793 |
0.0815 | 0.907407 |
0.1079 | 0.984615 |
0.1345 | 1.000000 |
0.1615 | 1.000000 |
0.1880 | 1.000000 |
0.2410 | 1.000000 |
0.2675 | 1.000000 |
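Each "Win Rate" above is simply the proportion of games won among the games falling in that bin; here is a minimal check for one bin (a sketch, assuming the games and win_rates_by_bin objects from the cells above):
# Recompute the win rate for one bin directly to confirm it is just a proportion
center = win_rates_by_bin.index[9]             # an arbitrary bin center near 0
in_bin = games[games["bin_center"] == center]
print(center, in_bin["WON"].mean(), "over", len(in_bin), "games")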
# Visualize the model
fig = px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL", opacity=0.1,
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['Win Rate'],
mode="markers+lines", name="Win Rate by Bin"))
for cut in cuts:
fig.add_vline(x=cut, line_dash="dash", line_color="black", opacity=0.1)
fig.show()
Our graph of averages has revealed an S-shaped curve. This doesn't look like anything we've encountered before!
🪗 Modeling the S-shaped curve¶
Note: We won't work through this section together in the lecture, since the slides provide the same information.
The relationship between $x$ ("GOAL_DIFF") and $y$ ("WON") shows clear non-linearity.
A few observations:
- All predictions on our curve are between 0 and 1.
- To compute the average for each bin, we calculated:
$$\frac{\#\:Y=1\:\text{in bin}}{\#\:\text{datapoints in bin}} = P(Y=1 | \text{bin}).$$
Together, these observations indicate that our graph of averages is actually modeling the probability of a data point having $Y = 1$!
- In other words, rather than predicting a continuous numeric output on $(-\infty, \infty)$ akin to OLS, we now want to predict the probability of a datapoint belonging to Class 1 (i.e., the team winning the game) on $[0,1]$.
So, we need to identify a function that maps continuous inputs on $(-\infty, \infty)$ to outputs on $[0,1]$.
The logarithm maps inputs on $(0, \infty)$ to outputs on $(-\infty, \infty)$, so it won't work for our purposes:
# Plot the logarithm function
x = np.linspace(-1, 3, 100)
y = np.log(x)
fig = px.line(x=x, y=y)
fig.add_hline(y=0, line_color="black", line_dash="dash", opacity=0.5)
fig.add_vline(x=0, line_color="black", line_dash="dash", opacity=0.5)
fig.update_layout(
xaxis_title="x",
yaxis_title="log(x)",
title="Logarithm Function",
width=800,
height=400
)
The exponential function maps inputs on $(-\infty, \infty)$ to outputs on $(0, \infty)$, so it also won't work:
# Plot the exponential function
x = np.linspace(-1, 3, 100)
y = np.exp(x)
fig = px.line(x=x, y=y)
fig.add_hline(y=0, line_color="black", line_dash="dash", opacity=0.5)
fig.add_vline(x=0, line_color="black", line_dash="dash", opacity=0.5)
fig.update_layout(
xaxis_title="x",
yaxis_title="exp(x)",
title="Exponential Function",
width=800,
height=400
)
The sigmoid ($\sigma$) function is defined as:
$$ \sigma(x) = \frac{1}{1+e^{-x}} = \frac{e^x}{1+e^x} $$
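As a quick numeric check of the two equivalent forms (a minimal sketch, reusing the numpy import from the top of the notebook):
# Confirm the two forms of the sigmoid agree and stay strictly inside (0, 1)
x_check = np.linspace(-20, 20, 9)
form1 = 1 / (1 + np.exp(-x_check))
form2 = np.exp(x_check) / (1 + np.exp(x_check))
print(np.allclose(form1, form2))              # the two expressions match
print(form1.min() > 0, form1.max() < 1)       # outputs never reach 0 or 1 exactly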
The sigmoid function maps inputs on $(-\infty, \infty)$ to outputs on $(0, 1)$, akin to our S-shaped curve:
# Plot the sigmoid function
x = np.linspace(-10, 10, 1000)
y = 1/(1+np.exp(-x))
fig = px.line(x=x, y=y)
fig.add_hline(y=0, line_color="black", line_dash="dash", opacity=0.5)
fig.add_vline(x=0, line_color="black", line_dash="dash", opacity=0.5)
fig.update_layout(
xaxis_title="x",
yaxis_title="sigmoid(x)",
title="Sigmoid Function",
width=800,
height=400
)
As it turns out, the logistic regression model fits a sigmoid function to the data. For data with binary (0/1) outputs, a sigmoid fit is far better suited than a straight-line fit like SLR.
Below, we fit a logistic regression model to the basketball data. Don't worry about what's going on "under the hood" for now. Just notice how similar the logistic regression model is to our graph of averages.
logistic_model = lm.LogisticRegression(C=20)
logistic_model.fit(X, Y)
pred["Logistic_Pred"] = logistic_model.predict_proba(pred[["GOAL_DIFF"]])[:,1]
# Visualize a logistic regression model superimposed on the data
fig = px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL", opacity=0.1,
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
# Add the binned predictions
fig.add_trace(go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin['Win Rate'],
mode="markers+lines", name="Win Rate by Bin"))
# Add the logistic regression model predictions
fig.add_trace(go.Scatter(x=pred["GOAL_DIFF"], y=pred["Logistic_Pred"],
mode="lines", name="Logistic Regression Model",
line_color="black"))
fig.show()
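Under the hood, for a binary problem, predict_proba computes $\sigma(\theta_0 + \theta_1 x)$ for Class 1 using the fitted intercept_ and coef_. Here is a minimal check against our stored predictions (a sketch, assuming the cells above have run):
# Reconstruct the class-1 probabilities by hand from the fitted parameters
theta_0 = logistic_model.intercept_[0]
theta_1 = logistic_model.coef_[0, 0]
manual = 1 / (1 + np.exp(-(theta_0 + theta_1 * pred["GOAL_DIFF"])))
print(np.allclose(manual, pred["Logistic_Pred"]))   # manual sigmoid matches predict_proba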
Just as the OLS hyperplane can use multiple features, the sigmoid function is not limited to a single input:
# Plot a 3D sigmoid surface
x1 = np.linspace(-10, 10, 100)
x2 = np.linspace(-10, 10, 100)
X1, X2 = np.meshgrid(x1, x2)
Z = 1/(1 + np.exp(-(X1 + X2)))  # use Z so we don't overwrite the target Y from earlier
# Create a 3D surface plot for the sigmoid function
fig = go.Figure(data=[go.Surface(z=Z, x=X1[0], y=X2[:, 0])])
# Update layout for better visualization
fig.update_layout(
title="Sigmoid with two inputs",
scene=dict(
xaxis_title="X1",
yaxis_title="X2",
zaxis_title="\u03C3(X1, X2)"
),
width=800,
height=600
)
# Reduce tick label size
fig.update_layout(
scene=dict(
xaxis=dict(tickfont=dict(size=14)),
yaxis=dict(tickfont=dict(size=14)),
zaxis=dict(tickfont=dict(size=14))
)
)
fig.show()
# Simplified version of earlier plot used in the lecture slides
fig = px.scatter(games,
x="GOAL_DIFF", y="WON", color="WL", opacity=0.1,
hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
# change x label
labels={"GOAL_DIFF": "X",
"WON": "P(Y=1|X)"},
)
# Add the logistic regression model predictions
fig.add_trace(go.Scatter(x=pred["GOAL_DIFF"], y=pred["Logistic_Pred"],
mode="lines", name="Logistic Regression Model",
line_color="black"))
# Lecture figure of the model with the log odds outcome
fig = px.line(x=pred["GOAL_DIFF"],
y=np.log(pred["Logistic_Pred"] / (1-pred["Logistic_Pred"])),
labels={"x": "X", "y": "log Odds(Y=1|X)"})
fig.update_traces(line=dict(color='black'))
# add horizontal line at y=0
fig.add_hline(y=0, line_color="black", line_dash="dash", opacity=0.5)
fig.show()
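Why does the log-odds plot come out as a straight line? Inverting the sigmoid shows that the log-odds of the predicted probability is exactly the linear part of the model:
$$\hat{y} = \sigma(x^T\theta) = \frac{1}{1+e^{-x^T\theta}} \implies \frac{\hat{y}}{1-\hat{y}} = e^{x^T\theta} \implies \log\left(\frac{\hat{y}}{1-\hat{y}}\right) = x^T\theta$$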
⚔️ Cross-Entropy Loss¶
Note: We won't work through this section together in the lecture, since the slides provide the same information.
To identify the optimal model parameters for our logistic regression model, we need to define a loss function. We might be inclined to use our familiar mean squared error (MSE). It turns out this is a bad idea.
In the cell below, we artificially generate a "toy" dataset to explore the loss of a logistic regression model. We'll try to use the "x" feature to predict the "y" target.
toy_df = pd.DataFrame({
"x": [-4, -2, -0.5, 1, 3, 5],
"y": [0, 0, 1, 0, 1, 1]
})
toy_df["str_y"] = toy_df["y"].astype(str)
toy_df.sort_values("x")
| | x | y | str_y |
|---|---|---|---|
0 | -4.0 | 0 | 0 |
1 | -2.0 | 0 | 0 |
2 | -0.5 | 1 | 1 |
3 | 1.0 | 0 | 0 |
4 | 3.0 | 1 | 1 |
5 | 5.0 | 1 | 1 |
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
fig.update_traces(marker_size=20)
Let's plot the loss surface for this toy data using MSE with the model $\hat{y} = \sigma(x\theta)$. We don't include an intercept term, so $\theta$ and $x$ are both scalars.
# Use the sigmoid function as the example model
def sigmoid(x_theta):
return 1/(1+np.e**-x_theta)
# Compute the MSE of the fitted model for a given theta
def mse_on_toy_data(theta):
p_hat = sigmoid(toy_df['x'] * theta)
return np.mean((toy_df['y'] - p_hat)**2)
# Compute the MSE for a range of theta values
theta_loss = pd.DataFrame({"theta": np.linspace(-10, 10, 100)})
theta_loss["MSE"] = theta_loss["theta"].apply(mse_on_toy_data)
px.line(theta_loss, x="theta", y="MSE", width=800,
title="MSE on Toy Classification Data")
This loss surface is not convex! In other words, there are multiple local minima in the loss surface. This means that, depending on where we start our optimization search, we'll end up with different results for the optimizing $\theta$. Let's explore with scipy.optimize.minimize.
# Set the initial guess as theta = 0
best_theta = minimize(mse_on_toy_data, x0 = 0)["x"][0]
best_theta
0.5446601851078255
This "optimized" value of $\theta$ produces the following model when we apply it to our model $\hat{y} = \sigma(\theta x )$.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_theta),
mode="lines", line_color="black",
name=f"LR Model: theta = {best_theta:.2f}"))
fig.update_traces(marker_size=20)
Let's try a different starting point for the initial guess for the minimizing parameter value.
# Set the initial guess as theta = -5
best_theta_2 = minimize(mse_on_toy_data, x0 = -5)["x"][0]
best_theta_2
-10.343653061026611
Uh oh, looks like the optimizer got stuck at a local minimum of the loss surface. If we use this guess for the optimal $\theta$ in our logistic regression model, we see strange behavior.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_theta_2),
mode="lines", line_color="black",
name=f"LR Model: theta = {best_theta_2:.2f}"))
fig.update_traces(marker_size=20)
To see what went wrong, let's plot these two "optimized" guesses for $\hat{\theta}$ on the original loss surface. They correspond to the local and global minimum of the loss surface.
fig = px.line(theta_loss, x="theta", y="MSE", width=800,
title="MSE on Toy Classification Data")
fig.add_scatter(x=[best_theta], y=[mse_on_toy_data(best_theta)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_1: {best_theta:.2f}")
fig.add_scatter(x=[best_theta_2], y=[mse_on_toy_data(best_theta_2)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_2: {best_theta_2:.2f}")
We've seen now that MSE is not convex for logistic regression, which leads to difficulty in optimizing $\hat{\theta}$.
Beyond this issue, the squared loss isn't well-suited for a probability task. Since $\hat{p}_i$ is between 0 and 1, and $y_i$ is either 0 or 1, the squared loss for a single point $(y_i - \hat{p}_i)^2$ is bounded between 0 and 1.
What this means in practice: Even if our prediction is terrible, the squared loss is never that large.
- Consider the "worst-case scenario" where the true class $y_i$ of datapoint $i$ is 0, and the model predicts a probability $p_i=1$ that this datapoint belongs to Class 1. Even though our model has made the worst possible prediction, the squared loss is only $(0-1)^2=1$!
p_hat_loss = pd.DataFrame({"p_hat": np.arange(0.001, 0.999, 0.01)})
p_hat_loss["L2 Loss"] = (1 - p_hat_loss["p_hat"])**2
px.line(p_hat_loss, x="p_hat", y="L2 Loss", width=800,
title="Squared Loss for One Individual when y=1")
Motivating Cross-Entropy Loss¶
Let's look at a new loss, called the negative log loss, for when our true observation is 1. We define the loss on a single datapoint as $\ell = -\log{\hat{p}}$, where $\hat{p}$ is the predicted probability that the datapoint belongs to Class 1.
p_hat_loss["Neg Log Loss"] = -np.log(p_hat_loss["p_hat"])
px.line(p_hat_loss.melt(id_vars="p_hat", value_name="Loss"),
x="p_hat", y="Loss", color="variable", width=800,
title="Loss Comparison for One Observation when y = 1")
We can see that this penalizes wrong predictions far more than squared loss does.
How to read this plot: Suppose the observation we're trying to predict is actually in Class 1.
- If our model gives an 80% chance of being in Class 1, the loss is relatively small (around 0.22).
- If we predict only a 40% chance of being in Class 1, the loss is larger (around 1).
- If we predict only a 5% chance of being in Class 1, the loss is 3.
- And if we give a 0% chance of being in Class 1, the loss is infinite!
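These values are easy to verify numerically (a quick sketch, reusing the numpy import from the top of the notebook):
# Negative log loss for a few predicted probabilities when the true class is 1
for p in [0.8, 0.4, 0.05]:
    print(f"p_hat = {p:4.2f}  ->  -log(p_hat) = {-np.log(p):.2f}")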
What about when the true observation is 0? Consider the single-datapoint loss given by $\ell=-\log{(1-\hat{p})}$.
p_hat_loss = pd.DataFrame({"p_hat": np.arange(0.001, 0.999, 0.01)})
p_hat_loss["L2 Loss"] = (1 - (1-p_hat_loss["p_hat"]))**2
p_hat_loss["Neg Log Loss"] = -np.log(1 - p_hat_loss["p_hat"])
px.line(p_hat_loss.melt(id_vars="p_hat", value_name="Loss"),
x="p_hat", y="Loss", color="variable", width=800,
title="Loss Comparison for One Observation when y = 0")
Much of the formal derivation is in the slides. The equation for cross-entropy loss for a single observation is:
$$\textrm{loss}(y,\hat{y}) = \textcolor{red}{-y \log(\hat{y})} \textcolor{orange}{- (1-y)\log(1-\hat{y})} = -\left(\textcolor{red}{y \log(\hat{y})} + \textcolor{orange}{(1-y)\log(1-\hat{y})}\right)$$
Notice that the $\textcolor{red}{\text{red}}$ term is zeroed out (i.e., ignored) when $y=0$, and the $\textcolor{orange}{\text{orange}}$ term is zeroed out when $y=1$.
- So, this one equation can be used to compute loss in scenarios where $y=1$ or $y=0$. We do not need two different equations.
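Here is a small numeric illustration of that piecewise behavior (a sketch; the ce_loss helper name is ours, not from the lecture):
# Single-observation cross-entropy loss; one formula covers both classes
def ce_loss(y, y_hat):
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

print(ce_loss(1, 0.9), -np.log(0.9))        # y = 1: only the first term survives
print(ce_loss(0, 0.9), -np.log(1 - 0.9))    # y = 0: only the second term survives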
In logistic regression, we define $\textcolor{cyan}{\hat{y} = \sigma(x^T \theta)}$. So, the expression for average cross-entropy loss is:
$$R(\theta) = -\frac{1}{n} \sum_{i = 1}^n \left[ \textcolor{red}{y_i \log (\textcolor{cyan}{\sigma(\mathbb{X}_i^T \theta)})} + \textcolor{orange}{(1 - y_i) \log (1 - \textcolor{cyan}{\sigma(\mathbb{X}_i^T \theta)})} \right]$$
Let's look at the loss surface for average cross-entropy loss on our toy data from before.
# Compute the cross entropy loss for a single data point and prediction
def cross_entropy(y, p_hat):
return - y * np.log(p_hat) - (1 - y) * np.log(1 - p_hat)
# Compute the average cross entropy loss for a particular theta
def mean_cross_entropy_on_toy_data(theta):
p_hat = sigmoid(toy_df["x"] * theta)
return np.mean(cross_entropy(toy_df["y"], p_hat))
theta_loss["Cross-Entropy"] = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data).dropna()
px.line(theta_loss, x="theta", y="Cross-Entropy", width=800,
title="Cross-Entropy on Toy Classification Data")
Depending on where we run this code, we will get some error messages and strange behavior for extreme values of $\theta$. While the above equations are correct, they are not numerically stable. We need to rewrite the loss function in a more numerically stable way.
The following derivation is out-of-scope for Data 100 but good to know for life.
The following is a more numerically stable implementation of the cross-entropy loss. Let $z = \mathbb{X}_i^T \theta$, and use the identity $\log( 1-\sigma(z)) = -z + \log(\sigma(z))$:
\begin{align} R(\theta) &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) + (1 - y_i) \log (1 - \sigma(z))\right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) + (1 - y_i) \left( -z + \log (\sigma(z))\right)\right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left(y_i \log (\sigma(z)) - z + \log \left(\sigma(z) \right)+ y_i z - y_i\log \left(\sigma(z) \right) \right)\\ &= -\frac{1}{n} \sum_{i = 1}^n \left( \left(y_i - 1 \right)z + \log \left(\sigma(z) \right) \right)\\ \end{align}
We can further optimize this by using the identity $\log(\sigma(z)) = -\log(1 + e^{-z})$ and applying more numerically stable log implementations:
\begin{align} R(\theta) &= -\frac{1}{n} \sum_{i = 1}^n \left( \left( y_i - 1\right)z - \log(1 + e^{-z}) \right)\\ \end{align}
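Negating the summand in the last line gives the per-observation loss $(1 - y_i)z + \log(1 + e^{-z})$, which we can spot-check against the original form $-y_i\log(\sigma(z)) - (1-y_i)\log(1-\sigma(z))$. Here is a quick sketch for a few values (reusing the sigmoid helper from earlier; only moderate $z$, so both forms stay finite):
# Check that the rearranged per-point loss matches the original for a few (y, z) pairs
for y, z in [(0, -3.0), (0, 2.0), (1, -1.5), (1, 4.0)]:
    naive = -y * np.log(sigmoid(z)) - (1 - y) * np.log(1 - sigmoid(z))
    rearranged = (1 - y) * z + np.log1p(np.exp(-z))
    print(f"y={y}, z={z: .1f}:  naive={naive:.6f}  rearranged={rearranged:.6f}")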
def mean_cross_entropy_on_toy_data(theta):
y = toy_df["y"]
z = toy_df["x"] * theta
# using the log1p numerically stable operation
return -np.mean((y - 1) * z - np.log1p(np.exp(-z)))
theta_loss["Cross-Entropy"] = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data).dropna()
px.line(theta_loss.melt(id_vars="theta", value_name="Loss"),
x="theta", y="Loss", color="variable",
title="Cross-Entropy on Toy Classification Data")
The cross-entropy loss surface is convex! No matter where we start the optimization, we will converge to the global minimum.
best_ce_theta = minimize(mean_cross_entropy_on_toy_data, x0 = -5)["x"][0]
best_ce_theta
0.7432351539080401
fig = px.line(theta_loss.melt(id_vars="theta", value_name="Loss"),
x="theta", y="Loss", color="variable",
title="Cross-Entropy on Toy Classification Data")
fig.add_scatter(x=[best_theta], y=[mse_on_toy_data(best_theta)],
mode="markers", marker_size=10, marker_color="red",
name=f"Theta_1: {best_theta:.2f}")
fig.add_trace(go.Scatter(x=[best_ce_theta], y=[mean_cross_entropy_on_toy_data(best_ce_theta)],
mode="markers", marker_size=10, marker_color="Blue",
name=f"CE Theta: {best_ce_theta:.2f}"))
Finally, we can see what our new model looks like using the correct loss function.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
xs = np.linspace(-10, 10, 100)
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_theta),
mode="lines", line_color="red",
name=f"LR + MSE Loss"))
fig.add_trace(go.Scatter(
x=xs, y=sigmoid(xs * best_ce_theta),
mode="lines", line_color="blue",
name=f"LR + CE Loss"))
fig.update_traces(marker_size=20)