import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['lines.linewidth'] = 3
sns.set()
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lm
from sklearn.metrics import r2_score
In this lecture, we will work with the basketball dataset, which contains information about basketball games played in the NBA. In the cell below, we perform data cleaning to transform the data into a useful form, which we store as the DataFrame games.
Our goal in this portion of the lecture is to predict whether or not a team wins their game ("WON") given their "GOAL_DIFF". The variable "GOAL_DIFF" represents the difference in successful field goal rates between the two teams competing in a game. A positive value for "GOAL_DIFF" means that a team made more field goals than their opponent; a negative value means that the opponent made more field goals.
basketball = pd.read_csv("nba.csv")

# The raw data has one row per team per game. Treat the first listed team as
# "our" team and the second as the opponent, then merge the two views on GAME_ID.
one_team = basketball.groupby("GAME_ID").first()
opponent = basketball.groupby("GAME_ID").last()
games = one_team.merge(opponent, left_index=True, right_index=True, suffixes=["", "_OPP"])

# GOAL_DIFF: difference in field goal percentage between the team and its opponent.
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
# Encode the win/loss column as 1/0.
games['WON'] = games['WL'].replace('L', 0).replace('W', 1)
games = games[['TEAM_NAME', 'MATCHUP', 'WON', 'GOAL_DIFF']]
games
| GAME_ID | TEAM_NAME | MATCHUP | WON | GOAL_DIFF |
|---|---|---|---|---|
| 21700001 | Boston Celtics | BOS @ CLE | 0 | -0.049 |
| 21700002 | Golden State Warriors | GSW vs. HOU | 0 | 0.053 |
| 21700003 | Charlotte Hornets | CHA @ DET | 0 | -0.030 |
| 21700004 | Indiana Pacers | IND vs. BKN | 1 | 0.041 |
| 21700005 | Orlando Magic | ORL vs. MIA | 1 | 0.042 |
| ... | ... | ... | ... | ... |
| 21701226 | New Orleans Pelicans | NOP vs. SAS | 1 | 0.189 |
| 21701227 | Oklahoma City Thunder | OKC vs. MEM | 1 | 0.069 |
| 21701228 | LA Clippers | LAC vs. LAL | 0 | 0.017 |
| 21701229 | Utah Jazz | UTA @ POR | 0 | -0.090 |
| 21701230 | Houston Rockets | HOU @ SAC | 0 | -0.097 |

1230 rows × 4 columns
If we visualize our data, we see a very different picture from the scatter plots we have been working with for linear regression. Because a team can only win or lose a game, the only possible values of "WON" are 1 (if the team won the game) or 0 (if the team lost).
sns.scatterplot(data=games, x="GOAL_DIFF", y="WON");
Because the only possible values of "WON" are 0 or 1, there is significant overplotting in the visualization above. We can use sns.stripplot to jitter the $y$ data by adding a small amount of random noise. This makes it easier to interpret the plot. It also gives us nice colors for 0 and 1.
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", order=[1, 0]);
Least squares linear regression, which is intended to output continuous numeric predictions, works poorly here (perhaps unsurprisingly).
X = games[["GOAL_DIFF"]]
Y = games["WON"]
least_squares_model = lm.LinearRegression()
least_squares_model.fit(X, Y)
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h")
xs = np.linspace(-0.3, 0.3)
# Wrap xs in a DataFrame with the same feature name used to fit the model.
plt.plot(xs, least_squares_model.predict(pd.DataFrame({"GOAL_DIFF": xs})), c="tab:green")
plt.gca().invert_yaxis();
Clearly, least squares linear regression won't work here. We will need a new approach to modeling.
Back in Data 8, you built up your understanding of linear regression by first considering the graph of averages. We construct a graph of averages by binning all $x$ data into bins of similar values, then computing the average value of $y$ for each bin. This gives us a rough indication of the relationship between $x$ and $y$.
Let's try this on our dataset.
bins = pd.cut(games["GOAL_DIFF"], 20)
games["bin"] = [(b.left + b.right) / 2 for b in bins]
win_rates_by_bin = games.groupby("bin")["WON"].mean()
# alpha makes the points transparent so we can see the curve more clearly
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", alpha=0.5)
plt.plot(win_rates_by_bin.index, win_rates_by_bin, c="tab:red")
plt.gca().invert_yaxis();
Our graph of averages has revealed an S-shaped curve. This doesn't look like anything we've encountered before!
The relationship between $x$ ("GOAL_DIFF") and $y$ ("WON") shows clear non-linearity. To apply the linear modeling techniques we've explored previously, we will need to linearize the relationship. Recall our previous approach towards transforming variables:
First, we need to decide what transformations we will perform. The S-shaped curve above doesn't resemble our bulge diagram, so we will need to apply a new transformation to linearize the relationship.
To determine what this transformation should be, we make a few observations. First, every output of the graph of averages lies between 0 and 1. Second, each bin's value is the average of the 0/1 "WON" labels in that bin, which is simply the proportion of games in that bin that were won.

Put together, these observations indicate that our graph of averages is actually modeling the probability of a datapoint having $Y = 1$! Rather than predict any continuous numeric output, we now want to predict the probability of a datapoint belonging to Class 1 (i.e., the team winning the game).
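As a quick sanity check, each bin's average of the 0/1 labels is exactly the proportion of games won in that bin, i.e., an empirical estimate of $P(Y=1)$ within the bin:

# Fraction of games won in each bin; this matches the bin averages we plotted.
prop_won = games.groupby("bin")["WON"].apply(lambda s: (s == 1).mean())
np.allclose(win_rates_by_bin, prop_won)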
To determine if it is "likely" that a datapoint belongs to Class 1, we can consider the odds:
$$\text{odds} = \frac{P(Y=1|x)}{P(Y=0|x)} = \frac{P(Y=1|x)}{1-P(Y=1|x)} = \frac{p}{1-p}$$

Here, we use $p$ to denote the probability of a datapoint belonging to Class 1. A large odds value means that it is more likely than not that the datapoint belongs to Class 1. For example, $p = 0.75$ gives odds of $0.75/0.25 = 3$, meaning Class 1 is three times as likely as Class 0.
odds = win_rates_by_bin/(1-win_rates_by_bin)
plt.plot(odds.index, odds, c="tab:blue")
plt.xlabel("GOAL_DIFF")
plt.ylabel("Odds");
Interesting – the relationship between the odds and our "GOAL_DIFF" feature appears exponential. Let's "undo" this exponential trend by taking the logarithm.
As a reminder, you should always assume that any logarithm in Data 100 is base $e$ (the natural log) unless told otherwise.
log_odds = np.log(odds)
plt.plot(log_odds.index, log_odds, c="tab:green")
plt.xlabel("GOAL_DIFF")
plt.ylabel(r"Log(Odds) =$\log{(\frac{p}{1-p})}$");
/srv/conda/envs/notebook/lib/python3.9/site-packages/pandas/core/arraylike.py:397: RuntimeWarning: divide by zero encountered in log result = getattr(ufunc, method)(*inputs, **kwargs)
We've found a linear relationship! It turns out that the logarithm of the odds shares a linear relationship with our "GOAL_DIFF" feature. This means that we can express the log-odds as a linear combination of the features:

$$\log\left(\frac{p}{1-p}\right) = x^T \theta$$
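As a rough check of this linearity, we can fit a straight line to the finite log-odds values. Bins whose win rate is exactly 0 or 1 have log-odds of $-\infty$ or $+\infty$ (the divide-by-zero warning above comes from taking $\log(0)$), so we drop them:

# Fit a line to the bin centers vs. their (finite) log-odds.
valid = log_odds[np.isfinite(log_odds)]
slope, intercept = np.polyfit(valid.index, valid.values, 1)
slope, intercept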
Now, we can "undo" our transformations to recover the underlying relationship between the "GOAL_DIFF"
and the probability of the datapoint belonging to Class 1. In the work below, $x^T$ represents the row vector containing all features for the $i$th datapoint. This means we can express our linear regression fit as a linear combination of the model parameters, $x^T \theta$.
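Solving the log-odds equation for $p$:

$$\log\left(\frac{p}{1-p}\right) = x^T \theta \quad\Longrightarrow\quad \frac{p}{1-p} = e^{x^T \theta} \quad\Longrightarrow\quad p = \frac{e^{x^T \theta}}{1+e^{x^T \theta}} = \frac{1}{1+e^{-x^T \theta}}$$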
Hence, the model's output, the predicted probability that the $i$th datapoint belongs to Class 1, is given by:

$$\hat{y} = P(Y=1|x) = \frac{1}{1+e^{-x^T \theta}}$$

The expression $\frac{1}{1+e^{-z}}$ is known as the sigmoid function, and is typically represented as $\sigma(z)$. We can therefore equivalently express our model as $\hat{y} = P(Y=1|x) = \sigma(x^T \theta)$. This model is known as the logistic regression model.
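As a quick numeric check (using only numpy), the sigmoid's outputs always lie strictly between 0 and 1, and $\sigma(0) = 0.5$:

z = np.array([-5, 0, 5])
1 / (1 + np.exp(-z))  # outputs lie strictly between 0 and 1; exactly 0.5 at z = 0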
When we fit a logistic regression model to our data, we produce the S-shaped curve generated by the graph of averages!
logistic_model = lm.LogisticRegression(C=20)
logistic_model.fit(X, Y)
# Again pass a DataFrame with the original feature name to predict_proba.
predicted_prob = logistic_model.predict_proba(pd.DataFrame({"GOAL_DIFF": xs}))[:, 1]
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", alpha=0.5)
plt.plot(xs, predicted_prob, c="k", lw=3, label="Logistic regression model")
plt.plot(win_rates_by_bin.index, win_rates_by_bin, lw=2, c="tab:red", label="Graph of averages")
plt.legend(loc="upper left")
plt.gca().invert_yaxis();
To select model parameters for our logistic regression model, we will need to choose a loss function. We might be inclined to use our familiar mean squared error. It turns out this is a bad idea.
In the cell below, we artificially generate a "toy" dataset to play with as we explore the loss of a logistic regression model. We'll try to use the "x" feature to predict the "y" target.
toy_df = pd.DataFrame({
"x": [-4, -2, -0.5, 1, 3, 5],
"y": [0, 0, 1, 0, 1, 1]
})
toy_df.sort_values("x")
| | x | y |
|---|---|---|
| 0 | -4.0 | 0 |
| 1 | -2.0 | 0 |
| 2 | -0.5 | 1 |
| 3 | 1.0 | 0 |
| 4 | 3.0 | 1 |
| 5 | 5.0 | 1 |
sns.scatterplot(data=toy_df, x='x', y='y', s=100)
plt.title("Toy classification data");
Let's plot the loss surface for this toy data using MSE with the model $\hat{y} = \sigma(\theta x)$. We don't include an intercept term, so $\theta$ and $x$ are both scalars.
def sigmoid(z):
    return 1/(1+np.e**-z)

def mse_on_toy_data(theta):
    p_hat = sigmoid(toy_df['x'] * theta)
    return np.mean((toy_df['y'] - p_hat)**2)
thetas = np.linspace(-10, 10, 100)
plt.plot(thetas, [mse_on_toy_data(theta) for theta in thetas])
plt.title("MSE on toy classification data")
plt.xlabel(r'$\theta$')
plt.ylabel('MSE');
This loss surface is not convex! There are both local and global minima in the loss surface. This means that, depending on where we start our optimization search, we'll end up with different results for the optimizing $\theta$. Let's explore with scipy.optimize.minimize.
# Set the initial guess as theta = 0
best_theta = minimize(mse_on_toy_data, x0 = 0)["x"][0]
best_theta
0.5446601825581691
This "optimized" value of $\theta$ produces the following model when we apply it to our model $\hat{y} = \sigma(\theta x )$.
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, sigmoid(xs * best_theta), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title("Model with " + r'$\hat{\theta} = $' + f"{best_theta:.4}");
Let's try a different starting point for the initial guess for the minimizing parameter value.
# Set the initial guess as theta = -5
best_theta_2 = minimize(mse_on_toy_data, x0 = -5)["x"][0]
best_theta_2
-10.343653061026611
Uh oh, looks like the optimizer got stuck at a local minimum of the loss surface. If we use this guess for the optimal $\theta$ in our logistic regression model, we see strange behavior.
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, sigmoid(xs * best_theta_2), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title("Model with " + r'$\hat{\theta} = $' + f"{best_theta_2:.4}");
To see what went wrong, let's plot these two "optimized" guesses for $\hat{\theta}$ on the original loss surface. They correspond to the local and global minima of the loss surface.
plt.plot(thetas, [mse_on_toy_data(theta) for theta in thetas])
plt.scatter([best_theta, best_theta_2], [mse_on_toy_data(best_theta), mse_on_toy_data(best_theta_2)], c="tab:red")
plt.title("MSE on toy classification data")
plt.xlabel(r'$\theta$')
plt.ylabel('MSE');
We've now seen that the non-convexity of MSE for logistic regression makes it difficult to optimize $\hat{\theta}$.
Beyond this issue, squared loss just isn't well-suited for a probability task. Since $\hat{p}_i$ is between 0 and 1, and $y_i$ is either 0 or 1, the squared loss for a single point $(y_i - \hat{p}_i)^2$ is bounded between 0 and 1.
What this means in practice: even if our prediction is terrible, the squared loss is never that large. Consider the "worst-case scenario" where the true class $y_i$ of datapoint $i$ is 0 and the model predicts a probability $\hat{p}_i=1$ that this datapoint belongs to Class 1. Even though our model has made the worst possible prediction, the squared loss is only $(0-1)^2=1$!
p_hat = np.arange(0.001, 0.999, 0.01)
loss = (1 - p_hat)**2
plt.plot(p_hat, loss, color='k')
plt.xlabel(r'$\sigma({x^T \theta})$')
plt.ylabel(r'$(1 - \hat{y})^2$')
plt.title('Squared Loss for One Individual when $y = 1$');
Let's look at a new loss, called the negative log loss, for when our true observation is 1. We define the loss on a single datapoint as $l = -\log{p}$.
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 1$');
We can see that this penalizes wrong predictions far more than squared loss does.
How to read this plot: suppose the observation we're trying to predict is actually in Class 1. If our model gives an 80% chance of being in Class 1, the loss is relatively small (around 0.25).
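If instead the model gives only a small chance of Class 1, the loss blows up. A few concrete values of $-\log(p)$ make this explicit:

# Negative log loss for several predicted probabilities when the true label is 1.
p_values = np.array([0.95, 0.8, 0.5, 0.2, 0.05, 0.01])
pd.DataFrame({"p": p_values, "-log(p)": -np.log(p_values)})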
What about when the true observation is 0? Consider the single-datapoint loss given by $l=-\log{(1-p)}$.
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(1 - p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(1 - p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 0$');
Much of the formal derivation is in the slides. But the equation for cross-entropy loss for a single observation is

$$\textrm{loss} = -y \log(\hat{y}) - (1-y)\log(1-\hat{y})$$

For us, since $\hat{y} = \sigma(x^T \theta)$, the expression for average cross-entropy loss is

$$R(\theta) = -\frac{1}{n} \sum_{i = 1}^n \big(y_i \log (\sigma(\mathbb{X}_i^T \theta)) + (1 - y_i) \log (1 - \sigma(\mathbb{X}_i^T \theta))\big)$$

Let's look at the loss surface for average cross-entropy loss, on our toy data from before.
def cross_entropy(y, p_hat):
    return - y * np.log(p_hat) - (1 - y) * np.log(1 - p_hat)

def mean_cross_entropy_on_toy_data(theta):
    p_hat = sigmoid(toy_df['x'] * theta)
    return np.mean(cross_entropy(toy_df['y'], p_hat))
thetas = np.linspace(-4, 4, 100)
plt.plot(thetas, [mean_cross_entropy_on_toy_data(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy Loss($\theta$)')
plt.xlabel(r'$\theta$');
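Because this surface is convex, the optimizer should land in (essentially) the same place regardless of the starting guess, unlike what we saw with MSE. A quick sketch, reusing scipy.optimize.minimize with the same two starting points as before:

# Both starting points should converge to the same minimizing theta.
best_theta_ce = minimize(mean_cross_entropy_on_toy_data, x0 = 0)["x"][0]
best_theta_ce_2 = minimize(mean_cross_entropy_on_toy_data, x0 = -5)["x"][0]
best_theta_ce, best_theta_ce_2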