import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lm
We'll continue with the games
dataset from last lecture.
basketball = pd.read_csv("nba.csv")
one_team = basketball.groupby("GAME_ID").first()
opponent = basketball.groupby("GAME_ID").last()
games = one_team.merge(opponent, left_index = True, right_index = True, suffixes = ["", "_OPP"])
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
games['WON'] = games['WL'].replace('L', 0).replace('W', 1)
games = games[['TEAM_NAME', 'MATCHUP', 'WON', 'GOAL_DIFF', 'AST']].sort_values("GOAL_DIFF")
games
GAME_ID | TEAM_NAME | MATCHUP | WON | GOAL_DIFF | AST |
---|---|---|---|---|---|
21701216 | Dallas Mavericks | DAL vs. PHX | 0 | -0.251 | 20 |
21700846 | Phoenix Suns | PHX @ GSW | 0 | -0.237 | 13 |
21700071 | San Antonio Spurs | SAS @ ORL | 0 | -0.234 | 19 |
21700221 | New York Knicks | NYK @ TOR | 0 | -0.234 | 17 |
21700306 | Miami Heat | MIA @ NYK | 0 | -0.222 | 21 |
... | ... | ... | ... | ... | ... |
21700514 | Golden State Warriors | GSW vs. UTA | 1 | 0.191 | 37 |
21700280 | Toronto Raptors | TOR @ ATL | 1 | 0.201 | 31 |
21700276 | Golden State Warriors | GSW vs. CHI | 1 | 0.228 | 36 |
21700349 | Utah Jazz | UTA vs. WAS | 1 | 0.278 | 29 |
21700204 | Atlanta Hawks | ATL vs. SAC | 1 | 0.281 | 40 |
1230 rows × 5 columns
As before, we will use the "GOAL_DIFF"
feature to classify whether a team won (1) or lost (0) their game.
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", order=[1, 0], color=sns.color_palette()[0]);
The LogisticRegression
class of sklearn.linear_model
behaves very similarly to the LinearRegression
class. As before, we initialize the model, fit it to our data, and then use it to make predictions.
Now, rather than predict a numeric output, we predict the probability of a datapoint belonging to Class 1. We do this using the .predict_proba
method.
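Concretely, the fitted one-feature model estimates

$$ \hat{P}_{\theta}(Y = 1 | x) = \sigma(\theta_0 + \theta_1 \cdot \text{GOAL\_DIFF}) = \frac{1}{1 + e^{-(\theta_0 + \theta_1 \cdot \text{GOAL\_DIFF})}} $$

which is the same sigmoid-of-a-linear-function form we plot later in this notebook.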
X = games[["GOAL_DIFF"]]
Y = games["WON"]
model = lm.LogisticRegression()
model.fit(X, Y)
# Preview the first 10 rows
model.predict_proba(X)[:10]
array([[0.9521269 , 0.0478731 ],
       [0.94399293, 0.05600707],
       [0.94208808, 0.05791192],
       [0.94208808, 0.05791192],
       [0.93384531, 0.06615469],
       [0.9140087 , 0.0859913 ],
       [0.91213212, 0.08786788],
       [0.90526966, 0.09473034],
       [0.904251  , 0.095749  ],
       [0.90007786, 0.09992214]])
By default, .predict_proba
returns a 2D array: one column contains the predicted probability that the datapoint belongs to Class 0, and the other contains the predicted probability that it belongs to Class 1 (notice that all rows sum to a total probability of 1). To check which is which, we can use the .classes_
attribute.
model.classes_
array([0, 1])
This tells us that the first column contains the probabilities of belonging to Class 0 (losing the game), and the second column contains the probabilities of belonging to Class 1 (winning). Let's grab just the probabilities of Class 1.
We then apply a decision rule: predict Class 1 if the predicted probability of belonging to Class 1 is 0.5 or higher. Otherwise, predict Class 0.
# Grab P(Y=1|x) from the output
p = model.predict_proba(X)[:, 1]
# Apply decision rule: predict Class 1 if P(Y=1|x) >= 0.5
(p >= 0.5).astype(int)
array([0, 0, 0, ..., 1, 1, 1])
The .predict
method of LogisticRegression
will automatically apply a 0.5 threshold to classify data.
# .predict will automatically apply a 0.5 threshold for a logistic regression model
classes = model.predict(X)
classes
array([0, 0, 0, ..., 1, 1, 1])
Let's visualize our predictions.
games["Predicted Class"] = classes
def sigmoid(z):
return 1/(1+np.exp(-z))
x = np.linspace(-0.3, 0.3)
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", hue="Predicted Class", orient="h")
plt.plot(x, sigmoid(model.intercept_ + model.coef_[0]*x), "k", label="P(Y=1|x)")
plt.gca().invert_yaxis();
Any time the predicted probability $p$ is less than 0.5, the model predicts Class 0. Otherwise, it predicts Class 1.
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", hue="Predicted Class", orient="h")
plt.plot(x, sigmoid(model.intercept_ + model.coef_[0]*x), "k", label="P(Y=1|x)")
plt.plot([-0.3, 0], [0.5, 0.5], "gray")
plt.plot([0, 0], [1, 0], "gray")
plt.gca().invert_yaxis()
plt.annotate("P(Y=1|x) = 0.5", (-0.3, 0.55));
A decision boundary describes the line that splits the data into classes based on its features.
For a model with one feature, the decision boundary is a point that separates the two classes. We visualize this using a rugplot that shows all datapoints in terms of just the feature – the decision boundary is defined only in terms of the features, not the predictions, so we remove the prediction axis from our plot.
Notice that all datapoints to the right of our decision boundary are classified as Class 1, while all datapoints to the left are classified as Class 0.
# Determine the decision boundary
theta0 = model.intercept_
theta1 = model.coef_[0]
T = 0.5
db = (1/theta1)*(-np.log(1/T - 1) - theta0)
# Visualize the classified data
sns.rugplot(data=games, x="GOAL_DIFF", hue="Predicted Class")
plt.scatter(x=[db], y=[0.005], c="k", s=100)
plt.ylim(0, 0.1);
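Where does the formula for the decision boundary come from? As a quick derivation (added here for reference), set the predicted probability equal to the threshold $T$ and solve for $x$:

$$ \sigma(\theta_0 + \theta_1 x) = T \implies \theta_0 + \theta_1 x = -\log\left(\frac{1}{T} - 1\right) \implies x = \frac{1}{\theta_1}\left(-\log\left(\frac{1}{T} - 1\right) - \theta_0\right) $$

which is exactly the expression computed for `db` above.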
We can repeat this process with a model with two features: "AST"
and "GOAL_DIFF"
. Now, we express the decision boundary in terms of both features.
X_two_feature = games[["GOAL_DIFF", "AST"]]
Y = games["WON"]
two_feature_model = lm.LogisticRegression()
two_feature_model.fit(X_two_feature, Y)
# Express the decision boundary with AST as a function of GOAL_DIFF
theta0 = two_feature_model.intercept_
theta1, theta2 = two_feature_model.coef_[0]
T = 0.5
db = lambda goal_diff: (1/theta2)*(-np.log(1/T - 1) - theta1*goal_diff - theta0)
games["Predicted Class Two Features"] = two_feature_model.predict(X_two_feature)
sns.scatterplot(data=games, x="GOAL_DIFF", y="AST", hue="Predicted Class Two Features")
plt.plot(x, db(x), "k");
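The black line plotted above is the same idea in two dimensions (a short derivation, added for reference): setting $\sigma(\theta_0 + \theta_1 \cdot \text{GOAL\_DIFF} + \theta_2 \cdot \text{AST}) = T$ and solving for AST gives

$$ \text{AST} = \frac{1}{\theta_2}\left(-\log\left(\frac{1}{T} - 1\right) - \theta_1 \cdot \text{GOAL\_DIFF} - \theta_0\right) $$

which matches the `db` lambda defined above.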
# This time, visualize the true classes
sns.scatterplot(data=games, x="GOAL_DIFF", y="AST", hue="WON")
plt.plot(x, db(x), "k");
A linearly separable dataset is one that can be perfectly separated into two classes by a hyperplane among the input features. A hyperplane is a decision boundary extended to arbitrarily many dimensions (for example, a model with three features has a 2D plane in its 3D feature space as its decision boundary).
This dataset is linearly separable:
iris = sns.load_dataset('iris')
plt.figure(figsize=(6, 4))
# Separable
sns.scatterplot(data = iris[iris['species'] != 'virginica'],
x = 'petal_length',
y = 'petal_width',
hue = 'species', s=100);
plt.gca().legend_.set_title(None)
And this dataset is not.
# Not separable
plt.figure(figsize=(6, 4))
sns.scatterplot(data = iris[iris['species'] != 'setosa'],
x = 'petal_length',
y = 'petal_width',
palette=sns.color_palette()[1:3],
hue = 'species', s=100);
plt.gca().legend_.set_title(None)
When our data is linearly separable, we run the risk of diverging weights as the model attempts to reduce cross-entropy loss to 0.
To see why, consider the following artificially-generated "toy" dataset.
toy_df = pd.DataFrame({"x": [-1, 1], "y": [0, 1]})
sns.scatterplot(data=toy_df, x='x', y='y', hue="y", s=100, legend=None);
Let's look at the mean cross-entropy loss surface for this toy dataset, and a single feature model $\hat{y} = \sigma(\theta x)$.
For this situation, our logistic regression model takes the form:
$$ \Large \hat{P}_{\theta}(Y = 1 | x) = \sigma(\theta_1 x) = \frac{1}{1 + e^{-\theta_1 x}} $$

With mean cross-entropy loss:

\begin{align} \hat{\theta} &= \underset{\theta}{\operatorname{argmin}} - \frac{1}{n} \sum_{i=1}^n \left( y_i \log (\sigma(\theta_1 x_i)) + (1 - y_i) \log (1 - \sigma(\theta_1 x_i)) \right) \\ &= \underset{\theta}{\operatorname{argmin}} -\frac{1}{2} \left[ \log (1 - \sigma(-\theta_1)) + \log (\sigma(\theta_1)) \right] \\ &= \underset{\theta}{\operatorname{argmin}} - \log (\sigma(\theta_1)) \end{align}

def toy_model(theta1, x):
    return 1/(1 + np.exp(-theta1 * x))
def mean_cross_entropy_loss_toy(theta1):
    # Here we use 1 - sigma(z) = sigma(-z) to improve numerical stability
    return - np.mean(toy_df['y'] * np.log(toy_model(theta1, toy_df['x'])) + \
                     (1-toy_df['y']) * np.log(toy_model(theta1, -toy_df['x'])))
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [mean_cross_entropy_loss_toy(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross Entropy Loss($\theta$)')
plt.xlabel(r'$\theta$');
plt.title("Mean Cross Entropy Loss Surface");
It's hard to see, but the "flat" region to the right actually tilts downwards! Take a look at the last 10 losses we computed – they all decrease slightly one after the other.
[mean_cross_entropy_loss_toy(theta) for theta in thetas][-10:]
If left unchecked, the logistic regression model will attempt to use infinite values as the "optimal" model parameters. We describe this phenomenon as the model weights "diverging". Gradient descent will never converge, and our model becomes impractical.
We can use regularization to restrict how large the model parameters can be.
def regularized_loss_toy(theta1, reg):
return mean_cross_entropy_loss_toy(theta1) + reg * theta1**2
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [regularized_loss_toy(theta, 0.1) for theta in thetas], color = 'green')
plt.ylabel(r'MCE($\theta$) + 0.1 $\theta^2$')
plt.xlabel(r'$\theta$');
plt.title(r"Mean Loss + L2 Regularization ($\lambda$ = 0.1)");
Much better!
By default, sklearn
's LogisticRegression
applies regularization for us. Phew.
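The strength of that regularization is controlled by the C hyperparameter, which is the inverse of the regularization strength: a smaller C means a stronger penalty on large weights (the default is C=1.0 with an L2 penalty). A minimal sketch of adjusting it (the variable name here is just for illustration):
# Stronger L2 regularization than the default (C is the inverse of the regularization strength)
stronger_reg_model = lm.LogisticRegression(C=0.1)
stronger_reg_model.fit(X, Y)
# With a stronger penalty, the fitted coefficient is pulled closer to 0 than the default model's
stronger_reg_model.coef_, model.coef_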
Let's return to our games
data. We'll compute the accuracy of our model
on this data.
def accuracy(X, Y):
return np.mean(model.predict(X) == Y)
accuracy(X, Y)
As per usual, scikit-learn
can do this for us. The .score
method of a LogisticRegression
classifier gives us its accuracy.
model.score(X, Y)
Our good old friend scikit-learn
has a built-in confusion_matrix function (of course it does).
from sklearn.metrics import confusion_matrix
# Be careful – confusion_matrix takes in y_true as the first parameter and y_pred as the second.
# Don't mix these up!
cm = confusion_matrix(Y, model.predict(X))
cm
cm = confusion_matrix(Y, model.predict(X))
plt.figure(figsize=(4,4))
sns.heatmap(cm, annot=True, fmt = 'd', cmap = 'Blues', annot_kws = {'size': 16})
plt.xlabel('Predicted')
plt.ylabel('Actual');
We can also compute the number of TP, TN, FP, and FN for our classifier, and hence its precision and recall.
Y_hat = model.predict(X)
tp = np.sum((Y_hat == 1) & (Y == 1))
tn = np.sum((Y_hat == 0) & (Y == 0))
fp = np.sum((Y_hat == 1) & (Y == 0))
fn = np.sum((Y_hat == 0) & (Y == 1))
tp, tn, fp, fn
These numbers match what we see in the confusion matrix above.
precision = tp / (tp + fp)
precision
recall = tp / (tp + fn)
recall
The TP, TN, FP, and FN we just calculated also allow us to compute the true and false positive rates. Recall (haha) that the true positive rate is the same as recall.
fpr = fp/(fp + tn)
fpr
tpr = tp/(tp + fn)
tpr
It's important to remember that these values are all for the threshold of $T = 0.5$, which is scikit-learn
's default.
Let's remind ourselves of our original games
data.
sns.stripplot(data=games, x="GOAL_DIFF", y="WON", orient="h", order=[1, 0], color=sns.color_palette()[0]);
Before, we used a threshold of 0.5 in our decision rule: if the predicted probability was greater than or equal to 0.5, we predicted Class 1; otherwise, we predicted Class 0.
What happens if we change the threshold? Below, we apply a threshold of $T=0.25$.
model = lm.LogisticRegression()
model.fit(X, Y)
p = model.predict_proba(X)[:, 1]
# Predict Class 1 if the predicted probability >= 0.25
y_hat = (p >= 0.25).astype(int)
theta0 = model.intercept_
theta1 = model.coef_[0]
decision_boundary_T25 = (1/theta1)*(-np.log(1/0.25 - 1) - theta0)
x = np.linspace(-0.3, 0.3)
sns.stripplot(x=games["GOAL_DIFF"], y=games["WON"], hue=y_hat, orient="h")
plt.plot(x, sigmoid(model.intercept_ + model.coef_[0]*x), "k", label="P(Y=1|x)")
plt.gca().invert_yaxis()
plt.annotate(f"Decision Boundary:\nx = {np.round(decision_boundary_T25[0], 3)}", (-0.3, 0.5));
When we lower the threshold, we require a lower predicted probability before we predict Class 1. We can think of this as us telling our model that it needs to be less "confident" about a datapoint being Class 1 before making a positive prediction. The total number of datapoints predicted to be Class 1 increases.
The converse happens if we raise the threshold. Consider setting $T=0.75$. Now, we require a higher predicted probability before we predict Class 1. The total number of datapoints predicted to be Class 1 decreases.
# Predict Class 1 if the predicted probability >= 0.75
y_hat = (p >= 0.75).astype(int)
decision_boundary_T75 = (1/theta1)*(-np.log(1/0.75 - 1) - theta0)
x = np.linspace(-0.3, 0.3)
sns.stripplot(x=games["GOAL_DIFF"], y=games["WON"], hue=y_hat, orient="h")
plt.plot(x, sigmoid(model.intercept_ + model.coef_[0]*x), "k", label="P(Y=1|x)")
plt.gca().invert_yaxis()
plt.annotate(f"Decision Boundary:\nx = {np.round(decision_boundary_T75[0], 3)}", (0.1, 0.5));
How does changing the threshold impact our performance metrics?
Let's run an experiment: we'll test out several different possible thresholds. For each threshold $T$, we'll make a decision rule where we classify any point with a predicted probability equal to or greater than $T$ as being in Class 1. Otherwise, we'll predict Class 0. We'll then compute the overall accuracy of the classifier when using that threshold.
# Define performance metrics dependent on the threshold value
def predict_threshold(model, X, T):
prob_one = model.predict_proba(X)[:, 1]
return (prob_one >= T).astype(int)
def accuracy_threshold(X, Y, T):
return np.mean(predict_threshold(model, X, T) == Y)
def precision_threshold(X, Y, T):
Y_hat = predict_threshold(model, X, T)
return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y_hat == 1)
def recall_threshold(X, Y, T):
Y_hat = predict_threshold(model, X, T)
return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)
def tpr_threshold(X, Y, T): # Same as recall
Y_hat = predict_threshold(model, X, T)
return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y == 1)
def fpr_threshold(X, Y, T):
Y_hat = predict_threshold(model, X, T)
return np.sum((Y_hat == 1) & (Y == 0)) / np.sum(Y == 0)
# Compute accuracies for different thresholds
thresholds = np.linspace(0, 1, 100)
accs = [accuracy_threshold(X, Y, t) for t in thresholds]
plt.plot(thresholds, accs)
plt.xlabel("Threshold")
plt.ylabel("Accuracy");
# The threshold that maximizes accuracy
thresholds[np.argmax(accs)]
It turns out that setting $T=0.5$ does not always result in the best performance! Part of the model design process for classification includes choosing an appropriate threshold value.
In lecture, we noted that there is a tradeoff between precision and recall.
Precision $=\frac{TP}{TP+FP}$ increases as the number of false positives decreases, which occurs as the threshold is raised.
Recall $=\frac{TP}{TP+FN}$ increases as the number of false negatives decreases, which occurs as the threshold is lowered.
We want to keep both precision and recall high. To do so, we'll need to strategically choose a threshold value.
precisions = [precision_threshold(X, Y, t) for t in thresholds]
recalls = [recall_threshold(X, Y, t) for t in thresholds]
plt.plot(thresholds, precisions, label="Precision")
plt.plot(thresholds, recalls, label="Recall")
plt.xlabel("Threshold")
plt.ylabel("Precision/Recall")
plt.legend();
A precision-recall curve tests out many possible thresholds. Each point on the curve represents the precision and recall of the classifier for a particular choice of threshold.
We choose a threshold value that keeps both precision and recall high (usually the point closest to the upper-right "corner" of the curve).
plt.plot(recalls, precisions)
plt.xlabel("Recall")
plt.ylabel("Precision");
We can repeat a similar experiment for the FPR and TPR. Remember that we want to keep FPR low and TPR high.
tprs = [tpr_threshold(X, Y, t) for t in thresholds]
fprs = [fpr_threshold(X, Y, t) for t in thresholds]
plt.plot(thresholds, tprs, label="True Positive Rate")
plt.plot(thresholds, fprs, label="False Positive Rate")
plt.xlabel("Threshold")
plt.ylabel("TPR/FPR")
plt.legend();
A ROC curve tests many possible decision rule thresholds. For each possible threshold, it plots the corresponding false positive rate and true positive rate of the classifier.
"ROC" stands for "Receiver Operating Characteristic". It comes from the field of signal processing.
plt.plot(fprs, tprs)
plt.xlabel("FPR")
plt.ylabel("TPR");
Ideally, a perfect classifier would have an FPR of 0 and a TPR of 1, so the area under its ROC curve is 1.
We often use the area under the ROC curve (abbreviated "AUC") as an indicator of model performance. The closer the AUC is to 1, the better.
plt.plot(fprs, tprs, label="Our classifier")
plt.plot([0, 0, 1], [0, 1, 1], label="Perfect Classifier")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend();
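scikit-learn can also compute the AUC for us directly from the true labels and the predicted probabilities (shown here as a supplement):
from sklearn.metrics import roc_auc_score
# AUC of our classifier: 1.0 is a perfect classifier, 0.5 is no better than random guessing
roc_auc_score(Y, model.predict_proba(X)[:, 1])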