by Suraj Rampure
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from scipy.optimize import minimize
import sklearn.linear_model as lm
# Apply seaborn's default theme FIRST. sns.set() rewrites a number of
# matplotlib rcParams (line width among them), so calling it after the
# assignments below would silently discard those customisations.
sns.set()

# Notebook-wide matplotlib defaults, layered on top of the seaborn theme.
plt.rcParams['figure.figsize'] = (4, 4)   # small square figures
plt.rcParams['figure.dpi'] = 150          # render at a higher resolution
plt.rcParams['lines.linewidth'] = 3       # thicker lines than the theme default
So far, our logistic regression model predicts probabilities. But we originally set out on a mission to create a classifier. How can we use our predicted probabilities to create classifications?
Let's get back the NBA data we had last time.
# Load the raw game log. The code below treats the first and last row of each
# GAME_ID as the two teams of that game (presumably two rows per game -- verify
# against the CSV).
df = pd.read_csv('nba.csv')

# Encode the outcome once, with an explicit mapping: 1 = win, 0 = loss.
# Chained .replace() calls on a string column rely on pandas silently
# downcasting the result to a numeric dtype, which is deprecated; .map()
# yields a numeric column directly (missing WL values stay NaN).
df["WON"] = df["WL"].map({"W": 1, "L": 0})

# One row per game for "our" team and one for its opponent.
# NOTE: groupby().first()/.last() pick the first/last non-null value
# per column within each group.
one_team = df.groupby("GAME_ID").first()
opponent = df.groupby("GAME_ID").last()

# Put both teams side by side; opponent columns get an "_OPP" suffix.
games = one_team.merge(
    opponent,
    left_index=True,
    right_index=True,
    suffixes=("", "_OPP"),
)

# Features are differences (team minus opponent) in field-goal percentage
# and in the PF column.
games["FG_PCT_DIFF"] = games["FG_PCT"] - games["FG_PCT_OPP"]
games["PF_DIFF"] = games["PF"] - games["PF_OPP"]

# games["WON"] is already the first team's outcome (carried over from df via
# one_team), so it does not need to be re-derived from WL here.
games = games[['TEAM_NAME', 'MATCHUP', 'WON', 'FG_PCT_DIFF', 'PF_DIFF']]
games.head()
| GAME_ID | TEAM_NAME | MATCHUP | WON | FG_PCT_DIFF | PF_DIFF |
|---|---|---|---|---|---|
| 21700001 | Boston Celtics | BOS @ CLE | 0 | -0.049 | -1 |
| 21700002 | Golden State Warriors | GSW vs. HOU | 0 | 0.053 | 9 |
| 21700003 | Charlotte Hornets | CHA @ DET | 0 | -0.030 | -6 |
| 21700004 | Indiana Pacers | IND vs. BKN | 1 | 0.041 | 0 |
| 21700005 | Orlando Magic | ORL vs. MIA | 1 | 0.042 | -2 |
Let's call this model `basic_model`, since it only has one feature. (Eventually, we will use more features.)
It is the same model we fit in the last lecture.
# Logistic regression with no regularization and no intercept term, using
# FG_PCT_DIFF as the only feature and WON as the target.
# NOTE(review): newer scikit-learn releases deprecate the string 'none' in
# favour of penalty=None -- confirm against the installed version before
# upgrading.
basic_model = lm.LogisticRegression(
    solver='lbfgs',
    fit_intercept=False,
    penalty='none',
)
basic_model.fit(X=games[['FG_PCT_DIFF']], y=games['WON'])
LogisticRegression(fit_intercept=False, penalty='none')
As before, we can use `.predict_proba` to get the predicted probabilities for each class under our logistic regression model.
# Predicted class probabilities for every game: one row per game, one column
# per class.
basic_model.predict_proba(X=games[['FG_PCT_DIFF']])
array([[0.81733506, 0.18266494], [0.16510648, 0.83489352], [0.71450899, 0.28549101], ..., [0.37288695, 0.62711305], [0.94003495, 0.05996505], [0.95102413, 0.04897587]])
We can plot our model, too:
# Sort the feature values so the fitted curve is drawn left to right, and
# shape them as a single-column 2-D array, which is what predict_proba expects.
x_sorted = games['FG_PCT_DIFF'].sort_values().to_numpy().reshape(-1, 1)

# Column 1 of predict_proba is used as the curve's height.
basic_model_ps_sorted = basic_model.predict_proba(x_sorted)[:, 1]

# Raw observations: each game's FG_PCT_DIFF against its WON value.
points = go.Scatter(
    x=games['FG_PCT_DIFF'],
    y=games['WON'],
    name='true observations',
    mode='markers',
    marker=dict(opacity=0.5),  # semi-transparent so overlapping points show
)

# Fitted curve: predicted probability as a function of FG_PCT_DIFF.
lr_line = go.Scatter(
    x=x_sorted[:, 0],
    y=basic_model_ps_sorted,
    name='Logistic Regression model',
)

fig = go.Figure(data=[points, lr_line])
fig