by Lisa Yan
Notebook credits:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
Let's load in a new dataset. This is aggregate per-player data from the 2018-19 NBA season, downloaded from Kaggle.
nba = pd.read_csv('nba18-19.csv', index_col=0)
nba.index.name = None # drops name of index (players are ordered by rank)
nba
| | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | FG% | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Álex Abrines\abrinal01 | SG | 25 | OKC | 31 | 2 | 19.0 | 1.8 | 5.1 | 0.357 | ... | 0.923 | 0.2 | 1.4 | 1.5 | 0.6 | 0.5 | 0.2 | 0.5 | 1.7 | 5.3 |
2 | Quincy Acy\acyqu01 | PF | 28 | PHO | 10 | 0 | 12.3 | 0.4 | 1.8 | 0.222 | ... | 0.700 | 0.3 | 2.2 | 2.5 | 0.8 | 0.1 | 0.4 | 0.4 | 2.4 | 1.7 |
3 | Jaylen Adams\adamsja01 | PG | 22 | ATL | 34 | 1 | 12.6 | 1.1 | 3.2 | 0.345 | ... | 0.778 | 0.3 | 1.4 | 1.8 | 1.9 | 0.4 | 0.1 | 0.8 | 1.3 | 3.2 |
4 | Steven Adams\adamsst01 | C | 25 | OKC | 80 | 80 | 33.4 | 6.0 | 10.1 | 0.595 | ... | 0.500 | 4.9 | 4.6 | 9.5 | 1.6 | 1.5 | 1.0 | 1.7 | 2.6 | 13.9 |
5 | Bam Adebayo\adebaba01 | C | 21 | MIA | 82 | 28 | 23.3 | 3.4 | 5.9 | 0.576 | ... | 0.735 | 2.0 | 5.3 | 7.3 | 2.2 | 0.9 | 0.8 | 1.5 | 2.5 | 8.9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
528 | Tyler Zeller\zellety01 | C | 29 | MEM | 4 | 1 | 20.5 | 4.0 | 7.0 | 0.571 | ... | 0.778 | 2.3 | 2.3 | 4.5 | 0.8 | 0.3 | 0.8 | 1.0 | 4.0 | 11.5 |
529 | Ante Žižić\zizican01 | C | 22 | CLE | 59 | 25 | 18.3 | 3.1 | 5.6 | 0.553 | ... | 0.705 | 1.8 | 3.6 | 5.4 | 0.9 | 0.2 | 0.4 | 1.0 | 1.9 | 7.8 |
530 | Ivica Zubac\zubaciv01 | C | 21 | TOT | 59 | 37 | 17.6 | 3.6 | 6.4 | 0.559 | ... | 0.802 | 1.9 | 4.2 | 6.1 | 1.1 | 0.2 | 0.9 | 1.2 | 2.3 | 8.9 |
530 | Ivica Zubac\zubaciv01 | C | 21 | LAL | 33 | 12 | 15.6 | 3.4 | 5.8 | 0.580 | ... | 0.864 | 1.6 | 3.3 | 4.9 | 0.8 | 0.1 | 0.8 | 1.0 | 2.2 | 8.5 |
530 | Ivica Zubac\zubaciv01 | C | 21 | LAC | 26 | 25 | 20.2 | 3.8 | 7.2 | 0.538 | ... | 0.733 | 2.3 | 5.3 | 7.7 | 1.5 | 0.4 | 0.9 | 1.4 | 2.5 | 9.4 |
708 rows × 29 columns
We are interested in predicting the number of points (PTS) an athlete will score per game this season.
Suppose we want to fit a linear model using some characteristics, or features, of a player. Specifically, we'll focus on field goals, assists, and 3-point attempts:

- FG, the number of field goals made per game (this counts both 2-point and 3-point baskets),
- AST, the average number of assists per game, and
- 3PA, the number of 3-point field goals attempted per game.

This is how we express that model:
$$ \hat{y} = \theta_0 + \theta_1 \cdot \textbf{FG} + \theta_2 \cdot \textbf{AST} + \theta_3 \cdot \textbf{3PA} $$

nba[['FG', 'AST', '3PA', 'PTS']]
| | FG | AST | 3PA | PTS |
|---|---|---|---|---|
1 | 1.8 | 0.6 | 4.1 | 5.3 |
2 | 0.4 | 0.8 | 1.5 | 1.7 |
3 | 1.1 | 1.9 | 2.2 | 3.2 |
4 | 6.0 | 1.6 | 0.0 | 13.9 |
5 | 3.4 | 2.2 | 0.2 | 8.9 |
... | ... | ... | ... | ... |
528 | 4.0 | 0.8 | 0.0 | 11.5 |
529 | 3.1 | 0.9 | 0.0 | 7.8 |
530 | 3.6 | 1.1 | 0.0 | 8.9 |
530 | 3.4 | 0.8 | 0.0 | 8.5 |
530 | 3.8 | 1.5 | 0.0 | 9.4 |
708 rows × 4 columns
In lecture, we saw that the predictions for the entire data set $\hat{\mathbb{Y}}$ can be computed as:

$$ \Large \hat{\mathbb{Y}} = \mathbb{X}\theta $$
The design matrix $\mathbb{X} \in \mathbb{R}^{n \times (d+1)}$ has $n$ rows, one for each record in the dataset, and $d+1$ columns: the original $d$ feature columns plus an additional column of 1s (the bias/intercept feature).
Let's build this design matrix using Pandas:
X = nba[['FG', 'AST', '3PA']]
X.insert(0, 'Bias', 1)
X = X.to_numpy()
X
array([[1. , 1.8, 0.6, 4.1], [1. , 0.4, 0.8, 1.5], [1. , 1.1, 1.9, 2.2], ..., [1. , 3.6, 1.1, 0. ], [1. , 3.4, 0.8, 0. ], [1. , 3.8, 1.5, 0. ]])
X.shape
(708, 4)
While we're at it, let's build the Y vector of our true PTS values.
# for nba data
Y = nba[["PTS"]].to_numpy()
n = len(Y)
print("number datapoints", n)
number datapoints 708
Suppose we decide to pick an arbitrary parameter $\theta$:

$$\theta = \begin{bmatrix}0.50 \\ -1.14 \\ 0.65 \\ 1.52 \end{bmatrix}$$

(For those interested: I drew these from random values simulated from a standard normal distribution using np.random.randn.)
theta_arbitrary = np.array([[0.5], [-1.14], [0.65], [1.52]])
theta_arbitrary
array([[ 0.5 ], [-1.14], [ 0.65], [ 1.52]])
For this value of $\theta$ we can make a prediction using our model with matrix multiplication.

The @ symbol is the matrix multiplication operator and is equivalent to writing X.dot(theta_arbitrary).
X @ theta_arbitrary
[[ 5.0700e+00] [ 2.8440e+00] [ 3.8250e+00] [-5.3000e+00] [-1.6420e+00] ... [-2.4490e+00] [-2.8890e+00] [-2.8560e+00] [-2.8570e+00]]
For Ordinary Least Squares, the average loss is MSE:
$$ \Large R(\theta) = \frac{1}{n} || \mathbb{Y} - \mathbb{X}\theta||^2_2 $$

NumPy has a handy function np.linalg.norm that computes the norm of an array (for a vector, the default is the L2 norm; for a 2-D array like our $n \times 1$ residual, the default Frobenius norm gives the same value).
def mse_nba(theta):
"""
Y is PTS
X is intercept, FG, AST, 3PA
"""
return (1/n) * (np.linalg.norm(Y - X @ theta) ** 2)
mse_nba(theta_arbitrary)
76.52265605508474
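As a quick sanity check, the squared L2 norm divided by $n$ is just the average of the squared residuals, so the two ways of computing MSE should agree. A minimal sketch, reusing the X, Y, and theta_arbitrary defined above:
# The norm-based MSE should equal the elementwise mean of the squared residuals.
residuals = Y - X @ theta_arbitrary              # shape (n, 1)
np.isclose(np.mean(residuals ** 2), mse_nba(theta_arbitrary))  # expected: True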
Is this good? Is this bad? Let's compute the optimal theta and compare!
From lecture, the Least Squares Estimate $\hat{\theta}$ is: $$ \Large \hat{\theta} = \left( \mathbb{X}^T \mathbb{X} \right)^{-1} \mathbb{X}^T \mathbb{Y} $$
While it is not as numerically stable or efficient as other methods, we can compute $\hat{\theta}$ directly using matrix inversion. To do this, we import the inv function from the NumPy linear algebra library:
from numpy.linalg import inv
def least_squares_estimate(X, Y):
return inv(X.T @ X) @ X.T @ Y
theta_hat = least_squares_estimate(X, Y)
theta_hat
array([[-0.29253798], [ 2.51705703], [ 0.05075571], [ 0.31307653]])
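As an aside, explicitly inverting $\mathbb{X}^T \mathbb{X}$ is not the most numerically stable or efficient route. A sketch of two standard NumPy alternatives, both of which should agree with theta_hat up to floating point error: np.linalg.solve applied to the normal equations, and np.linalg.lstsq applied to the original system.
# Solve the normal equations (X^T X) theta = X^T Y without forming an explicit inverse.
theta_solve = np.linalg.solve(X.T @ X, X.T @ Y)

# Or let NumPy's least squares solver work on X theta ≈ Y directly.
theta_lstsq = np.linalg.lstsq(X, Y, rcond=None)[0]

print(np.allclose(theta_solve, theta_hat), np.allclose(theta_lstsq, theta_hat))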
print("Arbitrary theta MSE: ", mse_nba(theta_arbitrary))
print("Optimal theta MSE:" , mse_nba(theta_hat))
Arbitrary theta MSE: 76.52265605508474
Optimal theta MSE: 0.3963133329080335
Nice!!!
Now that we have estimated the model parameters $\hat{\theta}$, we can also predict the points scored $\hat{\mathbb{Y}}$ for each of our players.
Y_hat = X @ theta_hat
In previous lectures we have plotted the residual vs. our single input feature $x$.
For higher dimensional regression models, we often graph the residual with respect to the fitted values $\hat{y}$.
fig = px.scatter(x = Y_hat.flatten(), y = (Y - Y_hat).flatten(), opacity=0.2)
fig.add_trace(go.Scatter(x=[0, 30], y=[0,0], name="Y_hat=Y"))
fig.update_xaxes(title="Y_hat")
fig.update_yaxes(title="Y - Y_hat")
fig
Overall, while the residuals are roughly centered around 0 ($\hat{y} = y$), we see heteroskedasticity: the spread of the residuals is uneven, and it grows as the predictions get larger.
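To put a rough number on that unevenness, one option is to bin players by their fitted values and compare the residual spread within each bin. A minimal sketch (the choice of 5 equal-width bins is arbitrary):
# Residual spread within bins of the fitted values; under heteroskedasticity,
# the per-bin standard deviation should grow as Y_hat grows.
resid_df = pd.DataFrame({
    "Y_hat": Y_hat.flatten(),
    "residual": (Y - Y_hat).flatten(),
})
resid_df["bin"] = pd.cut(resid_df["Y_hat"], bins=5)
resid_df.groupby("bin")["residual"].std()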
Let's compute the coefficient of determination, or multiple $R^2$, for our model.
r2_ast_fg_3pa = np.var(Y_hat) / np.var(Y)
r2_ast_fg_3pa
0.9883162128703274
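For a least squares fit that includes an intercept, this variance ratio is equivalent to $1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$. A quick sketch checking that the two forms agree on our fit:
# Multiple R^2 as 1 - (residual sum of squares) / (total sum of squares).
rss = np.sum((Y - Y_hat) ** 2)
tss = np.sum((Y - np.mean(Y)) ** 2)
np.isclose(1 - rss / tss, r2_ast_fg_3pa)  # expected: True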
That's super high!!! Wait, what's up with that?
(Hint: Basketball facts)
Let's try reducing the number of features to see how this Multiple $R^2$ metric changes.
# use intercept, ast, 3pa
X_3d = nba[['AST', '3PA']]
X_3d.insert(0, 'Bias', 1)
X_3d = X_3d.to_numpy()
theta_ast_3pa = least_squares_estimate(X_3d, Y)
Y_hat_ast_3pa = X_3d @ theta_ast_3pa
r2_ast_3pa = np.var(Y_hat_ast_3pa) / np.var(Y)
r2_ast_3pa
0.608786276366571
# use intercept, ast only (SLR)
X_slr = nba[['AST']]
X_slr.insert(0, 'Bias', 1)
X_slr = X_slr.to_numpy()
theta_ast_only = least_squares_estimate(X_slr, Y)
Y_hat_ast_only = X_slr @ theta_ast_only
r2_ast_only = np.var(Y_hat_ast_only) / np.var(Y)
r2_ast_only
0.4570055507968593
Comparing these Multiple $R^2$ together:
print("(SLR) interecept, AST: ", r2_ast_only)
print("intercept, 3PA, AST: ", r2_ast_3pa)
print("intercept, FG, 3PA, AST: ", r2_ast_fg_3pa)
(SLR) intercept, AST: 0.4570055507968593
intercept, 3PA, AST: 0.608786276366571
intercept, FG, 3PA, AST: 0.9883162128703274
Because of how basketball is scored, knowing a player's field goals FG and 3-point attempts 3PA will reliably tell you how many total points they scored per game. This assumes NBA players make a good number of their 3-pointers.
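You can check this basketball fact against the data itself: in the Basketball Reference convention, FG counts every made field goal (2s and 3s), so points decompose as $\text{PTS} = 2 \cdot \text{FG} + \text{3P} + \text{FT}$. A sketch of that check, assuming this Kaggle export also carries the standard 3P (3-pointers made) and FT (free throws made) columns:
# PTS should equal 2*FG + 3P + FT, up to rounding of the per-game averages.
# Assumes '3P' and 'FT' columns exist in this export (standard Basketball Reference names).
reconstructed_pts = 2 * nba['FG'] + nba['3P'] + nba['FT']
np.allclose(reconstructed_pts, nba['PTS'], atol=0.3)  # loose tolerance for rounding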
Side note: if you wanted to check that the multiple $R^2$ for Simple Linear Regression is indeed the squared correlation coefficient $r^2$:
r = np.corrcoef(nba['AST'], nba['PTS'])[0,1]
r ** 2
0.4570055507968595
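The two values agree up to floating point error; a one-line check:
np.isclose(r ** 2, r2_ast_only)  # expected: True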