Lecture 12 – Data 100, Fall 2023¶


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Multiple Linear Regression¶

Let's load in a new dataset. This is aggregate per-player data from the 2018-19 NBA season, downloaded from Kaggle.

In [2]:
nba = pd.read_csv('data/nba18-19.csv', index_col=0)
nba.index.name = None # Drops name of index (players are ordered by rank)
In [3]:
nba.head(5)
Out[3]:
Player Pos Age Tm G GS MP FG FGA FG% ... FT% ORB DRB TRB AST STL BLK TOV PF PTS
1 Álex Abrines\abrinal01 SG 25 OKC 31 2 19.0 1.8 5.1 0.357 ... 0.923 0.2 1.4 1.5 0.6 0.5 0.2 0.5 1.7 5.3
2 Quincy Acy\acyqu01 PF 28 PHO 10 0 12.3 0.4 1.8 0.222 ... 0.700 0.3 2.2 2.5 0.8 0.1 0.4 0.4 2.4 1.7
3 Jaylen Adams\adamsja01 PG 22 ATL 34 1 12.6 1.1 3.2 0.345 ... 0.778 0.3 1.4 1.8 1.9 0.4 0.1 0.8 1.3 3.2
4 Steven Adams\adamsst01 C 25 OKC 80 80 33.4 6.0 10.1 0.595 ... 0.500 4.9 4.6 9.5 1.6 1.5 1.0 1.7 2.6 13.9
5 Bam Adebayo\adebaba01 C 21 MIA 82 28 23.3 3.4 5.9 0.576 ... 0.735 2.0 5.3 7.3 2.2 0.9 0.8 1.5 2.5 8.9

5 rows × 29 columns


We are interested in predicting the number of points (`PTS`) a player scores per game.

Suppose we want to fit a linear model using some characteristics, or features, of a player. Specifically, we'll focus on field goals, assists, and 3-point attempts:

  • FG, the number of (2-point) field goals per game
  • AST, the average number of assists per game, and
  • 3PA, the number of 3-point field goals attempted per game

This is how we express that model:

$$ \hat{y} = \theta_0 + \theta_1 \cdot \textbf{FG} + \theta_2 \cdot \textbf{AST} + \theta_3 \cdot \textbf{3PA} $$
In [4]:
nba[['FG', 'AST', '3PA', 'PTS']]
Out[4]:
FG AST 3PA PTS
1 1.8 0.6 4.1 5.3
2 0.4 0.8 1.5 1.7
3 1.1 1.9 2.2 3.2
4 6.0 1.6 0.0 13.9
5 3.4 2.2 0.2 8.9
... ... ... ... ...
528 4.0 0.8 0.0 11.5
529 3.1 0.9 0.0 7.8
530 3.6 1.1 0.0 8.9
530 3.4 0.8 0.0 8.5
530 3.8 1.5 0.0 9.4

708 rows × 4 columns



In lecture, we saw that the predictions for the entire data set $\hat{\mathbb{Y}}$ can be computed as:

$$ \Large \hat{\mathbb{Y}} = \mathbb{X} \theta $$

The design matrix $\mathbb{X} \in \mathbb{R}^{n \times (d+1)}$ has $n$ rows, one for each record in the dataset, and $d+1$ columns: the original $d$ feature columns plus an additional column of 1s (the bias/intercept feature).

Let's build this design matrix using Pandas:

In [5]:
X = nba[['FG', 'AST', '3PA']]
X.insert(0, 'Bias', 1)
X
Out[5]:
Bias FG AST 3PA
1 1 1.8 0.6 4.1
2 1 0.4 0.8 1.5
3 1 1.1 1.9 2.2
4 1 6.0 1.6 0.0
5 1 3.4 2.2 0.2
... ... ... ... ...
528 1 4.0 0.8 0.0
529 1 3.1 0.9 0.0
530 1 3.6 1.1 0.0
530 1 3.4 0.8 0.0
530 1 3.8 1.5 0.0

708 rows × 4 columns

In [6]:
X = X.to_numpy()
X.shape
Out[6]:
(708, 4)
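
Equivalently, we could have built the design matrix directly in NumPy by stacking a column of 1s next to the feature columns. This is only a sketch of an alternative (X_alt is an illustrative name, not used elsewhere):

# Alternative sketch: build the design matrix with NumPy instead of Pandas
features = nba[['FG', 'AST', '3PA']].to_numpy()               # (708, 3) feature columns
X_alt = np.hstack([np.ones((len(features), 1)), features])    # (708, 4): bias column + features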

While we're at it, let's build the $\mathbb{Y}$ vector of our true PTS values.

In [7]:
# For nba data
Y = nba[["PTS"]].to_numpy()
n = len(Y)
print("number datapoints", n)
Y[:5]
number datapoints 708
Out[7]:
array([[ 5.3],
       [ 1.7],
       [ 3.2],
       [13.9],
       [ 8.9]])

Example prediction¶

Suppose we decide to pick an arbitrary parameter $\theta$:

$$\theta = \begin{bmatrix}0.50 \\ -1.14 \\ 0.65 \\ 1.52 \end{bmatrix}$$

(For those interested: these values were simulated from a standard normal distribution using np.random.randn.)

In [8]:
theta_arbitrary = np.array([[0.5], [-1.14], [0.65], [1.52]])
theta_arbitrary
Out[8]:
array([[ 0.5 ],
       [-1.14],
       [ 0.65],
       [ 1.52]])

For this value of $\theta$, we can make predictions with our model using matrix multiplication.

The @ symbol is the matrix multiplication operator and is equivalent to writing X.dot(theta_arbitrary).

In [9]:
display((X @ theta_arbitrary)[:5])

display(Y[:5])
array([[ 5.07 ],
       [ 2.844],
       [ 3.825],
       [-5.3  ],
       [-1.642]])
array([[ 5.3],
       [ 1.7],
       [ 3.2],
       [13.9],
       [ 8.9]])

Computing MSE¶

For Ordinary Least Squares, the average loss is MSE:

$$ \Large R(\theta) = \frac{1}{n} || \mathbb{Y} - \mathbb{X}\theta||^2_2 $$

NumPy has a handy function, np.linalg.norm, that computes the norm of an array (by default, the L2 norm for vectors and the Frobenius norm for matrices).

In [10]:
theta_arbitrary = np.array([0.5, -1.14, 0.65, 1.52])
def mse_nba(theta):
    """
    Y is PTS
    X is intercept, FG, AST, 3PA
    """
    return (1/n) * (np.linalg.norm(Y - X @ theta) ** 2)

mse_nba(theta_arbitrary)
Out[10]:
57637.9018454746
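
One shape subtlety worth flagging: Y was built as a (708, 1) column vector, while theta_arbitrary above is a flat 1-D array, so X @ theta_arbitrary has shape (708,) and Y - X @ theta broadcasts to a (708, 708) matrix rather than a vector of per-player residuals. A shape-safe variant, sketched below (mse_nba_flat is a hypothetical helper, not part of the lecture code), flattens both sides so the result is the same whether theta is a flat array or a column vector:

def mse_nba_flat(theta):
    """Shape-safe MSE: flatten Y and the predictions so residuals are per-player."""
    residuals = Y.ravel() - (X @ theta).ravel()   # shape (708,)
    return np.mean(residuals ** 2)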

Is this good? Is this bad? Let's compute the optimal theta and compare!


Implementing Least Squares¶

From lecture, the Least Squares Estimate $\hat{\theta}$ is: $$ \Large \hat{\theta} = \left( \mathbb{X}^{\top} \mathbb{X} \right)^{-1} \mathbb{X}^{\top} \mathbb{Y} $$

While it is not as numerically stable or efficient as other approaches, we can compute $\hat{\theta}$ directly using matrix inversion. To do this, we import the inv function from the NumPy linear algebra library:

In [11]:
from numpy.linalg import inv
In [12]:
def least_squares_estimate(X, Y):
    return inv(X.T @ X) @ X.T @ Y

theta_hat = least_squares_estimate(X, Y)
theta_hat
Out[12]:
array([[-0.29253798],
       [ 2.51705703],
       [ 0.05075571],
       [ 0.31307653]])
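
For reference, a more numerically stable route avoids forming an explicit inverse, either by solving the normal equations with np.linalg.solve or by calling np.linalg.lstsq on the design matrix. This is a sketch, not the lecture's implementation; both results should match theta_hat up to floating-point error:

# Sketch: solve the normal equations without an explicit inverse
theta_via_solve = np.linalg.solve(X.T @ X, X.T @ Y)

# Sketch: SVD-based least squares solver applied directly to the design matrix
theta_via_lstsq, residuals, rank, sv = np.linalg.lstsq(X, Y, rcond=None)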
In [13]:
print("Arbitrary theta MSE: ", mse_nba(theta_arbitrary))
print("Optimal theta MSE:"  , mse_nba(theta_hat))
Arbitrary theta MSE:  57637.9018454746
Optimal theta MSE: 0.3963133329080335

Nice!!!


Making Least Squares Predictions¶

Now that we have estimated the model parameters $\hat{\theta}$, we can predict the points scored $\hat{\mathbb{Y}}$ for each of our players.

In [14]:
Y_hat = X @ theta_hat
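
As a quick sanity check (a sketch, not part of the original cell), we can put the first few predictions next to the observed values:

# Sketch: column 1 is the true PTS, column 2 is the model's prediction
np.hstack([Y[:5], Y_hat[:5]])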


Model Performance/Diagnosing the Model¶

In previous lectures, we plotted the residuals against our single input feature $x$.

For higher-dimensional regression models, we instead often plot the residuals against the fitted (predicted) values $\hat{y}$.

In [15]:
sns.scatterplot(x=Y_hat.flatten(), y=(Y - Y_hat).flatten())
sns.lineplot(x=[-1, 35], y=[0, 0], label=r"$\hat{Y} = Y$", color='orange', linewidth=3)
plt.xlabel(r"$\hat{Y}$")
plt.ylabel(r"$Y - \hat{Y}$")  # residuals: true minus predicted
plt.xlim([0, 32])
plt.show()

Overall, the residuals are roughly centered around 0 (i.e., $\hat{y} \approx y$ on average), but we see heteroskedasticity: the spread of the residuals is uneven, growing noticeably as the predicted values get larger.

Multiple $R^2$¶

Let's compute the coefficient of determination, or multiple $R^2$, for our model.

In [16]:
r2_ast_fg_3pa = np.var(Y_hat) / np.var(Y)
r2_ast_fg_3pa
Out[16]:
0.9883162128703274
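
(As a quick cross-check, sketched here rather than taken from the lecture: for a least-squares fit that includes an intercept, the same quantity equals one minus the ratio of the model's MSE to the variance of $\mathbb{Y}$.)

# Sketch: equivalent R^2 computation for a least-squares fit with an intercept
1 - mse_nba(theta_hat) / np.var(Y)   # should match r2_ast_fg_3pa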

That's super high!!! Wait, what's up with that?

(Hint: Basketball facts)

Let's try reducing the number of features to see how this Multiple $R^2$ metric changes.

In [17]:
# Use intercept, AST, 3PA
X_3d = nba[['AST', '3PA']]
X_3d.insert(0, 'Bias', 1)
X_3d = X_3d.to_numpy()

theta_ast_3pa = least_squares_estimate(X_3d, Y)
Y_hat_ast_3pa = X_3d @ theta_ast_3pa

r2_ast_3pa = np.var(Y_hat_ast_3pa) / np.var(Y)
r2_ast_3pa
Out[17]:
0.608786276366571
In [18]:
# Use intercept, AST only (SLR)
X_slr = nba[['AST']]
X_slr.insert(0, 'Bias', 1)
X_slr = X_slr.to_numpy()

theta_ast_only = least_squares_estimate(X_slr, Y)
Y_hat_ast_only = X_slr @ theta_ast_only

r2_ast_only = np.var(Y_hat_ast_only) / np.var(Y)
r2_ast_only
Out[18]:
0.4570055507968593
In [19]:
theta_ast_only
Out[19]:
array([[3.98332315],
       [2.39888152]])
In [20]:
theta_hat
Out[20]:
array([[-0.29253798],
       [ 2.51705703],
       [ 0.05075571],
       [ 0.31307653]])

Comparing these Multiple $R^2$ values:

In [21]:
print("(SLR) intercept, AST:    ", r2_ast_only)
print("intercept, 3PA, AST:     ", r2_ast_3pa)
print("intercept, FG, 3PA, AST: ", r2_ast_fg_3pa)
(SLR) intercept, AST:     0.4570055507968593
intercept, 3PA, AST:      0.608786276366571
intercept, FG, 3PA, AST:  0.9883162128703274

Because of how basketball is scored, knowing a player's field goals (FG) and 3-point attempts (3PA) reliably tells you how many points they score per game, assuming NBA players make a reasonable share of their 3-point attempts.

As a side note, if you want to check that Multiple $R^2$ for simple linear regression is indeed the squared correlation coefficient $r^2$:

In [22]:
r = np.corrcoef(nba['AST'], nba['PTS'])[0,1]
r ** 2
Out[22]:
0.4570055507968595