by Suraj Rampure
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
nba = pd.read_csv('nba18-19.csv')
nba
 | Rk | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | ... | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Álex Abrines\abrinal01 | SG | 25 | OKC | 31 | 2 | 19.0 | 1.8 | 5.1 | ... | 0.923 | 0.2 | 1.4 | 1.5 | 0.6 | 0.5 | 0.2 | 0.5 | 1.7 | 5.3 |
1 | 2 | Quincy Acy\acyqu01 | PF | 28 | PHO | 10 | 0 | 12.3 | 0.4 | 1.8 | ... | 0.700 | 0.3 | 2.2 | 2.5 | 0.8 | 0.1 | 0.4 | 0.4 | 2.4 | 1.7 |
2 | 3 | Jaylen Adams\adamsja01 | PG | 22 | ATL | 34 | 1 | 12.6 | 1.1 | 3.2 | ... | 0.778 | 0.3 | 1.4 | 1.8 | 1.9 | 0.4 | 0.1 | 0.8 | 1.3 | 3.2 |
3 | 4 | Steven Adams\adamsst01 | C | 25 | OKC | 80 | 80 | 33.4 | 6.0 | 10.1 | ... | 0.500 | 4.9 | 4.6 | 9.5 | 1.6 | 1.5 | 1.0 | 1.7 | 2.6 | 13.9 |
4 | 5 | Bam Adebayo\adebaba01 | C | 21 | MIA | 82 | 28 | 23.3 | 3.4 | 5.9 | ... | 0.735 | 2.0 | 5.3 | 7.3 | 2.2 | 0.9 | 0.8 | 1.5 | 2.5 | 8.9 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
703 | 528 | Tyler Zeller\zellety01 | C | 29 | MEM | 4 | 1 | 20.5 | 4.0 | 7.0 | ... | 0.778 | 2.3 | 2.3 | 4.5 | 0.8 | 0.3 | 0.8 | 1.0 | 4.0 | 11.5 |
704 | 529 | Ante Žižić\zizican01 | C | 22 | CLE | 59 | 25 | 18.3 | 3.1 | 5.6 | ... | 0.705 | 1.8 | 3.6 | 5.4 | 0.9 | 0.2 | 0.4 | 1.0 | 1.9 | 7.8 |
705 | 530 | Ivica Zubac\zubaciv01 | C | 21 | TOT | 59 | 37 | 17.6 | 3.6 | 6.4 | ... | 0.802 | 1.9 | 4.2 | 6.1 | 1.1 | 0.2 | 0.9 | 1.2 | 2.3 | 8.9 |
706 | 530 | Ivica Zubac\zubaciv01 | C | 21 | LAL | 33 | 12 | 15.6 | 3.4 | 5.8 | ... | 0.864 | 1.6 | 3.3 | 4.9 | 0.8 | 0.1 | 0.8 | 1.0 | 2.2 | 8.5 |
707 | 530 | Ivica Zubac\zubaciv01 | C | 21 | LAC | 26 | 25 | 20.2 | 3.8 | 7.2 | ... | 0.733 | 2.3 | 5.3 | 7.7 | 1.5 | 0.4 | 0.9 | 1.4 | 2.5 | 9.4 |
708 rows × 30 columns
In the last lecture, we used this magical package `sklearn` to determine the optimal value of $\hat{\theta}$ for a linear model that uses `AST` and `3PA` to predict `PTS`. (The usage of `sklearn` will be covered in the next lecture.)

Note: We didn't actually cover this part in the video, but it was in the notebook.
model = lm.LinearRegression(fit_intercept = True)
model.fit(nba[['AST', '3PA']], nba['PTS']);
We then looked at the values of the coefficients:
model.coef_
array([1.64065507, 1.25758096])
model.intercept_
2.156347047514835
This meant our model was of the form

$$\text{predicted PTS} = 2.1563 + 1.6407 \cdot \text{AST} + 1.2576 \cdot \text{3PA}$$

## Using `np.linalg.inv` to solve for optimal $\hat{\theta}$

We will now use what we know about the solution to the normal equations to determine $\hat{\theta}$ on our own, by hand. We know that for the ordinary least squares model,

$$\hat{\theta} = (\mathbb{X}^T\mathbb{X})^{-1} \mathbb{X}^T\mathbb{Y}$$

X = nba[['AST', '3PA']]
y = nba['PTS']
X.shape
(708, 2)
Here, our design matrix $\mathbb{X}$ only has two columns. But, in order to incorporate the intercept term (i.e. $\theta_0$), we need to include a column that contains all 1s. This is referred to as the "intercept" or "bias" column. (`sklearn` handled this for us, because we set `fit_intercept = True`. This is actually the default behavior in `sklearn`.)
X = X.copy()  # Make an explicit copy so we add the bias column to our own DataFrame, not a slice of nba.
X.loc[:, 'bias'] = 1
# This line reorders our columns so that column i corresponds to theta_i.
# By default, newly added columns go to the end of a DataFrame.
# This is not at all necessary; it just makes our resulting theta vector follow a nice order.
X = X[['bias', 'AST', '3PA']]
X
 | bias | AST | 3PA |
---|---|---|---|
0 | 1 | 0.6 | 4.1 |
1 | 1 | 0.8 | 1.5 |
2 | 1 | 1.9 | 2.2 |
3 | 1 | 1.6 | 0.0 |
4 | 1 | 2.2 | 0.2 |
... | ... | ... | ... |
703 | 1 | 0.8 | 0.0 |
704 | 1 | 0.9 | 0.0 |
705 | 1 | 1.1 | 0.0 |
706 | 1 | 0.8 | 0.0 |
707 | 1 | 1.5 | 0.0 |
708 rows × 3 columns
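As a side check (not in the original notebook), now that `X` carries an explicit bias column, we could also hand it directly to `sklearn` with `fit_intercept = False`; the intercept then shows up as the coefficient on the bias column. A minimal sketch:

```python
# Sketch: fit on the design matrix with its explicit bias column,
# letting the first coefficient play the role of the intercept.
model_bias_col = lm.LinearRegression(fit_intercept = False)
model_bias_col.fit(X, y)
model_bias_col.coef_  # expected to be approximately [2.1563, 1.6407, 1.2576]
```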
def least_squares(X, y):
    # Solve the normal equations: theta_hat = (X^T X)^{-1} X^T y
    return np.linalg.inv(X.T @ X) @ X.T @ y
theta = least_squares(X, y)
theta
0    2.156347
1    1.640655
2    1.257581
dtype: float64
Here, `theta[0] = 2.156`. Since the first column of `X` is the intercept column, this means that $\theta_0 = 2.1563$. Similarly, we have $\theta_1 = 1.6407$ and $\theta_2 = 1.2576$. These are the exact same coefficients that `sklearn` found for us!
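As a quick sanity check (not part of the original notebook), we can verify the match programmatically. The sketch below also shows an equivalent computation with `np.linalg.solve`, which solves the normal equations directly instead of explicitly forming the inverse of $\mathbb{X}^T\mathbb{X}$:

```python
# The hand-computed theta should agree with sklearn's intercept and coefficients.
print(np.allclose(theta, [model.intercept_, *model.coef_]))   # expected: True

# Equivalent (and generally more numerically stable): solve (X^T X) theta = X^T y.
theta_solve = np.linalg.solve(X.T @ X, X.T @ y)
print(np.allclose(theta, theta_solve))                        # expected: True
```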
For the simple linear case, let's revisit Anscombe's quartet.
ans = sns.load_dataset('anscombe')
ans1 = ans[ans['dataset'] == 'I']
ans2 = ans[ans['dataset'] == 'II']
Dataset 1 appears to have a linear trend between $x$ and $y$:
plt.scatter(ans1['x'], ans1['y']);
While dataset 2 does not:
plt.scatter(ans2['x'], ans2['y']);
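One famous property of Anscombe's quartet, worth a quick look before we dig into residuals: despite their very different shapes, the datasets produce nearly identical least-squares lines. A small sketch (not in the original notebook) to confirm this for datasets 1 and 2:

```python
# Fit a simple linear regression to each dataset and print its fitted line.
for name, data in [('I', ans1), ('II', ans2)]:
    m = lm.LinearRegression()
    m.fit(data[['x']], data['y'])
    print(f'dataset {name}: intercept = {m.intercept_:.2f}, slope = {m.coef_[0]:.2f}')
```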
Let's fit simple linear regression models to both datasets, and look at plots of the residuals vs. $x$.
slr_ans_1 = lm.LinearRegression()
slr_ans_1.fit(ans1[['x']], ans1['y'])
y_pred_ans_1 = slr_ans_1.predict(ans1[['x']])
residual_ans_1 = ans1['y'] - y_pred_ans_1
plt.scatter(ans1['x'], residual_ans_1, color = 'red');
plt.plot([4, 14], [0, 0], color = 'black')
plt.xlabel(r'$x$')
plt.ylabel('residual');
plt.title(r'Residual vs. $x$ for dataset 1');
For dataset 1, it appears that the residuals are generally equally spread out, and that there is no trend. This indicates that our linear model was a good fit here. (It is also true that the "positive" and "negative" residuals cancel out, but this is true even when our fit isn't good.)
slr_ans_2 = lm.LinearRegression()
slr_ans_2.fit(ans2[['x']], ans2['y'])
y_pred_ans_2 = slr_ans_2.predict(ans2[['x']])
residual_ans_2 = ans2['y'] - y_pred_ans_2
plt.scatter(ans2['x'], residual_ans_2, color = 'red');
plt.plot([4, 14], [0, 0], color = 'black')
plt.xlabel(r'$x$')
plt.ylabel('residual');
plt.title(r'Residual vs. $x$ for dataset 2');
The fit does not appear to be as good here. While the positive and negative residuals cancel out, there is a clear trend – the underlying data has a quadratic relationship, and the residuals reflect that. We'd likely need to increase the complexity of our model here.
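Since the trend in dataset 2 looks quadratic, one way to increase the model's complexity, sketched below (this is not part of the original notebook), is to add an $x^2$ feature and refit; the residuals should then lose their curved pattern:

```python
# Sketch: add a squared feature to capture the curvature in dataset 2.
ans2_quad = ans2[['x']].copy()
ans2_quad['x^2'] = ans2_quad['x'] ** 2

quad_ans_2 = lm.LinearRegression()
quad_ans_2.fit(ans2_quad, ans2['y'])
residual_quad_2 = ans2['y'] - quad_ans_2.predict(ans2_quad)

plt.scatter(ans2['x'], residual_quad_2, color = 'red')
plt.plot([4, 14], [0, 0], color = 'black')
plt.xlabel(r'$x$')
plt.ylabel('residual')
plt.title(r'Residual vs. $x$ for dataset 2, with an $x^2$ feature');
```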
As mentioned above, the residuals in both sum to zero, but this is true of any linear model with an intercept term, as discussed in lecture.
residual_ans_1.sum()
-1.865174681370263e-14
residual_ans_2.sum()
1.021405182655144e-14
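As a quick recap of why this holds: at the optimum, the normal equations from above give $\mathbb{X}^T(\mathbb{Y} - \mathbb{X}\hat{\theta}) = 0$, and the row of this equation corresponding to the all-1s bias column says exactly that the residuals $e_i = y_i - \hat{y}_i$ sum to zero:

$$\mathbf{1}^T(\mathbb{Y} - \mathbb{X}\hat{\theta}) = \sum_{i=1}^n e_i = 0$$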
For the multiple linear regression case, let's go back to the `nba` data. Let's once again use `AST` and `3PA` to predict `PTS`.
nba_model = lm.LinearRegression()
nba_model.fit(nba[['AST', '3PA']], nba['PTS'])
pts_pred = nba_model.predict(nba[['AST', '3PA']])
residual_nba = nba['PTS'] - pts_pred
fig = go.Figure()
data_scatter = go.Scatter(x = pts_pred, y = residual_nba,
mode = 'markers',
marker = dict(color = 'red', size = 8), name = 'residuals vs. fitted values')
fig.add_trace(data_scatter)
fig.add_trace(go.Scatter(x = [0, 35], y = [0, 0], name = 'residuals = 0', marker = dict(color = 'black')))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=400,
xaxis_title=r"Fitted Values",
yaxis_title="Residuals")
fig
There isn't a clear pattern in the residuals, but their spread appears uneven, which hints that the errors may not have constant variance.