import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
# Load the classic auto-mpg dataset bundled with seaborn (398 cars,
# fuel economy plus physical/engine attributes).
data = sns.load_dataset('mpg')
# Peek at the first five rows.
data.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
# Scatter of mpg vs. engine displacement with an OLS trendline fit by
# plotly express (uses statsmodels under the hood).
fig = px.scatter(data, x="displacement", y="mpg", trendline="ols")
fig
# Pull the fitted statsmodels OLS results object out of the figure.
# `get_trendline_results` returns a DataFrame with one row per trendline
# and the fit object in the "px_fit_results" column. Selecting it by
# column name avoids the deprecated positional `Series[0]` lookup that
# `.iloc[0][0]` performs (removed in pandas 3.x for integer labels).
model = px.get_trendline_results(fig).iloc[0]["px_fit_results"]
model.summary()
Dep. Variable: | y | R-squared: | 0.647 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.646 |
Method: | Least Squares | F-statistic: | 725.0 |
Date: | Tue, 02 Mar 2021 | Prob (F-statistic): | 1.66e-91 |
Time: | 14:50:54 | Log-Likelihood: | -1175.5 |
No. Observations: | 398 | AIC: | 2355. |
Df Residuals: | 396 | BIC: | 2363. |
Df Model: | 1 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 35.1748 | 0.492 | 71.519 | 0.000 | 34.208 | 36.142 |
x1 | -0.0603 | 0.002 | -26.926 | 0.000 | -0.065 | -0.056 |
Omnibus: | 41.373 | Durbin-Watson: | 1.394 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 60.024 |
Skew: | 0.711 | Prob(JB): | 9.24e-14 |
Kurtosis: | 4.264 | Cond. No. | 463. |
# Design matrix (double brackets keep a DataFrame, not a Series).
X = data[ ['displacement'] ]
# Target as a single-column DataFrame.
y = data[ ['mpg'] ]
X.head()
displacement | |
---|---|
0 | 307.0 |
1 | 350.0 |
2 | 318.0 |
3 | 304.0 |
4 | 302.0 |
Adding the constant term:
# Add an all-ones column so the intercept can be folded into the design
# matrix (same trick statsmodels' add_constant performs).
data["constant"] = 1
X = data[['displacement', "constant"]]
X.head()
displacement | constant | |
---|---|---|
0 | 307.0 | 1 |
1 | 350.0 | 1 |
2 | 318.0 | 1 |
3 | 304.0 | 1 |
4 | 302.0 | 1 |
X.head()
displacement | constant | |
---|---|---|
0 | 307.0 | 1 |
1 | 350.0 | 1 |
2 | 318.0 | 1 |
3 | 304.0 | 1 |
4 | 302.0 | 1 |
X.T @ X
displacement | constant | |
---|---|---|
displacement | 19206864.25 | 76983.5 |
constant | 76983.50 | 398.0 |
X.T @ y
mpg | |
---|---|
displacement | 1550039.4 |
constant | 9358.8 |
from numpy.linalg import inv, solve
The exact math from lecture.
inv(X.T @ X) @ (X.T @ y)
mpg | |
---|---|
0 | -0.060282 |
1 | 35.174750 |
More numerically stable and computationally efficient.
solve(X.T @ X, X.T @ y)
array([[-0.06028241], [35.17475015]])
from sklearn.linear_model import LinearRegression
# fit_intercept=False because X already contains the explicit "constant"
# column; sklearn should not add a second intercept.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
# Coefficients match the normal-equation solution above.
print(model.coef_)
[[-0.06028241 35.17475015]]
# Keep the normal-equation solution (2x1 array: slope, intercept).
theta = solve(X.T @ X, X.T @ y)
theta
array([[-0.06028241], [35.17475015]])
# Predictions via the linear-algebra route: yhat = X theta.
# NOTE(review): X @ theta is a single-column DataFrame; assigning it to a
# column worked in this pandas version — presumably relies on
# single-column DataFrame assignment; verify on newer pandas.
data['yhat'] = (X @ theta)
data.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | constant | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu | 1 | 16.668052 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 | 1 | 14.075908 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite | 1 | 16.004945 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst | 1 | 16.848899 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino | 1 | 16.969464 |
# Same predictions via sklearn — overwrites 'yhat' with identical values.
data['yhat'] = model.predict(X)
data.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | constant | yhat | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu | 1 | 16.668052 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 | 1 | 14.075908 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite | 1 | 16.004945 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst | 1 | 16.848899 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino | 1 | 16.969464 |
# Residuals = observed - predicted.
data["residual"] = data["mpg"] - data["yhat"]
fig = px.scatter(data, x="displacement", y="residual")
# Horizontal reference line at residual = 0 across the displacement range.
fig.add_trace(go.Scatter(x=[50, 475], y=[0,0], name = "Model"))
data['residual'].sum()
-1.7053025658242404e-13
np.sqrt(np.mean(data['residual']**2))
4.6396293441245815
data["residual"].abs().mean()
3.526327374875378
data.head()
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | constant | yhat | residual | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu | 1 | 16.668052 | 1.331948 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 | 1 | 14.075908 | 0.924092 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite | 1 | 16.004945 | 1.995055 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst | 1 | 16.848899 | -0.848899 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino | 1 | 16.969464 | 0.030536 |
# Multi-feature model; fit_intercept=True here since no constant column
# is included among the features.
model2 = LinearRegression(fit_intercept=True)
features = ["cylinders", "displacement", "weight", "model_year", "acceleration"]
model2.fit(data[features], data[['mpg']])
# Raw-scale coefficients — magnitudes are NOT comparable across features
# because the features have very different scales.
print(model2.coef_)
[[-0.25858516 0.00726771 -0.00692571 0.75530084 0.08034746]]
data['yhat2'] = model2.predict(data[features])
What can we say about the magnitudes of the weights?
# Overlaid histograms show how wildly the raw feature scales differ
# (weight in thousands vs. cylinders in single digits).
px.histogram(data,
             x = ["cylinders", "displacement", "weight"],
             barmode="overlay",
             marginal="box")
Rescaling the features
from sklearn.preprocessing import StandardScaler
# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
# Z is a plain ndarray (column order matches `features`).
Z = scaler.fit_transform(data[features])
Z
array([[ 1.49819126, 1.0906037 , 0.63086987, -1.62742629, -1.29549834], [ 1.49819126, 1.5035143 , 0.85433297, -1.62742629, -1.47703779], [ 1.49819126, 1.19623199, 0.55047045, -1.62742629, -1.65857724], ..., [-0.85632057, -0.56103873, -0.79858454, 1.62198339, -1.4407299 ], [-0.85632057, -0.70507731, -0.40841088, 1.62198339, 1.10082237], [-0.85632057, -0.71467988, -0.29608816, 1.62198339, 1.39128549]])
# After scaling, the first three features now share a common scale.
px.histogram(x = [Z[:,0], Z[:,1], Z[:,2]],
             barmode="overlay")
# Refit on standardized features: coefficients are now comparable in
# magnitude, and predictions are identical to model2's (yhat3 == yhat2
# in the table below) since standardization is an invertible linear map.
model3 = LinearRegression(fit_intercept=True)
model3.fit(Z, data[['mpg']])
print(model3.coef_)
data['yhat3'] = model3.predict(Z)
[[-0.43930153 0.75684991 -5.85760433 2.78930976 0.22129477]]
features
['cylinders', 'displacement', 'weight', 'model_year', 'acceleration']
data
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | constant | yhat | residual | yhat2 | yhat3 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu | 1 | 16.668052 | 1.331948 | 15.160369 | 15.160369 |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 | 1 | 14.075908 | 0.924092 | 14.123749 | 14.123749 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite | 1 | 16.004945 | 1.995055 | 15.630915 | 15.630915 |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst | 1 | 16.848899 | -0.848899 | 15.630291 | 15.630291 |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino | 1 | 16.969464 | 0.030536 | 15.384423 | 15.384423 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl | 1 | 26.735213 | 0.264787 | 29.278818 | 29.278818 |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup | 1 | 29.327357 | 14.672643 | 34.260400 | 34.260400 |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage | 1 | 27.036625 | 4.963375 | 32.349314 | 32.349314 |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger | 1 | 27.940861 | 0.059139 | 30.517248 | 30.517248 |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 | 1 | 28.001144 | 2.998856 | 29.916316 | 29.916316 |
398 rows × 14 columns
# RMSE of the multi-feature model — improves on the single-feature 4.64.
data['residual2'] = data['mpg'] - data['yhat2']
np.sqrt(np.mean(data['residual2']**2))
3.4143567605602265
from sklearn.metrics import r2_score
r2_score(data["mpg"], data["yhat"])
0.646742183425786
r2_score(data["mpg"], data["yhat2"])
0.8086876515237436