import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import plotly.graph_objects as go
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')
In this lecture, we will use the mpg dataset from the seaborn library. The dataset has 392 rows and 9 columns. In this demo, we will use some of the columns and their transformations to predict the value of the mpg column.
Let's load the dataset first and plot mpg with respect to the hp column.
np.random.seed(319)
# Load the fuel dataset, and drop any rows that have missing data
vehicle_data = sns.load_dataset('mpg').dropna()
vehicle_data = vehicle_data.rename(columns={"horsepower": "hp"})
px.scatter(vehicle_data, x="hp", y="mpg")
Let's also look at the different columns in the DataFrame.
vehicle_data
 | mpg | cylinders | displacement | hp | weight | acceleration | model_year | origin | name |
---|---|---|---|---|---|---|---|---|---
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 |
392 rows × 9 columns
We select 35 samples from vehicle_data and focus on this sample to build linear regression models with different polynomial degrees (degrees 0 through 6 will be explored here) from the hp feature. For example, a degree-4 polynomial based on the hp feature to predict mpg can be characterized as follows:
$\widehat{mpg} = \theta_0 + \theta_1 hp + \theta_2 hp^2 + \theta_3 hp^3 + \theta_4 hp^4$
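To see exactly what this transformation produces, here is a minimal sketch (separate from the pipeline used below) applying PolynomialFeatures(degree=4) to a couple of illustrative hp values:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Two hypothetical hp values, just for illustration
hp_demo = np.array([[100.0], [150.0]])
poly = PolynomialFeatures(degree=4)
X_demo = poly.fit_transform(hp_demo)
# The generated columns are exactly the terms in the equation above: 1, hp, hp^2, hp^3, hp^4
print(poly.get_feature_names_out(["hp"]))  # ['1' 'hp' 'hp^2' 'hp^3' 'hp^4']
print(X_demo[0])  # [1.e+00 1.e+02 1.e+04 1.e+06 1.e+08]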
# Selecting a set of 35 samples
np.random.seed(5)
vehicle_data_sample_35 = vehicle_data.sample(35)
# Plotting mpg vs hp to observe the trend in the data
fig = px.scatter(vehicle_data_sample_35, x="hp", y="mpg")
# fig.write_image("35_vehicles.png")
fig.show()
# Building a pipeline for:
# 1) Transforming 'hp' to polynomial features of degrees 0 up to k (inclusive)
# 2) Fitting a linear regression model to the vehicle_data_sample_35
# 'get_MSE_for_degree_k_model(k)' returns MSE for a degree k polynomial fit to the data
def get_MSE_for_degree_k_model(k):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(vehicle_data_sample_35[["hp"]], vehicle_data_sample_35["mpg"])
return mean_squared_error(pipelined_model.predict(vehicle_data_sample_35[["hp"]]), vehicle_data_sample_35["mpg"])
# Running the pipeline for polynomial degrees 0 through 6.
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
MSEs_and_k
 | k | MSE |
---|---|---
0 | 0 | 72.091396 |
1 | 1 | 28.002727 |
2 | 2 | 25.835769 |
3 | 3 | 25.831592 |
4 | 4 | 25.763052 |
5 | 5 | 25.609403 |
6 | 6 | 23.251351 |
Notice that the MSE only decreases as the degree grows: a higher-degree polynomial can always fit the data it was trained on at least as well, so this table alone cannot tell us which degree generalizes best.
# Plotting MSE vs polynomial degree k
fig = px.line(MSEs_and_k, x="k", y="MSE")
fig.update_layout(font_size=15,
xaxis_title="Polynomial Degree (k)",
yaxis_title="Mean Square Error",
margin=dict(l=50, r=50, b=0, t=1))
fig.show()
# fig.write_image("MSE_vs_k.png")
# Plotting degree k polynomial fit to the data
def plot_degree_k_model(k):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(vehicle_data_sample_35[["hp"]], vehicle_data_sample_35["mpg"])
fig = go.Figure()
fig.add_trace(go.Scatter(x=vehicle_data_sample_35['hp'], y=vehicle_data_sample_35['mpg'],
mode="markers", name=""))
x_range = np.linspace(45, 210, 100)
fig.add_trace(go.Scatter(x=x_range, y=pipelined_model.predict(x_range.reshape(-1, 1)),
mode = "lines", name = ""))
fig.update_layout(font_size=12,
xaxis_title="hp",
yaxis_title="mpg",
showlegend=False)
return fig
for i in range(7):
fig = plot_degree_k_model(i)
fig.update_layout(title=f"degree {i} polynomial")
fig.show()
# fig.write_image(f"degree {i} polynomial.png")
The code below splits our 35 samples into a training set of 25 points and a validation set of 10 points.
from sklearn.utils import shuffle
np.random.seed(320)
# Splitting the data into training and validation set
training_set, validation_set = np.split(shuffle(vehicle_data_sample_35), [25])
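As an aside, an equivalent split could be made with scikit-learn's train_test_split; a sketch (the np.split approach above is what the rest of the demo uses, and the exact rows selected here would differ):
from sklearn.model_selection import train_test_split
# Hold out 10 of the 35 rows for validation; random_state pins the shuffle
train_alt, val_alt = train_test_split(vehicle_data_sample_35, test_size=10, random_state=320)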
# Training LR models with different degree polynomial features on training data
def get_MSE_for_degree_k_model_training_set(k):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(training_set[["hp"]], training_set["mpg"])
return mean_squared_error(pipelined_model.predict(training_set[["hp"]]), training_set["mpg"])
# Plotting MSE vs polynomial degree k
ks = np.array(range(0, 7))
MSEs = [get_MSE_for_degree_k_model_training_set(k) for k in ks]
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
fig = px.line(MSEs_and_k, x="k", y="MSE")
fig.update_layout(font_size=15,
xaxis_title="Polynomial Degree (k)",
yaxis_title="Mean Square Error",
margin=dict(l=50, r=50, b=0, t=1))
fig.show()
# fig.write_image("MSE_vs_k_training_set.png")
# MSE of models with different degree polynomials fitted on the training data
MSEs_and_k
 | k | MSE |
---|---|---
0 | 0 | 60.235744 |
1 | 1 | 30.756678 |
2 | 2 | 29.875269 |
3 | 3 | 29.180868 |
4 | 4 | 28.214850 |
5 | 5 | 25.290990 |
6 | 6 | 23.571025 |
# Plotting degree k polynomial fit to the training data
def plot_degree_k_model_on_training_set(k):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(training_set[["hp"]], training_set["mpg"])
fig = go.Figure()
fig.add_trace(go.Scatter(x=training_set['hp'], y=training_set['mpg'],
mode="markers", name=""))
x_range = np.linspace(45, 210, 100)
fig.add_trace(go.Scatter(x=x_range, y=pipelined_model.predict(x_range.reshape(-1, 1)),
mode = "lines", name = ""))
fig.update_layout(font_size=12,
xaxis_title="hp",
yaxis_title="mpg",
showlegend = False)
return fig
for i in range(7):
fig = plot_degree_k_model_on_training_set(i)
fig.update_layout(title=f"degree {i} polynomial fitted to the training data")
fig.show()
# fig.write_image(f"degree {i} polynomial on training only.png")
# Plotting degree k polynomial fit to the training data along with the validation set
def plot_degree_k_model_on_training_set_and_val_set(k):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(training_set[["hp"]], training_set["mpg"])
fig = go.Figure()
fig.add_trace(go.Scatter(x=training_set['hp'], y=training_set['mpg'],
mode="markers", name="training set"))
fig.add_trace(go.Scatter(x=validation_set['hp'], y=validation_set['mpg'],
mode = "markers", name = "dev set", marker=dict(
color='Orange')))
x_range = np.linspace(45, 210, 100)
fig.add_trace(go.Scatter(x=x_range, y=pipelined_model.predict(x_range.reshape(-1, 1)),
mode="lines", name="model prediction", line=dict(color='red')))
fig.update_layout(font_size=12,
xaxis_title="hp",
yaxis_title="mpg",
showlegend=False)
return fig
for i in range(7):
fig = plot_degree_k_model_on_training_set_and_val_set(i)
fig.update_layout(title=f"degree {i} polynomial training/validation outcomes")
fig.show()
# fig.write_image(f"degree {i} polynomial training-dev outcomes.png")
# Calculating MSE for training and validation sets
# based on different degrees of polynomial
def get_training_and_val_MSE_for_degree_k_model(k, training, validation):
pipelined_model = Pipeline([
('transform', PolynomialFeatures(degree=k)),
('regression', LinearRegression(fit_intercept=True))
])
pipelined_model.fit(training[["hp"]], training["mpg"])
training_error = mean_squared_error(pipelined_model.predict(training[["hp"]]), training["mpg"])
validation_error = mean_squared_error(pipelined_model.predict(validation[["hp"]]), validation["mpg"])
return [k, training_error, validation_error]
ks = np.array(range(0, 7))
MSEs = [get_training_and_val_MSE_for_degree_k_model(k, training_set, validation_set) for k in ks]
MSEs_and_k = pd.DataFrame(MSEs, columns = ['k', 'Training MSE', 'Validation MSE'])
MSEs_and_k
 | k | Training MSE | Validation MSE |
---|---|---|---
0 | 0 | 60.235744 | 106.925296 |
1 | 1 | 30.756678 | 22.363676 |
2 | 2 | 29.875269 | 17.331880 |
3 | 3 | 29.180868 | 21.889257 |
4 | 4 | 28.214850 | 27.340989 |
5 | 5 | 25.290990 | 130.599765 |
6 | 6 | 23.571025 | 129.209502 |
ks = np.array(range(0, 7))
results = []
for k in ks:
results.append(get_training_and_val_MSE_for_degree_k_model(k, training_set, validation_set))
MSE_experiment_results = pd.DataFrame(results, columns = ["k", "Training MSE", "Validation MSE"])
fig = px.line(MSE_experiment_results, x="k", y=["Training MSE", "Validation MSE"], markers=True,
color_discrete_map={"Training MSE": 'royalblue',
"Validation MSE": 'rgb(242,183,1)'})
fig.update_layout(font_size=20,
xaxis_title="Polynomial Degree (k)",
yaxis_title="MSE",
legend_title="",
legend=dict(x=0.07, y=.94, bordercolor="Black", borderwidth=2),
margin=dict(l=50, r=50, b=0, t=1),
showlegend = True)
fig.show()
# fig.write_image("training_and_validation_error_vs_degree.png")
We can also make another split in the data to obtain training (25 points), validation (5 points), and test (5 points) sets.
np.random.seed(320)
# Splitting the data into training, validation, and test set
train_set, val_set, test_set = np.split(shuffle(vehicle_data_sample_35), [25, 30])
test_set
 | mpg | cylinders | displacement | hp | weight | acceleration | model_year | origin | name |
---|---|---|---|---|---|---|---|---|---
359 | 28.1 | 4 | 141.0 | 80.0 | 3230 | 20.4 | 81 | europe | peugeot 505s turbo diesel |
109 | 21.0 | 4 | 140.0 | 72.0 | 2401 | 19.5 | 73 | usa | chevrolet vega |
63 | 14.0 | 8 | 400.0 | 175.0 | 4385 | 12.0 | 72 | usa | pontiac catalina |
302 | 34.5 | 4 | 105.0 | 70.0 | 2150 | 14.9 | 79 | usa | plymouth horizon tc3 |
223 | 15.5 | 8 | 318.0 | 145.0 | 4140 | 13.7 | 77 | usa | dodge monaco brougham |
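A three-way split can likewise be written as two chained train_test_split calls; a sketch with hypothetical variable names (again, the exact rows would differ from the np.split result above):
from sklearn.model_selection import train_test_split
# First hold out 10 rows, then split that holdout into 5 validation and 5 test rows
train_alt, holdout = train_test_split(vehicle_data_sample_35, test_size=10, random_state=320)
val_alt, test_alt = train_test_split(holdout, test_size=5, random_state=320)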
In this part, we will use k-fold cross-validation: the sample is divided into k folds, each fold serves once as the validation set while the model is trained on the remaining k-1 folds, and the k validation errors are averaged.
from sklearn.model_selection import KFold
np.random.seed(25)
def train_with_crossvalidation(degree, folds):
kf = KFold(n_splits=folds, shuffle=True)
validation_errors = []
X = vehicle_data_sample_35[["hp"]]
Y = vehicle_data_sample_35["mpg"]
model = Pipeline([
('transform', PolynomialFeatures(degree = degree)),
('regression', LinearRegression(fit_intercept = True))
])
for train_idx, valid_idx in kf.split(X):
# Split the data
split_X_train, split_X_valid = X.iloc[train_idx], X.iloc[valid_idx]
split_Y_train, split_Y_valid = Y.iloc[train_idx], Y.iloc[valid_idx]
# Fit the model on the training split
model.fit(split_X_train, split_Y_train)
error = mean_squared_error(model.predict(split_X_valid), split_Y_valid)
validation_errors.append(error)
return np.mean(validation_errors)
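For reference, scikit-learn's cross_val_score can replace this manual loop; a sketch of the equivalent computation (sklearn scorers return negated MSE, so we flip the sign):
from sklearn.model_selection import cross_val_score
def cv_mse_sklearn(degree, folds):
    model = Pipeline([
        ('transform', PolynomialFeatures(degree=degree)),
        ('regression', LinearRegression(fit_intercept=True))
    ])
    # cross_val_score fits one model per fold and scores it on the held-out fold
    scores = cross_val_score(model, vehicle_data_sample_35[["hp"]], vehicle_data_sample_35["mpg"],
                             cv=KFold(n_splits=folds, shuffle=True),
                             scoring="neg_mean_squared_error")
    return -scores.mean()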
# Running the pipeline for polynomial degrees 0 through 6 and fold counts 2 through 5.
ks = np.array(range(0, 7))
folds = np.array(range(2, 6))
# DataFrame.append was removed in pandas 2.0, so we collect the results in a list of dicts
results_cv = []
for k in ks:
    for f in folds:
        results_cv.append({'k': k, 'folds': f, 'MSE': train_with_crossvalidation(k, f)})
MSEs_CV = pd.DataFrame(results_cv)
MSEs_CV['k'] = MSEs_CV['k'].astype('int')
MSEs_CV['folds'] = MSEs_CV['folds'].astype('int')
MSEs_CV
 | k | folds | MSE |
---|---|---|---
0 | 0 | 2 | 74.932817 |
1 | 0 | 3 | 77.805343 |
2 | 0 | 4 | 76.023334 |
3 | 0 | 5 | 76.508954 |
4 | 1 | 2 | 32.213340 |
5 | 1 | 3 | 40.461963 |
6 | 1 | 4 | 30.009160 |
7 | 1 | 5 | 29.871185 |
8 | 2 | 2 | 29.749581 |
9 | 2 | 3 | 31.515481 |
10 | 2 | 4 | 32.115767 |
11 | 2 | 5 | 32.041207 |
12 | 3 | 2 | 43.428425 |
13 | 3 | 3 | 41.287040 |
14 | 3 | 4 | 37.727481 |
15 | 3 | 5 | 36.535043 |
16 | 4 | 2 | 204.783036 |
17 | 4 | 3 | 89.502549 |
18 | 4 | 4 | 44.567345 |
19 | 4 | 5 | 35.260131 |
20 | 5 | 2 | 528.843477 |
21 | 5 | 3 | 25896.873825 |
22 | 5 | 4 | 1865.577482 |
23 | 5 | 5 | 1060.025244 |
24 | 6 | 2 | 7844.756777 |
25 | 6 | 3 | 26607.702808 |
26 | 6 | 4 | 6487.723372 |
27 | 6 | 5 | 22264.478167 |
# Looking into MSEs for different degree polynomials
MSEs_CV.groupby("k")["MSE"].mean().to_frame()
k | MSE
---|---
0 | 76.317612 |
1 | 33.138912 |
2 | 31.355509 |
3 | 39.744497 |
4 | 93.528265 |
5 | 7337.830007 |
6 | 15801.165281 |
Averaged over fold counts, the degree-2 model achieves the lowest cross-validated MSE, while the error explodes for degrees 5 and 6, a clear sign of overfitting.
# Remember what features we had in the vehicle_data dataframe
vehicle_data
 | mpg | cylinders | displacement | hp | weight | acceleration | model_year | origin | name |
---|---|---|---|---|---|---|---|---|---
0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl |
394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup |
395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage |
396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger |
397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 |
392 rows × 9 columns
# Building a more complex dataset of first- and second-order features
poly_transform = PolynomialFeatures(degree=2, include_bias=False)
vehicle_data_with_squared_features = \
pd.DataFrame(poly_transform.fit_transform(vehicle_data[["hp", "weight", "displacement"]]),
columns = poly_transform.get_feature_names_out())
vehicle_data_with_squared_features
 | hp | weight | displacement | hp^2 | hp weight | hp displacement | weight^2 | weight displacement | displacement^2 |
---|---|---|---|---|---|---|---|---|---
0 | 130.0 | 3504.0 | 307.0 | 16900.0 | 455520.0 | 39910.0 | 12278016.0 | 1075728.0 | 94249.0 |
1 | 165.0 | 3693.0 | 350.0 | 27225.0 | 609345.0 | 57750.0 | 13638249.0 | 1292550.0 | 122500.0 |
2 | 150.0 | 3436.0 | 318.0 | 22500.0 | 515400.0 | 47700.0 | 11806096.0 | 1092648.0 | 101124.0 |
3 | 150.0 | 3433.0 | 304.0 | 22500.0 | 514950.0 | 45600.0 | 11785489.0 | 1043632.0 | 92416.0 |
4 | 140.0 | 3449.0 | 302.0 | 19600.0 | 482860.0 | 42280.0 | 11895601.0 | 1041598.0 | 91204.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
387 | 86.0 | 2790.0 | 140.0 | 7396.0 | 239940.0 | 12040.0 | 7784100.0 | 390600.0 | 19600.0 |
388 | 52.0 | 2130.0 | 97.0 | 2704.0 | 110760.0 | 5044.0 | 4536900.0 | 206610.0 | 9409.0 |
389 | 84.0 | 2295.0 | 135.0 | 7056.0 | 192780.0 | 11340.0 | 5267025.0 | 309825.0 | 18225.0 |
390 | 79.0 | 2625.0 | 120.0 | 6241.0 | 207375.0 | 9480.0 | 6890625.0 | 315000.0 | 14400.0 |
391 | 82.0 | 2720.0 | 119.0 | 6724.0 | 223040.0 | 9758.0 | 7398400.0 | 323680.0 | 14161.0 |
392 rows × 9 columns
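Before fitting, recall the objective ridge regression minimizes: ordinary least squares plus an L2 penalty on the coefficients, with the hyperparameter $\alpha$ controlling the penalty's strength (this is the objective scikit-learn's Ridge uses):
$\hat{\theta} = \underset{\theta}{\arg\min} \; \Vert y - X\theta \Vert_2^2 + \alpha \Vert \theta \Vert_2^2$
Larger $\alpha$ shrinks the coefficients more aggressively, as the fits below with $\alpha = 10^{-5}$ and $\alpha = 10000$ illustrate.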
# Fitting a regression model with L2 regularization
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=10**-5)
ridge_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Ridge(alpha=1e-05)
ridge_model.coef_
array([-1.35872588e-01, -1.46864458e-04, -1.18230336e-01, -4.03590098e-04, -1.12862371e-05, 8.25179864e-04, -1.17645881e-06, 2.69757832e-05, -1.72888463e-04])
ridge_model = Ridge(alpha=10000)
ridge_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Ridge(alpha=10000)
ridge_model.coef_
array([-5.56414449e-02, -7.93804083e-03, -8.22081425e-02, -6.18785466e-04, -2.55492157e-05, 9.47353944e-04, 7.58061062e-07, 1.07439477e-05, -1.64344898e-04])
# Fitting a regression model without regularization
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
LinearRegression()
linear_model.coef_
array([-1.35872588e-01, -1.46864447e-04, -1.18230336e-01, -4.03590097e-04, -1.12862370e-05, 8.25179863e-04, -1.17645882e-06, 2.69757832e-05, -1.72888463e-04])
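With $\alpha = 10^{-5}$, the ridge penalty was negligible, which is why those coefficients are essentially identical to the unregularized ones above. Lasso uses an L1 penalty instead, which tends to drive some coefficients exactly to zero; scikit-learn's Lasso minimizes the following objective (note the $\frac{1}{2n}$ scaling of the squared error):
$\hat{\theta} = \underset{\theta}{\arg\min} \; \frac{1}{2n} \Vert y - X\theta \Vert_2^2 + \alpha \Vert \theta \Vert_1$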
# Fitting a regression model with L1 regularization
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 10)
lasso_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Lasso(alpha=10)
lasso_model.coef_
array([-0.00000000e+00, -1.88104942e-02, -0.00000000e+00, -1.19625308e-03, 8.84657720e-06, 8.77253835e-04, 3.16759194e-06, -3.21738391e-05, -1.29386937e-05])
Notice that the features have wildly different scales (weight^2 is in the millions, while hp is around 100), so a single penalty strength $\alpha$ treats them very unevenly. Let's standardize each feature to mean 0 and variance 1 before regularizing.
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
rescaled_df = pd.DataFrame(ss.fit_transform(vehicle_data_with_squared_features),
columns = ss.get_feature_names_out())
rescaled_df
 | hp | weight | displacement | hp^2 | hp weight | hp displacement | weight^2 | weight displacement | displacement^2 |
---|---|---|---|---|---|---|---|---|---
0 | 0.664133 | 0.620540 | 1.077290 | 0.459979 | 0.528582 | 0.738743 | 0.492385 | 0.791843 | 0.926137 |
1 | 1.574594 | 0.843334 | 1.488732 | 1.513418 | 1.227955 | 1.562694 | 0.741147 | 1.206419 | 1.500790 |
2 | 1.184397 | 0.540382 | 1.182542 | 1.031336 | 0.800830 | 1.098529 | 0.406079 | 0.824195 | 1.065981 |
3 | 1.184397 | 0.536845 | 1.048584 | 1.031336 | 0.798784 | 1.001539 | 0.402311 | 0.730474 | 0.888852 |
4 | 0.924265 | 0.555706 | 1.029447 | 0.735454 | 0.652885 | 0.848203 | 0.422448 | 0.726585 | 0.864198 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
387 | -0.480448 | -0.221125 | -0.520637 | -0.509696 | -0.451563 | -0.548450 | -0.329472 | -0.518158 | -0.592298 |
388 | -1.364896 | -0.999134 | -0.932079 | -0.988411 | -1.038886 | -0.871565 | -0.923326 | -0.869957 | -0.799593 |
389 | -0.532474 | -0.804632 | -0.568479 | -0.544385 | -0.665978 | -0.580780 | -0.789800 | -0.672604 | -0.620267 |
390 | -0.662540 | -0.415627 | -0.712005 | -0.627538 | -0.599621 | -0.666685 | -0.492872 | -0.662710 | -0.698072 |
391 | -0.584501 | -0.303641 | -0.721574 | -0.578258 | -0.528400 | -0.653846 | -0.400009 | -0.646113 | -0.702933 |
392 rows × 9 columns
ridge_model = Ridge(alpha=10000)
ridge_model.fit(rescaled_df, vehicle_data["mpg"])
ridge_model.coef_
array([-0.1792743 , -0.19610513, -0.18648617, -0.1601219 , -0.18015125, -0.16858023, -0.18779478, -0.18176294, -0.17021841])
After standardization, all nine coefficients have comparable magnitudes, so the L2 penalty shrinks them evenly instead of being dominated by the features with the largest raw scales.