import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
In this notebook, we discuss feature engineering for quantitative continuous data. In the process, we will work through constructing a model that predicts vehicle efficiency.
For this notebook, we will use the seaborn `mpg` data set, which describes the fuel mileage (measured in miles per gallon, or mpg) of various cars along with characteristics of those cars. Our goal will be to build a model that predicts the fuel mileage of a car from those characteristics.
from seaborn import load_dataset
data = load_dataset("mpg")
data
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 393 | 27.0 | 4 | 140.0 | 86.0 | 2790 | 15.6 | 82 | usa | ford mustang gl |
| 394 | 44.0 | 4 | 97.0 | 52.0 | 2130 | 24.6 | 82 | europe | vw pickup |
| 395 | 32.0 | 4 | 135.0 | 84.0 | 2295 | 11.6 | 82 | usa | dodge rampage |
| 396 | 28.0 | 4 | 120.0 | 79.0 | 2625 | 18.6 | 82 | usa | ford ranger |
| 397 | 31.0 | 4 | 119.0 | 82.0 | 2720 | 19.4 | 82 | usa | chevy s-10 |

398 rows × 9 columns
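Before fitting anything, it helps to look at how mpg relates to the features. As a quick exploratory sketch (the choice of horsepower on the x-axis and the coloring by origin here are purely illustrative), we can use the plotly express import from above:

# Exploratory scatter plot: fuel mileage vs. horsepower, colored by origin.
fig = px.scatter(data, x="horsepower", y="mpg", color="origin", hover_name="name")
fig.show()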
This data set has several quantitative continuous features that we can use to build our first model. However, even for quantitative continuous features, we may want to do some additional feature engineering. One thing to consider right away is missing values.
We can use the Pandas `DataFrame.isna` function to find rows with missing values:
data[data.isna().any(axis=1)]
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name |
|---|---|---|---|---|---|---|---|---|---|
| 32 | 25.0 | 4 | 98.0 | NaN | 2046 | 19.0 | 71 | usa | ford pinto |
| 126 | 21.0 | 6 | 200.0 | NaN | 2875 | 17.0 | 74 | usa | ford maverick |
| 330 | 40.9 | 4 | 85.0 | NaN | 1835 | 17.3 | 80 | europe | renault lecar deluxe |
| 336 | 23.6 | 4 | 140.0 | NaN | 2905 | 14.3 | 80 | usa | ford mustang cobra |
| 354 | 34.5 | 4 | 100.0 | NaN | 2320 | 15.8 | 81 | europe | renault 18i |
| 374 | 23.0 | 4 | 151.0 | NaN | 3035 | 20.5 | 82 | usa | amc concord dl |
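All of the missing values appear in the horsepower column. As a quick check (a small sketch, for example), we can count the missing values per column:

# Count missing values in each column; only horsepower has any.
data.isna().sum()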
There are many ways to deal with missing values. A common strategy is to substitute (impute) the mean of the column. Because the fact that a value was missing can itself be useful signal, it is often also a good idea to include a binary feature indicating that the value was missing. The feature function below does both:
def phi_cont(df):
    # Select the quantitative continuous features.
    Phi = df[["cylinders", "displacement",
              "horsepower", "weight",
              "acceleration",
              "model_year"]].copy()
    # Add a binary feature marking which horsepower values were missing.
    Phi["horsepower_missing"] = Phi["horsepower"].isna()
    # Impute missing values with the column mean.
    Phi = Phi.fillna(Phi.mean())
    return Phi
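As a sanity check (a sketch, for example), we can apply the feature function and confirm that no missing values remain:

# Apply the feature function and verify the imputation worked.
Phi = phi_cont(data)
print(Phi.isna().sum().sum())  # should print 0
Phi.head()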
Using our feature function, we can fit our first model to the transformed data:
model = LinearRegression()
model.fit(phi_cont(data), data[["mpg"]])
LinearRegression()
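It can also be informative to inspect the fitted coefficients, keeping in mind that the features are on very different scales, so the raw magnitudes are not directly comparable. A minimal sketch:

# Pair each feature name with its fitted coefficient.
pd.Series(model.coef_.flatten(), index=phi_cont(data).columns)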
Because we are going to be building multiple models with different feature functions, it is important to have a standard way to track and compare them.
The following function takes the name of a model, a fitted model, a feature function, and the dictionary of models that we have already constructed. It evaluates the new model on the data and plots how it performs relative to the previous models, as well as the $Y$ vs. $\hat{Y}$ scatter plot.
In addition, it updates the dictionary of models to include the new model for future plotting.
def evaluate_model(name, model, phi, models=None):
    # Avoid a shared mutable default argument; use a fresh dict when none is given.
    if models is None:
        models = {}
    # Run the prediction function and compute the RMSE.
    Yhat = model.predict(phi(data)).flatten()
    Y = data["mpg"].to_numpy()
    rmse = np.sqrt(mean_squared_error(Y, Yhat))
    print("Root Mean Squared Error:", rmse)
    # Save the model, feature function, and RMSE to the collection of models.
    models[name] = dict(model=model, phi=phi, rmse=rmse)
    # Generate diagnostic and model comparison plots.
    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(go.Scatter(x=Yhat, y=Y, mode="markers"), row=1, col=1)
    fig.update_xaxes(title="Yhat", row=1, col=1)
    fig.update_yaxes(title="Y", row=1, col=1)
    # Reference line where Y = Yhat (a perfect prediction).
    ymin, ymax = np.min(Yhat), np.max(Yhat)
    fig.add_trace(go.Scatter(x=[ymin, ymax], y=[ymin, ymax], name="y=yhat"),
                  row=1, col=1)
    # Bar chart comparing the RMSE of all models seen so far.
    fig.add_trace(go.Bar(x=list(models.keys()),
                         y=[models[k]["rmse"] for k in models]), row=1, col=2)
    fig.update_layout(showlegend=False)
    fig.update_yaxes(title="RMSE", row=1, col=2)
    fig.show()
models = {}
evaluate_model("cont.", model, phi_cont, models)
Root Mean Squared Error: 3.4140204828031737
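Since we imported mean_absolute_error at the top but have not used it yet, one optional extension (a sketch, not part of the tracked-model comparison) is to report the MAE alongside the RMSE:

# Mean absolute error of the same model, for comparison with the RMSE above.
Yhat = model.predict(phi_cont(data)).flatten()
print("Mean Absolute Error:", mean_absolute_error(data["mpg"].to_numpy(), Yhat))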