One of the key challenges with feature engineering is that you can "over-engineer" your features and produce a model that fits the training data well but performs poorly when making predictions on new data. This is typically referred to as overfitting, and it is the focus of the next set of lectures.
In this notebook we provide a very simple illustration of overfitting. As you will see, and soon experience yourself, it is very easy to overfit to your data, and avoiding this will become the key challenge in designing good models.
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
For this problem we will use a very simple toy dataset to help illustrate where things can fail.
Notice that there are only 8 data points in this dataset. Small datasets are especially prone to overfitting.
data = pd.read_csv("data/train.csv")
data
|   | X | Y |
|---|---|---|
| 0 | -2.647582 | 9.140452 |
| 1 | -2.051547 | 5.336237 |
| 2 | -1.810665 | 7.195181 |
| 3 | -1.312076 | 6.095358 |
| 4 | -0.789591 | 0.721947 |
| 5 | -0.660964 | 2.177008 |
| 6 | 0.806148 | 4.367994 |
| 7 | 1.054880 | 5.852682 |
colors = px.colors.qualitative.Plotly
data_scatter = go.Scatter(x=data["X"], y=data["Y"], name="data", mode="markers",
marker=dict(color=colors[0]))
go.Figure([data_scatter])
model = LinearRegression()
model.fit(data[["X"]], data["Y"])
LinearRegression()
As before, we define a helper routine to track our progress in model design.
def evaluate_model(name, model, phi, models=dict()):
# run the prediction function and compute the RMSE
Yhat = model.predict(phi(data[["X"]].to_numpy()) ).flatten()
Y = data['Y'].to_numpy()
rmse = np.sqrt(mean_squared_error(Y, Yhat))
print("Root Mean Squared Error:", rmse)
# Save the model and rmse to the collection of models
models[name] = dict(model=model, phi=phi, rmse=rmse)
# Generate diagnostic and model comparison plots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(data_scatter, row=1, col=1)
xgrid = np.expand_dims(np.linspace(-3, 1.4, 100),1)
for (i,m) in enumerate(models):
fig.add_trace(go.Scatter(x=xgrid.flatten(),
y=models[m]["model"].predict(models[m]["phi"](xgrid)).flatten(),
marker=dict(color=colors[i+2]),
name=m), row=1, col=1)
fig.update_xaxes(title = "X", row=1, col=1)
fig.update_yaxes(title = "Y", range=[0,10], row=1, col=1)
fig.add_trace(go.Bar(x=list(models.keys()),
y=[models[k]['rmse'] for k in models],
marker=dict(color=colors[0]), showlegend=False), row=1, col=2)
fig.update_yaxes(title = "RMSE", row=1, col=2)
fig.show()
models = {}
evaluate_model("Linear in X", model, lambda x: x, models )
Root Mean Squared Error: 2.272965710209297
How could we improve the model fit?
def phi_polynomials(x, p=3):
    # Build the polynomial feature matrix [x, x^2, ..., x^p] by stacking columns.
    return np.hstack([x**i for i in range(1, p+1)])
phi_polynomials(np.array([[1],[2],[3],[4],[5]]))
array([[  1,   1,   1],
       [  2,   4,   8],
       [  3,   9,  27],
       [  4,  16,  64],
       [  5,  25, 125]])
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=2), data[["Y"]])
evaluate_model("Quadratic", model, lambda x: phi_polynomials(x, p=2), models)
Root Mean Squared Error: 1.3646273702703906
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=3), data[["Y"]])
evaluate_model("Cubic", model, lambda x: phi_polynomials(x, p=3), models)
Root Mean Squared Error: 1.1528916542943797
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=4), data[["Y"]])
evaluate_model("Quartic", model, lambda x: phi_polynomials(x, p=4), models)
Root Mean Squared Error: 1.1445074375807343
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=5), data[["Y"]])
evaluate_model("Quintic", model, lambda x: phi_polynomials(x, p=5), models)
Root Mean Squared Error: 0.7148394938839464
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=8), data[["Y"]])
evaluate_model("Octic", model, lambda x: phi_polynomials(x, p=8), models)
Root Mean Squared Error: 6.102130856184464e-12
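Training error has collapsed to (numerically) zero. This should not be a surprise: the octic model has 9 free parameters (8 polynomial features plus an intercept) but there are only 8 training points, so the model can interpolate the training data exactly. The following quick check is not part of the lecture code; it simply re-enters the training values from the table above and verifies that even a degree-7 polynomial (8 coefficients) already interpolates all 8 points.
# Sketch (not part of the lecture code): with 8 points, a degree-7 polynomial
# (8 coefficients) interpolates the training data exactly.
x = np.array([-2.647582, -2.051547, -1.810665, -1.312076,
              -0.789591, -0.660964,  0.806148,  1.054880])
y = np.array([ 9.140452,  5.336237,  7.195181,  6.095358,
               0.721947,  2.177008,  4.367994,  5.852682])
coeffs = np.polyfit(x, y, deg=7)                  # 8 coefficients for 8 points
print(np.max(np.abs(y - np.polyval(coeffs, x))))  # ~0, up to floating point error
In general, a linear model with at least as many (linearly independent) features as data points can typically drive the training error to zero without having learned anything about how the data behave away from those points.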
What happens if we get more data from the world?
test_data = pd.read_csv("data/test.csv")
test_data
|   | X | Y |
|---|---|---|
| 0 | -2.972389 | 12.829875 |
| 1 | -2.897078 | 11.633153 |
| 2 | -2.872904 | 11.871463 |
| 3 | -2.828057 | 8.954620 |
| 4 | -2.773864 | 10.577305 |
| ... | ... | ... |
| 95 | 1.753572 | 9.678197 |
| 96 | 1.828160 | 11.248137 |
| 97 | 1.847923 | 11.934869 |
| 98 | 1.849549 | 11.872792 |
| 99 | 1.934435 | 13.345564 |
100 rows × 2 columns
Plotting this new data (in red) on top of the old data, we see that while the more complex degree-8 polynomial model fit the original data perfectly, it does not fit this new data at all.
test_data_scatter = go.Scatter(name = "Test Data", x = test_data['X'], y = test_data['Y'],
mode = 'markers', marker=dict(symbol="cross", color=colors[1]))
go.Figure([data_scatter, test_data_scatter])
What happens if we plot this data on top of our previous picture?
def better_evaluate_model(name, model, phi, models=dict()):
# Save the model and rmse to the collection of models
models[name] = dict(model=model, phi=phi)
# Generate diagnostic and model comparison plots
fig = make_subplots(rows=1, cols=2)
fig.add_trace(data_scatter, row=1, col=1)
fig.add_trace(test_data_scatter, row=1, col=1)
xgrid = np.expand_dims(np.linspace(-4, 3, 100),1)
for (i,m) in enumerate(models):
model = models[m]["model"]
phi = models[m]["phi"]
# run the prediction function and compute the RMSE
fig.add_trace(go.Scatter(x=xgrid.flatten(),
y=model.predict(phi(xgrid)).flatten(),
marker=dict(color=colors[i+2]),
name=m), row=1, col=1)
if "train_rmse" not in models[m]:
# Evaluate train and test
Yhat_train = model.predict(phi(data[["X"]].to_numpy()) ).flatten()
Y_train = data['Y'].to_numpy()
models[m]['train_rmse'] = np.sqrt(mean_squared_error(Y_train, Yhat_train))
Yhat_test = model.predict(phi(test_data[["X"]].to_numpy()) ).flatten()
Y_test = test_data['Y'].to_numpy()
models[m]['test_rmse'] = np.sqrt(mean_squared_error(Y_test, Yhat_test))
fig.update_xaxes(title = "X", row=1, col=1)
fig.update_yaxes(title = "Y", range=[0,10], row=1, col=1)
fig.add_trace(go.Bar(x=list(models.keys()),
y=[models[k]['train_rmse'] for k in models],
marker=dict(color=colors[0]), name="Training Error"), row=1, col=2)
fig.add_trace(go.Bar(x=list(models.keys()),
y=[models[k]['test_rmse'] for k in models],
marker=dict(color=colors[1]), name="Testing Error"), row=1, col=2)
fig.update_yaxes(title = "RMSE", row=1, col=2)
fig.show()
The following plots the training and test error. Try zooming in to see what happens to training error and testing error as we increase the number of features in our model.
model = LinearRegression()
model.fit(phi_polynomials(data[["X"]].to_numpy(), p=8), data[["Y"]])
better_evaluate_model("Octic", model, lambda x: phi_polynomials(x, p=8), models)
In the rest of this lecture we will dig into the ideas that drive this behavior.
As we increase the expressiveness of our model, we begin to overfit to the variability in our training data. That is, we are learning patterns that do not generalize beyond our training dataset.
Overfitting is a key challenge in machine learning and statistical inference. At its core is a fundamental trade-off between bias and variance: the desire to explain the training data while remaining robust to variation in the training data.
We will study the bias-variance trade-off in today's lecture.
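As a preview (stated informally here; we will make it precise in lecture), suppose the data are generated as $Y = f(x) + \epsilon$ with $\mathbb{E}[\epsilon] = 0$ and $\text{Var}(\epsilon) = \sigma^2$, and let $\hat{f}$ denote a model fit to a random training sample. Then the expected squared prediction error at a point $x$ decomposes as

$$
\mathbb{E}\big[(Y - \hat{f}(x))^2\big]
= \underbrace{\big(f(x) - \mathbb{E}[\hat{f}(x)]\big)^2}_{\text{(bias)}^2}
+ \underbrace{\text{Var}\big(\hat{f}(x)\big)}_{\text{variance}}
+ \underbrace{\sigma^2}_{\text{noise}}.
$$

More expressive models (like the octic polynomial above) tend to have low bias but high variance; simpler models (like the line) tend to have high bias but low variance.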