import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools
from mpl_toolkits.mplot3d import Axes3D
# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
Evaluate the Simple Linear Regression Model¶
We will first define some helper functions to calculate the regression line for us.
# Helper functions
def standard_units(x):
    return (x - np.mean(x)) / np.std(x)

def correlation(x, y):
    return np.mean(standard_units(x) * standard_units(y))

def slope(x, y):
    return correlation(x, y) * np.std(y) / np.std(x)

def intercept(x, y):
    return np.mean(y) - slope(x, y) * np.mean(x)

def fit_least_squares(x, y):
    theta_0 = intercept(x, y)
    theta_1 = slope(x, y)
    return theta_0, theta_1

def predict(x, theta_0, theta_1):
    return theta_0 + theta_1 * x

def compute_mse(y, yhat):
    return np.mean((y - yhat) ** 2)
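As a quick sanity check, the slope and intercept from these helpers should agree with np.polyfit. Here is a minimal sketch on made-up data (x_demo and y_demo are illustrative names, not part of the lecture datasets).
# Sanity check (illustrative): the helpers should agree with np.polyfit.
rng = np.random.default_rng(42)
x_demo = rng.uniform(0, 10, size=50)
y_demo = 2.0 + 0.5 * x_demo + rng.normal(0, 1, size=50)

theta_0_demo, theta_1_demo = fit_least_squares(x_demo, y_demo)
slope_np, intercept_np = np.polyfit(x_demo, y_demo, deg=1)  # coefficients, highest degree first

print(f"helpers: theta_0 = {theta_0_demo:.4f}, theta_1 = {theta_1_demo:.4f}")
print(f"polyfit: theta_0 = {intercept_np:.4f}, theta_1 = {slope_np:.4f}")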
plt.style.use('default') # Revert style to default mpl
NO_VIZ, RESID, RESID_SCATTER = range(3)
def least_squares_evaluation(x, y, visualize=NO_VIZ):
    # Statistics
    print(f"x_mean : {np.mean(x):.2f}, y_mean : {np.mean(y):.2f}")
    print(f"x_stdev: {np.std(x):.2f}, y_stdev: {np.std(y):.2f}")
    print(f"r = Correlation(x, y): {correlation(x, y):.3f}")

    # Performance metrics
    ahat, bhat = fit_least_squares(x, y)
    yhat = predict(x, ahat, bhat)
    print(f"theta_0: {ahat:.2f}, theta_1: {bhat:.2f}")
    print(f"RMSE: {np.sqrt(compute_mse(y, yhat)):.3f}")

    # Visualization
    fig, ax_resid = None, None
    if visualize == RESID_SCATTER:
        fig, axs = plt.subplots(1, 2, figsize=(8, 3))
        axs[0].scatter(x, y)
        axs[0].plot(x, yhat)
        axs[0].set_title("LS fit")
        ax_resid = axs[1]
    elif visualize == RESID:
        fig = plt.figure(figsize=(4, 3))
        ax_resid = plt.gca()

    if ax_resid is not None:
        ax_resid.scatter(x, y - yhat, color='red')
        ax_resid.plot([4, 14], [0, 0], color='black')
        ax_resid.set_title("Residuals")

    return fig
Let's look at four different datasets. Without visualizing the data, let's try fitting the simple linear regression model to these four datasets!
# Load in four different datasets: I, II, III, IV
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]
anscombe = {
    'I': pd.DataFrame(list(zip(x, y1)), columns=['x', 'y']),
    'II': pd.DataFrame(list(zip(x, y2)), columns=['x', 'y']),
    'III': pd.DataFrame(list(zip(x, y3)), columns=['x', 'y']),
    'IV': pd.DataFrame(list(zip(x4, y4)), columns=['x', 'y'])
}

for dataset in ['I', 'II', 'III', 'IV']:
    ans = anscombe[dataset]
    x, y = ans["x"], ans["y"]
    ahat, bhat = fit_least_squares(x, y)
    print(f"Dataset {dataset}: theta_0: {ahat:.2f}, theta_1: {bhat:.2f}")
Dataset I: theta_0: 3.00, theta_1: 0.50
Dataset II: theta_0: 3.00, theta_1: 0.50
Dataset III: theta_0: 3.00, theta_1: 0.50
Dataset IV: theta_0: 3.00, theta_1: 0.50
Looks like all four datasets have exactly the same fit.
In fact, all four of them have the same $\bar x$, $\bar y$, $\sigma_x$, $\sigma_y$, correlation $r$, and RMSE! If we only look at these statistics, we will probably be inclined to say that these datasets are similar.
for dataset in ['I', 'II', 'III', 'IV']:
    print(f">>> Dataset {dataset}:")
    ans = anscombe[dataset]
    fig = least_squares_evaluation(ans['x'], ans['y'], visualize=NO_VIZ)
    print()
    print()
>>> Dataset I:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset II:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset III:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118

>>> Dataset IV:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.817
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118
However, it is only when we visualize the datasets that we realize only one of these four sets of data makes sense to model using SLR.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
for i, dataset in enumerate(['I', 'II', 'III', 'IV']):
    ans = anscombe[dataset]
    axs[i // 2, i % 2].scatter(ans['x'], ans['y'])
    axs[i // 2, i % 2].set_title(f"Dataset {dataset}")
plt.show()
Checking the residuals will also shed light on the differences among these four datasets.
If you remember from Data 8, "the residual plot of a good regression shows no pattern".
for dataset in ['I', 'II', 'III', 'IV']:
    print(f">>> Dataset {dataset}:")
    ans = anscombe[dataset]
    fig = least_squares_evaluation(ans['x'], ans['y'], visualize=RESID_SCATTER)
    plt.show(fig)
    print()
    print()
>>> Dataset I:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset II:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset III:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118

>>> Dataset IV:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.817
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118
The takeaway here is that you should always visualize your datasets before fitting any models to them!
Dugongs Part 1: Comparing Two Different Models, Both Fit with MSE¶
dugongs = pd.read_csv("data/dugongs.csv")
data_constant = dugongs["Age"]
data_linear = dugongs[["Length", "Age"]]
Loss Surfaces¶
Compute the constant model's loss surface. As a reminder, the average loss of the constant model is
$$ \Large \hat{R}(\theta_0) = \frac{1}{n}\sum_{i=1}^n (y_i - \theta_0)^2 $$
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
def mse_constant(theta, data):
    return np.mean(np.array([(y_obs - theta) ** 2 for y_obs in data]), axis=0)
thetas = np.linspace(-20, 42, 1000)
l2_loss_thetas = mse_constant(thetas, data_constant)
plt.plot(thetas, l2_loss_thetas)
plt.xlabel(r'$\theta_0$')
plt.ylabel(r'MSE')
# Optimal point
thetahat = np.mean(data_constant)
plt.scatter([thetahat], [mse_constant(thetahat, data_constant)], s=50, label = r"$\hat{\theta}_0$")
plt.legend()
# plt.savefig('mse_constant_loss.png', bbox_inches = 'tight');
plt.show()
Compute the 3D loss surface. As a reminder, the average loss for the SLR model is
$$ \Large \hat{R}(\theta_0, \theta_1) = \frac{1}{n}\sum_{i=1}^n (y_i - (\theta_0 + \theta_1 x_i))^2 $$
# Just run this cell
import itertools
import plotly.graph_objects as go
def mse_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([(y - (theta_0 + theta_1 * x)) ** 2 for x, y in zip(data_x, data_y)]), axis=0)

theta_0_values = np.linspace(-80, 20, 80)
theta_1_values = np.linspace(-10, 30, 80)
mse_values = [mse_linear(theta_0, theta_1, data_linear)
              for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mse_values = np.reshape(mse_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
data_x, data_y = data_linear.iloc[:,0], data_linear.iloc[:,1]
theta_1_hat = np.corrcoef(data_x, data_y)[0, 1] * np.std(data_y) / np.std(data_x)
theta_0_hat = np.mean(data_y) - theta_1_hat * np.mean(data_x)
fig_lin = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mse_values, name='z=MSE')])
fig_lin.add_trace(go.Scatter3d(x=[theta_0_hat], y=[theta_1_hat],
                               z=[mse_linear(theta_0_hat, theta_1_hat, data_linear)],
                               mode='markers', marker=dict(size=12), name='theta hat'))
fig_lin.update_layout(
    autosize=False,
    scene=dict(
        xaxis_title='theta_0',
        yaxis_title='theta_1',
        zaxis_title='z=MSE'),
    width=500,
    margin=dict(r=20, b=10, l=10, t=10))
fig_lin.show()
RMSE¶
print("Least Squares Constant Model RMSE:",
np.sqrt(mse_constant(thetahat, data_constant)))
print("Least Squares Linear Model RMSE: ",
np.sqrt(mse_linear(theta_0_hat, theta_1_hat, data_linear)))
Least Squares Constant Model RMSE: 7.722422059764398
Least Squares Linear Model RMSE:   4.311332860288157
Interpret the RMSE (Root Mean Square Error):
- Constant model error is HIGHER than linear model error
- Linear model is BETTER than constant model (at least for this metric)
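One way to connect these two numbers (a sketch using a standard identity, not anything new here): for the least-squares SLR fit, the minimum MSE is $\sigma_y^2(1 - r^2)$, while the constant model's minimum MSE is $\sigma_y^2$. Since $r \approx 0.83$ for the dugongs data, the linear model removes roughly $r^2 \approx 69\%$ of the constant model's squared error.
# Numerical check of MSE_SLR = var(y) * (1 - r^2) on the dugongs data (illustrative).
r = np.corrcoef(data_linear["Length"], data_linear["Age"])[0, 1]
var_y = np.var(data_linear["Age"])   # population variance, matching np.std above
print("Constant model MSE :", mse_constant(np.mean(data_linear["Age"]), data_linear["Age"]))
print("Linear model MSE   :", mse_linear(theta_0_hat, theta_1_hat, data_linear))
print("var(y) * (1 - r^2) :", var_y * (1 - r**2))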
Predictions¶
This plotting code is left for your reference. We'll mainly look at the figures in lecture.
yobs = data_linear["Age"] # The true observations y
xs = data_linear["Length"] # Needed for linear predictions
n = len(yobs) # Predictions
yhats_constant = [thetahat for i in range(n)] # Not used, but food for thought
yhats_linear = [theta_0_hat + theta_1_hat * x for x in xs]
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2) ;
plt.axvline(thetahat, color='red', lw=4, label=r"$\hat{\theta}_0$");
plt.legend()
plt.yticks([])
# plt.savefig('dugong_rug.png', bbox_inches = 'tight');
plt.show()
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=4)
# plt.savefig('dugong_line.png', bbox_inches = 'tight');
plt.show()
boba = np.array([20, 21, 22, 29, 33])
Let's plot the $L_1$ loss for a single observation. Since the first observation is $y_1 = 20$, we'll be plotting
$$ \Large L_1(20, \theta_0) = |20 - \theta_0| $$
thetas = np.linspace(10, 30, 1000)
l1_loss_single_obvs = np.abs(boba[0] - thetas)
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
plt.plot(thetas, l1_loss_single_obvs, 'g--', );
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'L1 Loss for $y = 20$');
# plt.savefig('l1_loss_single_obs.png', bbox_inches = 'tight');
def mae_constant(theta_0, data):
    return np.mean(np.array([np.abs(y_obs - theta_0) for y_obs in data]), axis=0)
thetas = np.linspace(10, 40, 1000)
l1_loss_thetas = mae_constant(thetas, boba)
plt.plot(thetas, l1_loss_thetas, color = 'green', lw=3);
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'MAE across all data');
# plt.savefig('l1_loss_average.png', bbox_inches = 'tight');
Median Minimizes MAE for the Constant Model¶
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
yobs = boba
thetahat = np.median(yobs)
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2) ;
plt.scatter([thetahat], [-.1], color='green', lw=4, label=r"$\hat{\theta}_0$");
plt.xlabel("Sales")
plt.legend()
plt.yticks([])
# plt.savefig('boba_rug.png', bbox_inches = 'tight');
plt.show()
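As a quick numerical check, we can scan a grid of candidate $\theta_0$ values and confirm that none achieves a smaller MAE than the median (theta_grid below is just an illustrative grid).
# Grid check (illustrative): the median attains the minimum MAE over this grid.
theta_grid = np.linspace(10, 40, 10001)
mae_grid = mae_constant(theta_grid, boba)
print("Grid minimizer  :", theta_grid[np.argmin(mae_grid)])
print("np.median(boba) :", np.median(boba))
print("MAE at median   :", mae_constant(np.median(boba), boba))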
Two Constant Models, Fit to Different Losses¶
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
nplots = 2
def plot_losses(data, title=None, theta_range=(10, 40)):
    thetas = np.linspace(theta_range[0], theta_range[1], 1000)
    l2_loss_thetas = mse_constant(thetas, data)
    thetahat_mse = np.mean(data)
    l1_loss_thetas = mae_constant(thetas, data)
    thetahat_mae = np.median(data)

    fig, axs = plt.subplots(1, nplots, figsize=(5 * 2 + 0.5, 3.5))

    axs[0].plot(thetas, l2_loss_thetas, lw=3)
    axs[0].scatter([thetahat_mse], [mse_constant(thetahat_mse, data)], s=100)
    axs[0].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mse:.1f}",
                    xy=(thetahat_mse, np.average(axs[0].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[0].set_xlabel(r'$\theta_0$')
    axs[0].set_ylabel(r'MSE')

    axs[1].plot(thetas, l1_loss_thetas, color='green', lw=3)
    axs[1].scatter([thetahat_mae], [mae_constant(thetahat_mae, data)], color='green', s=100)
    axs[1].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mae:.1f}",
                    xy=(thetahat_mae, np.average(axs[1].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[1].set_xlabel(r'$\theta_0$')
    axs[1].set_ylabel(r'MAE')

    if title:
        fig.suptitle(title)
    fig.tight_layout()
    return fig
fig = plot_losses(boba)
plt.figure(fig)
# plt.savefig('loss_compare.png', bbox_inches = 'tight');
plt.show()
More loss comparison: Outliers¶
boba_outlier = np.array([20, 21, 22, 29, 33, 1033])
fig = plot_losses(boba_outlier, theta_range=[-10, 300])
plt.figure(fig)
# plt.savefig('loss_outlier.png', bbox_inches = 'tight');
plt.show()
Uniqueness under Different Loss Functions¶
boba_even = np.array([20, 21, 22, 29, 33, 35])
fig = plot_losses(boba_even)
plt.figure(fig)
#plt.savefig('loss_unique.png', bbox_inches = 'tight');
plt.show()
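With an even number of points, the MAE minimizer is not unique: any $\theta_0$ between the two middle sorted values (22 and 29 for boba_even) attains the same minimum MAE, which is why the right-hand curve has a flat bottom. A small sketch checking this numerically:
# Illustrative check: MAE is constant between the two middle sorted values (22 and 29).
for candidate in [22, 24, 26.5, 29]:
    print(f"theta_0 = {candidate:5}, MAE = {mae_constant(candidate, boba_even):.3f}")
print("MAE just outside the interval (theta_0 = 21):",
      round(mae_constant(21, boba_even), 3))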
Bonus: MAE loss curve for SLR?¶
We saw earlier that the MSE (average $L_2$ loss) surface for simple linear regression is a smooth paraboloid in 3 dimensions. What does the MAE surface for SLR look like?
It's beyond the scope of this course, but for the curious, the cell below plots the mean absolute error (average $L_1$ loss) on the boba dataset for a simple linear regression model. I don't plot the minimum because I don't know if there's a closed-form solution (you could solve it numerically with techniques you learn in lab).
Since the boba dataset didn't have an input variable, I've arbitrarily used "temperature of that day" as the input. I made up the numbers (assume degrees Fahrenheit).
boba_df = pd.DataFrame({"sales": [20, 21, 22, 29, 33],
"temps": [40, 44, 55, 70, 85]},
columns=["temps", "sales"])
boba_df
| | temps | sales |
|---|---|---|
| 0 | 40 | 20 |
| 1 | 44 | 21 |
| 2 | 55 | 22 |
| 3 | 70 | 29 |
| 4 | 85 | 33 |
# Just run this cell
import itertools
import plotly.graph_objects as go
data_linear = boba_df
def mAe_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([np.abs(y - (theta_0 + theta_1 * x)) for x, y in zip(data_x, data_y)]), axis=0)

theta_0_values = np.linspace(-20, 20, 80)
theta_1_values = np.linspace(-50, 50, 80)
mAe_values = [mAe_linear(theta_0, theta_1, data_linear)
              for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mAe_values = np.reshape(mAe_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
fig_lin_mae = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mAe_values, name='z=MAE')])
fig_lin_mae.update_layout(
    autosize=False,
    scene=dict(
        xaxis_title='x=theta_0',
        yaxis_title='y=theta_1',
        zaxis_title='z=MAE'),
    width=500,
    margin=dict(r=20, b=10, l=10, t=10))
fig_lin_mae.show()
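As noted above, the MAE-minimizing parameters may have no closed form, but they can be found numerically. Here is a minimal sketch using scipy.optimize.minimize (this assumes SciPy is available; mae_objective is just an illustrative wrapper, not a helper from lecture).
# Numerically minimize the SLR MAE on the boba data (illustrative sketch).
from scipy.optimize import minimize

def mae_objective(theta):
    return mAe_linear(theta[0], theta[1], boba_df)

# Start from the least-squares fit as a reasonable initial guess.
x0 = np.polyfit(boba_df["temps"], boba_df["sales"], deg=1)[::-1]  # [intercept, slope]
result = minimize(mae_objective, x0, method="Nelder-Mead")
print("Approximate MAE-minimizing (theta_0, theta_1):", result.x)
print("Minimum MAE:", result.fun)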
Now observe the difference between the MAE loss and the MSE loss for one application.
# Just run this cell
def mSe_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([(y - (theta_0 + theta_1 * x)) ** 2 for x, y in zip(data_x, data_y)]), axis=0)

theta_0_values = np.linspace(-20, 20, 80)
theta_1_values = np.linspace(-50, 50, 80)
mSe_values = [mSe_linear(theta_0, theta_1, data_linear)
              for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mSe_values = np.reshape(mSe_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
fig_lin_mse = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mSe_values, name='z=MSE')])
fig_lin_mse.update_layout(
    autosize=False,
    scene=dict(
        xaxis_title='x=theta_0',
        yaxis_title='y=theta_1',
        zaxis_title='z=MSE'),
    width=500,
    margin=dict(r=20, b=10, l=10, t=10))
fig_lin_mse.show()
dugongs = pd.read_csv("data/dugongs.csv")
dugongs.head()
| | Length | Age |
|---|---|---|
| 0 | 1.77 | 1.5 |
| 1 | 1.80 | 1.0 |
| 2 | 1.85 | 1.5 |
| 3 | 1.87 | 1.5 |
| 4 | 2.02 | 2.5 |
yobs = dugongs["Age"] # The true observations y
xs = dugongs["Length"] # Needed for linear predictions
theta_1_hat = np.corrcoef(xs, yobs)[0, 1] * np.std(yobs) / np.std(xs)
theta_0_hat = np.mean(yobs) - theta_1_hat * np.mean(xs)
yhats_linear = theta_0_hat + theta_1_hat * xs
np.corrcoef(xs, yobs)[0, 1]
0.8296474554905716
The correlation coefficient is pretty high...but there's an issue.
Let's first plot the dugong linear fit again. At first glance, it doesn't look too bad.
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=4)
# plt.savefig('dugong_line.png', bbox_inches = 'tight');
plt.show()
Let's further inspect by plotting residuals.
residuals = yobs - yhats_linear
sns.scatterplot(x=xs, y=residuals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$y - \hat{y}$")
# plt.savefig('dugong_residuals.png', bbox_inches = 'tight');
plt.show()
Log transformation of y¶
We could instead fit a linear model that relates $z = \log(y)$ to $x$:
$$ \Large \hat{z} = \theta_0 + \theta_1 x $$
dugongs["Log(Age)"] = np.log(dugongs["Age"])
zobs = dugongs["Log(Age)"] # The LOG of true observations y
xs = dugongs["Length"] # Needed for linear predictions
ztheta_1_hat = np.corrcoef(xs, zobs)[0, 1] * np.std(zobs) / np.std(xs)
ztheta_0_hat = np.mean(zobs) - ztheta_1_hat * np.mean(xs)
zhats_linear = ztheta_0_hat + ztheta_1_hat * xs
sns.scatterplot(x=xs, y=zobs, color='green')
plt.plot(xs, zhats_linear, lw=4)
# plt.savefig('dugong_zline.png', bbox_inches = 'tight');
plt.show()
zresiduals = zobs - zhats_linear
sns.scatterplot(x=xs, y=zresiduals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$z - \hat{z}$")
# plt.savefig('dugong_zresiduals.png', bbox_inches = 'tight');
plt.show()
Map back to the original coordinates¶
$$ \begin{align*} \hat{z} &= \theta_0 + \theta_1 x\\ \widehat{\log(y)} &= \theta_0 + \theta_1 x\\ e^{\widehat{\log(y)}} &= e^{\theta_0 + \theta_1 x}\\ \hat{y} &= e^{\theta_0 + \theta_1 x} \end{align*} $$
ypred = np.exp(zhats_linear)
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, ypred, color='green', lw=4)
# plt.savefig('dugong_curve.png', bbox_inches = 'tight');
plt.show()
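As a follow-up check (illustrative, in the same spirit as the RMSE comparison earlier), we can compare the straight-line fit and the exponentiated log-fit in the original units of Age.
# Compare the two fits' errors in the original y units (Age).
rmse_linear = np.sqrt(np.mean((yobs - yhats_linear) ** 2))
rmse_logfit = np.sqrt(np.mean((yobs - ypred) ** 2))
print(f"RMSE, linear fit on y       : {rmse_linear:.3f}")
print(f"RMSE, exponentiated log fit : {rmse_logfit:.3f}")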