by Lisa Yan
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
plt.style.use('fivethirtyeight')
%matplotlib inline
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
use_good = False
lw_df = pd.read_csv("data/little_women.csv")
linear_fits = [
# a, b, label, color
( 4745, 87, "Least Squares Fit", colors[0]),
(-4000, 150, "Suboptimal Fit", colors[2]),
#(50000, -50, "Bad Fit", colors[3])
]
# Create one subplot per fit, side by side (i.e., a 1 x 2 grid)
nplots = len(linear_fits)
fig, axs = plt.subplots(1, nplots, figsize=(4*nplots, 3.5))
for i in range(nplots):
    ax = axs[i]
    a, b, label, color = linear_fits[i]

    # Plot data and line
    sns.scatterplot(data=lw_df, x="Periods", y="Characters", ax=ax)
    xlims = np.array([50, 450])
    ax.plot(xlims, a + b * xlims, lw=2, label=label, c=color)

    # Plot residuals for some points
    sample = [[131, 14431], [231, 20558], [392, 40935], [157, 23524]]
    for x, y in sample:
        yhat = a + b * x
        ax.plot([x, x], [y, yhat], color='r', lw=2)

    ax.set_title(label)
    ax.set_ylim((9000, 55000))
fig.tight_layout()
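The red segments are the residuals $y_i - (a + b\,x_i)$ for the sampled points; the least squares fit is the line that minimizes the average squared residual over all points:

$$\text{MSE}(a, b) = \frac{1}{n}\sum_{i=1}^{n}\big(y_i - (a + b\,x_i)\big)^2$$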
dugongs = pd.read_csv("data/dugongs.csv")
data_constant = dugongs["Age"]
data_linear = dugongs[["Length", "Age"]]
Compute the constant model's loss surface.
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
def mse_constant(theta, data):
    return np.mean(np.array([(y_obs - theta) ** 2 for y_obs in data]), axis=0)
thetas = np.linspace(-20, 42, 1000)
l2_loss_thetas = mse_constant(thetas, data_constant)
plt.plot(thetas, l2_loss_thetas)
plt.xlabel(r'$\hat{\theta}_0$')
plt.ylabel(r'MSE')
# Optimal point
thetahat = np.mean(data_constant)
plt.scatter([thetahat], [mse_constant(thetahat, data_constant)], s=50)
# plt.savefig('mse_constant_loss.png', bbox_inches = 'tight');
plt.show()
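The plotted optimal point sits at the mean of the data; setting the derivative of the MSE to zero shows why:

$$\frac{d}{d\theta_0}\,\frac{1}{n}\sum_{i=1}^{n}(y_i - \theta_0)^2 = -\frac{2}{n}\sum_{i=1}^{n}(y_i - \theta_0) = 0 \quad\Longrightarrow\quad \hat{\theta}_0 = \frac{1}{n}\sum_{i=1}^{n} y_i = \bar{y}$$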
Compute the 3D loss surface for the simple linear model.
# Just run this cell
import itertools
import plotly.graph_objects as go
def mse_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([(y - (theta_0 + theta_1 * x)) ** 2
                             for x, y in zip(data_x, data_y)]), axis=0)

theta_0_values = np.linspace(-80, 20, 80)
theta_1_values = np.linspace(-10, 30, 80)
mse_values = [mse_linear(theta_0, theta_1, data_linear)
              for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mse_values = np.reshape(mse_values, (len(theta_0_values), len(theta_1_values)), order='F')

# Optimal point
data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
theta_1_hat = np.corrcoef(data_x, data_y)[0, 1] * np.std(data_y) / np.std(data_x)
theta_0_hat = np.mean(data_y) - theta_1_hat * np.mean(data_x)

fig_lin = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mse_values, name='z=loss')])
fig_lin.add_trace(go.Scatter3d(x=[theta_0_hat], y=[theta_1_hat],
                               z=[mse_linear(theta_0_hat, theta_1_hat, data_linear)],
                               mode='markers', marker=dict(size=12), name='theta hat'))
fig_lin.update_layout(
    title=r'MSE for different values of $\theta_0, \theta_1$',
    autosize=False,
    scene=dict(
        xaxis_title='theta_0',
        yaxis_title='theta_1',
        zaxis_title='z=MSE'),
    width=500,
    margin=dict(r=20, b=10, l=10, t=10))
fig_lin.show()
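The optimal point marked on the surface comes from the closed-form least squares solution used in the code above, where $r$ is the correlation between $x$ and $y$:

$$\hat{\theta}_1 = r\,\frac{\sigma_y}{\sigma_x}, \qquad \hat{\theta}_0 = \bar{y} - \hat{\theta}_1\,\bar{x}$$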
print("Least Squares Constant Model RMSE:",
np.sqrt(mse_constant(thetahat, data_constant)))
print("Least Squares Linear Model RMSE: ",
np.sqrt(mse_linear(theta_0_hat, theta_1_hat, data_linear)))
Least Squares Constant Model RMSE: 7.722422059764398
Least Squares Linear Model RMSE:  4.311332860288157
Interpret the RMSE (Root Mean Square Error): the linear model's RMSE is lower, so on average its predictions fall closer to the observed ages than the constant model's.
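For reference, the RMSE is the square root of the MSE, which puts the error back in the same units as $y$:

$$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$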
This plotting code is left for your reference. We'll mainly look at the figures in lecture.
yobs = data_linear["Age"]     # The true observations y
xs = data_linear["Length"]    # Needed for linear predictions
n = len(yobs)

# Predictions
yhats_constant = [thetahat for i in range(n)]  # Not used, but food for thought
yhats_linear = [theta_0_hat + theta_1_hat * x for x in xs]
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2);
plt.axvline(thetahat, color='red', lw=4, label=r"$\hat{\theta}_0$");
plt.legend()
plt.yticks([])
# plt.savefig('dugong_rug.png', bbox_inches = 'tight');
plt.show()
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=4)
# plt.savefig('dugong_line.png', bbox_inches = 'tight');
plt.show()
boba = np.array([20, 21, 22, 29, 33])
Let's plot the $L_1$ loss for a single observation. Since $y_1 = 20$, we'll be plotting

$$L_1(20, \theta_0) = |20 - \theta_0|$$

thetas = np.linspace(10, 30, 1000)
l1_loss_single_obs = np.abs(boba[0] - thetas)
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
plt.plot(thetas, l1_loss_single_obs, 'g--');
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'L1 Loss for $y = 20$');
# plt.savefig('l1_loss_single_obs.png', bbox_inches = 'tight');
def mae_constant(theta_0, data):
    return np.mean(np.array([np.abs(y_obs - theta_0) for y_obs in data]), axis=0)
thetas = np.linspace(10, 40, 1000)
l1_loss_thetas = mae_constant(thetas, boba)
plt.plot(thetas, l1_loss_thetas, color='green', lw=3);
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'MAE across all data');
# plt.savefig('l1_loss_average.png', bbox_inches = 'tight');
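Averaging the $L_1$ loss across all observations gives the mean absolute error for the constant model:

$$\text{MAE}(\theta_0) = \frac{1}{n}\sum_{i=1}^{n}\left|y_i - \theta_0\right|$$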
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
yobs = boba
thetahat = np.median(yobs)
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2);
plt.scatter([thetahat], [-.1], color='green', lw=4, label=r"$\hat{\theta}_0$");
plt.xlabel("Sales")
plt.legend()
plt.yticks([])
# plt.savefig('boba_rug.png', bbox_inches = 'tight');
plt.show()
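The green point sits at the median. Away from the data points, the slope of the MAE counts how many observations lie below versus above $\theta_0$:

$$\frac{d}{d\theta_0}\,\text{MAE}(\theta_0) = \frac{1}{n}\Big(\#\{y_i < \theta_0\} - \#\{y_i > \theta_0\}\Big), \qquad \theta_0 \notin \{y_1, \dots, y_n\}$$

so the loss stops decreasing exactly when equally many observations fall on either side, i.e., at the median.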
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
def plot_losses(data, title=None, theta_range=(10, 40)):
    thetas = np.linspace(theta_range[0], theta_range[1], 1000)
    l2_loss_thetas = mse_constant(thetas, data)
    thetahat_mse = np.mean(data)
    l1_loss_thetas = mae_constant(thetas, data)
    thetahat_mae = np.median(data)

    # Two panels: MSE on the left, MAE on the right
    fig, axs = plt.subplots(1, 2, figsize=(5*2+0.5, 3.5))
    axs[0].plot(thetas, l2_loss_thetas, lw=3)
    axs[0].scatter([thetahat_mse], [mse_constant(thetahat_mse, data)], s=100)
    axs[0].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mse:.1f}",
                    xy=(thetahat_mse, np.average(axs[0].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[0].set_xlabel(r'$\theta_0$')
    axs[0].set_ylabel(r'MSE')

    axs[1].plot(thetas, l1_loss_thetas, color='green', lw=3)
    axs[1].scatter([thetahat_mae], [mae_constant(thetahat_mae, data)], color='green', s=100)
    axs[1].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mae:.1f}",
                    xy=(thetahat_mae, np.average(axs[1].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[1].set_xlabel(r'$\theta_0$')
    axs[1].set_ylabel(r'MAE')

    if title:
        fig.suptitle(title)
    fig.tight_layout()
    return fig
fig = plot_losses(boba)
plt.figure(fig)
# plt.savefig('loss_compare.png', bbox_inches = 'tight');
plt.show()
boba_outlier = np.array([20, 21, 22, 29, 33, 1033])
fig = plot_losses(boba_outlier, theta_range=[-10, 300])
plt.figure(fig)
# plt.savefig('loss_outlier.png', bbox_inches = 'tight');
plt.show()
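Note how the single outlier drags the MSE minimizer (the mean) far to the right, while the MAE minimizer (the median) barely moves: the $L_1$ loss is much more robust to outliers.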
boba_even = np.array([20, 21, 22, 29, 33, 35])
fig = plot_losses(boba_even)
plt.figure(fig)
#plt.savefig('loss_unique.png', bbox_inches = 'tight');
plt.show()
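With an even number of observations, the MAE is flat between the two middle data points, so any $\hat{\theta}_0$ in that interval is optimal; the MSE minimizer, by contrast, is always unique.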
Josh asked after lecture: The MSE (average $L_2$) loss surface for simple linear regression is smooth and parabolic in 3 dimensions. What does the MAE surface for SLR look like?
It's beyond the scope of this course, but for the curious, the cell below plots the mean absolute error (average $L_1$ loss) on the boba dataset for a simple linear regression model. I don't plot the minimum because I don't know if there's a closed-form solution (you could solve it numerically with techniques you learn in lab).
Since the boba dataset didn't have an input variable, I've arbitrarily used the temperature that day as input. I made up the numbers (assume degrees Fahrenheit).
boba_df = pd.DataFrame({"sales": [20, 21, 22, 29, 33],
"temps": [40, 44, 55, 70, 85]},
columns=["temps", "sales"])
boba_df
|   | temps | sales |
|---|-------|-------|
| 0 | 40    | 20    |
| 1 | 44    | 21    |
| 2 | 55    | 22    |
| 3 | 70    | 29    |
| 4 | 85    | 33    |
# Just run this cell
import itertools
import plotly.graph_objects as go
data_linear = boba_df
def mae_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([np.abs(y - (theta_0 + theta_1 * x))
                             for x, y in zip(data_x, data_y)]), axis=0)

theta_0_values = np.linspace(-20, 20, 80)
theta_1_values = np.linspace(-50, 50, 80)
mae_values = [mae_linear(theta_0, theta_1, data_linear)
              for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mae_values = np.reshape(mae_values, (len(theta_0_values), len(theta_1_values)), order='F')

# No optimal point plotted (no known closed form; see note above)
fig_lin_mae = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mae_values, name='z=loss')])
fig_lin_mae.update_layout(
    title=r'MAE for different values of $\theta_0, \theta_1$',
    autosize=False,
    scene=dict(
        xaxis_title='x=theta_0',
        yaxis_title='y=theta_1',
        zaxis_title='z=MAE'),
    width=500,
    margin=dict(r=20, b=10, l=10, t=10))
fig_lin_mae.show()
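As noted above, I don't know of a closed-form minimizer, but we can approximate it numerically. A minimal sketch (assuming scipy is installed; the derivative-free Nelder-Mead method is my choice here, since the MAE surface has kinks):

from scipy.optimize import minimize

def mae_objective(theta):
    # theta = (theta_0, theta_1); reuses mae_linear on the boba data
    return mae_linear(theta[0], theta[1], boba_df)

# Derivative-free search from an arbitrary starting point
result = minimize(mae_objective, x0=[0.0, 0.0], method="Nelder-Mead")
print("Approximate MAE-minimizing (theta_0, theta_1):", result.x)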
# Helper functions
def standard_units(x):
    return (x - np.mean(x)) / np.std(x)

def correlation(x, y):
    return np.mean(standard_units(x) * standard_units(y))

def slope(x, y):
    return correlation(x, y) * np.std(y) / np.std(x)

def intercept(x, y):
    return np.mean(y) - slope(x, y) * np.mean(x)

def fit_least_squares(x, y):
    theta_0 = intercept(x, y)
    theta_1 = slope(x, y)
    return theta_0, theta_1

def predict(x, theta_0, theta_1):
    return theta_0 + theta_1 * x

def compute_mse(y, yhat):
    return np.mean((y - yhat) ** 2)
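As a quick sanity check (this example is mine, not part of the original notebook), the helpers reproduce the dugongs fit from earlier:

# Should match the theta_0_hat, theta_1_hat computed above
t0, t1 = fit_least_squares(dugongs["Length"], dugongs["Age"])
print(f"theta_0: {t0:.2f}, theta_1: {t1:.2f}")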
plt.style.use('default') # Revert style to default mpl
NO_VIZ, RESID, RESID_SCATTER = range(3)
def least_squares_evaluation(x, y, visualize=NO_VIZ):
    # Statistics
    print(f"x_mean : {np.mean(x):.2f}, y_mean : {np.mean(y):.2f}")
    print(f"x_stdev: {np.std(x):.2f}, y_stdev: {np.std(y):.2f}")
    print(f"r = Correlation(x, y): {correlation(x, y):.3f}")

    # Performance metrics
    ahat, bhat = fit_least_squares(x, y)
    yhat = predict(x, ahat, bhat)
    # Note: "\t" in an f-string is a tab character, so we avoid writing "\theta" here
    print(f"theta_0: {ahat:.2f}, theta_1: {bhat:.2f}")
    print(f"RMSE: {np.sqrt(compute_mse(y, yhat)):.3f}")

    # Visualization
    fig, ax_resid = None, None
    if visualize == RESID_SCATTER:
        fig, axs = plt.subplots(1, 2, figsize=(8, 3))
        axs[0].scatter(x, y)
        axs[0].plot(x, yhat)
        axs[0].set_title("LS fit")
        ax_resid = axs[1]
    elif visualize == RESID:
        fig = plt.figure(figsize=(4, 3))
        ax_resid = plt.gca()
    if ax_resid is not None:
        ax_resid.scatter(x, y - yhat, color='red')
        ax_resid.plot([4, 14], [0, 0], color='black')
        ax_resid.set_title("Residuals")
    return fig
# Load in four different datasets: I, II, III, IV
x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
x4 = [8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8]
y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]
anscombe = {
'I': pd.DataFrame(list(zip(x, y1)), columns =['x', 'y']),
'II': pd.DataFrame(list(zip(x, y2)), columns =['x', 'y']),
'III': pd.DataFrame(list(zip(x, y3)), columns =['x', 'y']),
'IV': pd.DataFrame(list(zip(x4, y4)), columns =['x', 'y'])
}
fig, axs = plt.subplots(2, 2, figsize=(10, 10))
for i, dataset in enumerate(['I', 'II', 'III', 'IV']):
    ans = anscombe[dataset]
    axs[i // 2, i % 2].scatter(ans['x'], ans['y'])
    axs[i // 2, i % 2].set_title(f"Dataset {dataset}")
plt.show()
Wow, looks like all four datasets have the same summary statistics: means, standard deviations, correlation $r$, least squares line, and RMSE!
for dataset in ['I', 'II', 'III', 'IV']:
    print(f">>> Dataset {dataset}:")
    ans = anscombe[dataset]
    fig = least_squares_evaluation(ans['x'], ans['y'], visualize=RESID)
    plt.show(fig)
    print()
    print()
>>> Dataset I:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset II:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.119

>>> Dataset III:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.816
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118

>>> Dataset IV:
x_mean : 9.00, y_mean : 7.50
x_stdev: 3.16, y_stdev: 1.94
r = Correlation(x, y): 0.817
theta_0: 3.00, theta_1: 0.50
RMSE: 1.118
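Same statistics and same fitted line, yet the residual plots reveal four very different relationships. Summary statistics alone can hide structure, which is why we always visualize the data.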
dugongs = pd.read_csv("data/dugongs.csv")
dugongs.head()
|   | Length | Age |
|---|--------|-----|
| 0 | 1.77   | 1.5 |
| 1 | 1.80   | 1.0 |
| 2 | 1.85   | 1.5 |
| 3 | 1.87   | 1.5 |
| 4 | 2.02   | 2.5 |
yobs = dugongs["Age"] # The true observations y
xs = dugongs["Length"] # Needed for linear predictions
theta_1_hat = np.corrcoef(xs, yobs)[0, 1] * np.std(yobs) / np.std(xs)
theta_0_hat = np.mean(yobs) - theta_1_hat * np.mean(xs)
yhats_linear = theta_0_hat + theta_1_hat * xs
np.corrcoef(xs, yobs)[0, 1]
0.8296474554905716
The correlation coefficient is pretty high... but there's an issue.
Let's first plot the dugong linear fit again. On its own, the fit doesn't look so bad.
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=4)
# plt.savefig('dugong_line.png', bbox_inches = 'tight');
plt.show()
Let's inspect further by plotting the residuals.
residuals = yobs - yhats_linear
sns.scatterplot(x=xs, y=residuals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$y - \hat{y}$")
# plt.savefig('dugong_residuals.png', bbox_inches = 'tight');
plt.show()
The residuals show a clear pattern, suggesting the relationship is not linear. We could instead fit a linear model that relates $z = \log(y)$ to $x$:

$$\hat{z} = \theta_0 + \theta_1 x$$

dugongs["Log(Age)"] = np.log(dugongs["Age"])
zobs = dugongs["Log(Age)"] # The LOG of true observations y
xs = dugongs["Length"] # Needed for linear predictions
ztheta_1_hat = np.corrcoef(xs, zobs)[0, 1] * np.std(zobs) / np.std(xs)
ztheta_0_hat = np.mean(zobs) - ztheta_1_hat * np.mean(xs)
zhats_linear = ztheta_0_hat + ztheta_1_hat * xs
sns.scatterplot(x=xs, y=zobs, color='green')
plt.plot(xs, zhats_linear, lw=4)
# plt.savefig('dugong_zline.png', bbox_inches = 'tight');
plt.show()
zresiduals = zobs - zhats_linear
sns.scatterplot(x=xs, y=zresiduals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$z - \hat{z}$")
# plt.savefig('dugong_zresiduals.png', bbox_inches = 'tight');
plt.show()
ypred = np.exp(zhats_linear)
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, ypred, color='green', lw=4)
# plt.savefig('dugong_curve.png', bbox_inches = 'tight');
plt.show()
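Exponentiating the fit in log space shows why the prediction is a curve in the original units:

$$\hat{y} = e^{\hat{z}} = e^{\hat{\theta}_0 + \hat{\theta}_1 x} = e^{\hat{\theta}_0}\, e^{\hat{\theta}_1 x}$$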