import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import itertools
from mpl_toolkits.mplot3d import Axes3D
# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
🦭 Dugongs Part 1: Comparing Two Different Models, Both Fit with MSE¶
dugongs = pd.read_csv("data/dugongs.csv")
display(dugongs.head())
data_constant = dugongs["Age"]
data_linear = dugongs[["Length", "Age"]]
| | Length | Age |
|---|---|---|
| 0 | 1.77 | 1.5 |
| 1 | 1.80 | 1.0 |
| 2 | 1.85 | 1.5 |
| 3 | 1.87 | 1.5 |
| 4 | 2.02 | 2.5 |
🎢 Loss Surfaces¶
The average loss of the constant model using L2 loss (i.e., squared loss) is
$$ \Large \hat{R}(\theta_0) = \frac{1}{n}\sum_{i=1}^n (y_i - \theta_0)^2 $$
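Why is the sample mean the optimal choice here? Setting the derivative of $\hat{R}(\theta_0)$ to zero gives the minimizer:

$$ \Large \frac{d}{d\theta_0}\hat{R}(\theta_0) = -\frac{2}{n}\sum_{i=1}^n (y_i - \theta_0) = 0 \quad \Longrightarrow \quad \hat{\theta}_0 = \frac{1}{n}\sum_{i=1}^n y_i = \bar{y} $$

This is why the code below uses `np.mean(data_constant)` as the optimal point $\hat{\theta}_0$.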
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
# Calculates the MSE for a constant model given some value of theta
def mse_constant(theta, data):
    return np.mean(np.array([(y_obs - theta) ** 2 for y_obs in data]), axis=0)
# Set up a range of theta values and calculate the MSE for each
thetas = np.linspace(-20, 42, 1000)
l2_loss_thetas = mse_constant(thetas, data_constant)
# Plot the MSE as a function of the thetas
plt.plot(thetas, l2_loss_thetas)
plt.xlabel(r'$\theta_0$')
plt.ylabel(r'MSE')
# Optimal point
thetahat = np.mean(data_constant)
plt.scatter([thetahat], [mse_constant(thetahat, data_constant)], s=50, label = r"$\hat{\theta}_0$")
plt.legend()
# plt.savefig('mse_constant_loss.png', bbox_inches = 'tight');
plt.show()
The average loss for the SLR model using L2 loss is the following:
$$ \Large \hat{R}(\theta_0, \theta_1) = \frac{1}{n}\sum_{i=1}^n (y_i - (\theta_0 + \theta_1x_i))^2 $$
Note: Unlike the average loss of the constant model, which has one input, the average loss of SLR has two inputs!
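For reference, minimizing this average loss yields the familiar closed-form least-squares estimates (these match the `theta_1_hat` and `theta_0_hat` computed in the cell below):

$$ \Large \hat{\theta}_1 = r \cdot \frac{\sigma_y}{\sigma_x}, \qquad \hat{\theta}_0 = \bar{y} - \hat{\theta}_1 \bar{x} $$

where $r$ is the correlation coefficient between $x$ and $y$, and $\sigma_x$, $\sigma_y$ are their standard deviations.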
# Run this cell to see a 3D visualization of the loss surface
import itertools
import plotly.graph_objects as go
# Calculates the MSE for a linear model given some values of theta_0 and theta_1
def mse_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([(y - (theta_0 + theta_1 * x)) ** 2 for x, y in zip(data_x, data_y)]), axis=0)
# Set up a grid of theta values and calculate the MSE for each
theta_0_values = np.linspace(-80, 20, 80)
theta_1_values = np.linspace(-10,30, 80)
mse_values = [mse_linear(theta_0, theta_1, data_linear) \
for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mse_values = np.reshape(mse_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
data_x, data_y = data_linear.iloc[:,0], data_linear.iloc[:,1]
theta_1_hat = np.corrcoef(data_x, data_y)[0, 1] * np.std(data_y) / np.std(data_x)
theta_0_hat = np.mean(data_y) - theta_1_hat * np.mean(data_x)
fig_lin = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mse_values,name='z=MSE')])
fig_lin.add_trace(go.Scatter3d(x=[theta_0_hat], y=[theta_1_hat], \
z=[mse_linear(theta_0_hat, theta_1_hat, data_linear)],\
mode='markers', marker=dict(size=12),name='theta hat'))
fig_lin.update_layout(
autosize=False,
scene = dict(
xaxis_title='theta_0',
yaxis_title='theta_1',
zaxis_title='z=MSE'),
width=500,
margin=dict(r=20, b=10, l=10, t=10))
fig_lin.show()
🧮 Computing the RMSE of SLR and constant model fit with MSE¶
print("Least Squares Constant Model RMSE:",
round(np.sqrt(mse_constant(thetahat, data_constant)), 2)
)
print("Least Squares Linear Model RMSE: ",
round(np.sqrt(mse_linear(theta_0_hat, theta_1_hat, data_linear)),2)
)
Least Squares Constant Model RMSE: 7.72
Least Squares Linear Model RMSE:  4.31
Interpreting the RMSE (Root Mean Square Error, i.e., the square root of the MSE):
- The constant model's error is HIGHER than the linear model's error.
- The linear model's fit is BETTER than the constant model's fit (if using RMSE as our metric); a vectorized cross-check follows below.
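As a quick sanity check, the same two RMSEs can be recomputed with vectorized NumPy operations on the residuals. This is a minimal sketch using the variables already defined above (`thetahat`, `theta_0_hat`, `theta_1_hat`, `data_constant`, `data_linear`):

```python
# Vectorized recomputation of the two RMSEs printed above
resid_constant = data_constant - thetahat  # constant model residuals
resid_linear = data_linear["Age"] - (theta_0_hat + theta_1_hat * data_linear["Length"])  # SLR residuals

print("Constant model RMSE:", round(np.sqrt(np.mean(resid_constant ** 2)), 2))  # should match ~7.72
print("Linear model RMSE:  ", round(np.sqrt(np.mean(resid_linear ** 2)), 2))    # should match ~4.31
```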
🔮 Predictions¶
This plotting code is just for your reference.
This code generates the figures used in the lecture.
yobs = data_linear["Age"] # The true observations y
xs = data_linear["Length"] # Needed for linear predictions
n = len(yobs)

# Predictions
yhats_constant = [thetahat for i in range(n)]  # Not used, but food for thought
yhats_linear = [theta_0_hat + theta_1_hat * x for x in xs]
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2) ;
plt.axvline(thetahat, color='red', lw=4, label=r"$\hat{\theta}_0$");
plt.legend()
plt.yticks([])
# plt.savefig('dugong_rug.png', bbox_inches = 'tight');
plt.show()
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.axhline(y=thetahat, color='red', lw=4)
# plt.savefig('dugong_constant_line.png', bbox_inches = 'tight');
plt.show()
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=4)
# plt.savefig('dugong_line.png', bbox_inches = 'tight');
plt.show()
boba = np.array([20, 21, 22, 29, 33])
Now let's switch to a small boba-sales dataset and plot the $L_1$ loss for a single observation. For the first observation, $y_1 = 20$, so we'll be plotting
$$ \Large L_1(20, \theta_0) = |20 - \theta_0| $$
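Written out piecewise, this absolute loss is a "V" whose kink (and hence its minimum) sits at $\theta_0 = 20$, where the loss is not differentiable:

$$ \Large L_1(20, \theta_0) = \begin{cases} 20 - \theta_0 & \text{if } \theta_0 \le 20 \\ \theta_0 - 20 & \text{if } \theta_0 > 20 \end{cases} $$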
thetas = np.linspace(10, 30, 1000)
l1_loss_single_obvs = np.abs(boba[0] - thetas)
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
plt.plot(thetas, l1_loss_single_obvs, 'g--', );
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'L1 Loss for $y = 20$');
# plt.savefig('l1_loss_single_obs.png', bbox_inches = 'tight');
def mae_constant(theta_0, data):
    return np.mean(np.array([np.abs(y_obs - theta_0) for y_obs in data]), axis=0)
thetas = np.linspace(10, 40, 1000)
l1_loss_thetas = mae_constant(thetas, boba)
plt.plot(thetas, l1_loss_thetas, color = 'green', lw=3);
plt.xlabel(r'$\theta_0$');
plt.ylabel(r'MAE across all data');
# plt.savefig('l1_loss_average.png', bbox_inches = 'tight');
🥣 Median Minimizes MAE for the Constant Model¶
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
yobs = boba
thetahat = np.median(yobs)
fig = plt.figure(figsize=(8, 1.5))
sns.rugplot(yobs, height=0.25, lw=2) ;
plt.scatter([thetahat], [-.1], color='green', lw=4, label=r"$\hat{\theta}_0$");
plt.xlabel("Sales")
plt.legend()
plt.yticks([])
# plt.savefig('boba_rug.png', bbox_inches = 'tight');
plt.show()
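As a quick numerical check that the median really does minimize the MAE for this dataset, we can evaluate the MAE on a grid of candidate values using the `mae_constant` helper defined above (a minimal sketch):

```python
# Evaluate the MAE at integer candidates and compare the grid minimizer to the median
candidates = np.arange(boba.min(), boba.max() + 1)  # 20, 21, ..., 33
maes = mae_constant(candidates, boba)
print("MAE-minimizing candidate:", candidates[np.argmin(maes)])  # 22
print("Median of the data:      ", np.median(boba))              # 22.0
```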
2️⃣ Two Constant Models, Fit to Different Losses¶
plt.style.use('default') # Revert style to default mpl
adjust_fontsize(size=16)
%matplotlib inline
nplots = 2
def plot_losses(data, title=None, theta_range=(10, 40)):
    thetas = np.linspace(theta_range[0], theta_range[1], 1000)
    l2_loss_thetas = mse_constant(thetas, data)
    thetahat_mse = np.mean(data)
    l1_loss_thetas = mae_constant(thetas, data)
    thetahat_mae = np.median(data)

    fig, axs = plt.subplots(1, nplots, figsize=(5*2+0.5, 3.5))
    axs[0].plot(thetas, l2_loss_thetas, lw=3)
    axs[0].scatter([thetahat_mse], [mse_constant(thetahat_mse, data)], s=100)
    axs[0].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mse:.1f}",
                    xy=(thetahat_mse, np.average(axs[0].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[0].set_xlabel(r'$\theta_0$')
    axs[0].set_ylabel(r'MSE')

    axs[1].plot(thetas, l1_loss_thetas, color='green', lw=3)
    axs[1].scatter([thetahat_mae], [mae_constant(thetahat_mae, data)], color='green', s=100)
    axs[1].annotate(r"$\hat{\theta}_0$ = " + f"{thetahat_mae:.1f}",
                    xy=(thetahat_mae, np.average(axs[1].get_ylim())),
                    size=20, ha='center', va='top',
                    bbox=dict(boxstyle='round', fc='w'))
    axs[1].set_xlabel(r'$\theta_0$')
    axs[1].set_ylabel(r'MAE')

    if title:
        fig.suptitle(title)
    fig.tight_layout()
    return fig
fig = plot_losses(boba)
plt.figure(fig)
# plt.savefig('loss_compare.png', bbox_inches = 'tight');
plt.show()
More loss comparison: Outliers¶
boba_outlier = np.array([20, 21, 22, 29, 33, 1033])
fig = plot_losses(boba_outlier, theta_range=[-10, 300])
plt.figure(fig)
# plt.savefig('loss_outlier.png', bbox_inches = 'tight');
plt.show()
Uniqueness under Different Loss Functions¶
boba_even = np.array([20, 21, 22, 29, 33, 35])
fig = plot_losses(boba_even)
plt.figure(fig)
#plt.savefig('loss_unique.png', bbox_inches = 'tight');
plt.show()
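With an even number of data points, any $\hat{\theta}_0$ between the two middle values (22 and 29 here) achieves the same minimal MAE, so the MAE minimizer is not unique, whereas the MSE minimizer (the mean) always is. A small check, using the helpers defined above:

```python
# Several theta_0 values between the middle two observations all give the same MAE
for theta in [22, 24, 26.5, 29]:
    print(theta, mae_constant(theta, boba_even))
```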
Bonus: MAE loss curve for SLR?¶
We saw earlier that the MSE (average L2) loss surface for simple linear regression is a smooth, bowl-shaped paraboloid in 3 dimensions. What does the MAE surface for SLR look like?
It's beyond the scope of this course, but for the curious, the cell below plots the mean absolute error (average L1 loss) on the boba dataset for a simple linear regression model. I don't plot the minimum because I don't know whether there's a closed-form solution (you could solve for it numerically with techniques you learn in lab).
Since the boba dataset didn't have an input variable, I've arbitrarily used "temperature of that day" as the input. I made up the numbers (assume degrees Fahrenheit).
boba_df = pd.DataFrame({"sales": [20, 21, 22, 29, 33],
"temps": [40, 44, 55, 70, 85]},
columns=["temps", "sales"])
boba_df
| | temps | sales |
|---|---|---|
| 0 | 40 | 20 |
| 1 | 44 | 21 |
| 2 | 55 | 22 |
| 3 | 70 | 29 |
| 4 | 85 | 33 |
# Just run this cell
import itertools
import plotly.graph_objects as go
data_linear = boba_df
def mAe_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([np.abs(y - (theta_0 + theta_1 * x)) for x, y in zip(data_x, data_y)]), axis=0)
theta_0_values = np.linspace(-20, 20, 80)
theta_1_values = np.linspace(-50, 50, 80)
mAe_values = [mAe_linear(theta_0, theta_1, data_linear) \
for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mAe_values = np.reshape(mAe_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
fig_lin_mae = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mAe_values,name='z=MAE')])
fig_lin_mae.update_layout(
autosize=False,
scene = dict(
xaxis_title='x=theta_0',
yaxis_title='y=theta_1',
zaxis_title='z=MAE'),
width=500,
margin=dict(r=20, b=10, l=10, t=10))
fig_lin_mae.show()
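For the curious, here is a rough sketch of finding that minimum numerically, as suggested above (this assumes `scipy` is available; the MAE surface can have a flat region, so the reported optimum is approximate and may not be unique):

```python
# Minimize the MAE surface with a derivative-free method,
# since the MAE is not differentiable everywhere
from scipy.optimize import minimize

result = minimize(lambda theta: mAe_linear(theta[0], theta[1], boba_df),
                  x0=[0.0, 0.0], method="Nelder-Mead")
print("Approximate (theta_0, theta_1):", result.x)
print("Approximate minimal MAE:       ", result.fun)
```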
Now observe the difference between the MAE loss and the MSE loss for one application.
# Just run this cell
def mSe_linear(theta_0, theta_1, data_linear):
    data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
    return np.mean(np.array([(y - (theta_0 + theta_1 * x)) ** 2 for x, y in zip(data_x, data_y)]), axis=0)
theta_0_values = np.linspace(-20, 20, 80)
theta_1_values = np.linspace(-50, 50, 80)
mSe_values = [mSe_linear(theta_0, theta_1, data_linear) \
for theta_0, theta_1 in itertools.product(theta_0_values, theta_1_values)]
mSe_values = np.reshape(mSe_values, (len(theta_0_values), len(theta_1_values)), order='F')
# Optimal point
fig_lin_mae = go.Figure(data=[go.Surface(x=theta_0_values, y=theta_1_values, z=mSe_values,name='z=MSE')])
fig_lin_mae.update_layout(
autosize=False,
scene = dict(
xaxis_title='x=theta_0',
yaxis_title='y=theta_1',
zaxis_title='z=MSE'),
width=500,
margin=dict(r=20, b=10, l=10, t=10))
fig_lin_mae.show()
dugongs = pd.read_csv("data/dugongs.csv")
dugongs.head()
| | Length | Age |
|---|---|---|
| 0 | 1.77 | 1.5 |
| 1 | 1.80 | 1.0 |
| 2 | 1.85 | 1.5 |
| 3 | 1.87 | 1.5 |
| 4 | 2.02 | 2.5 |
yobs = dugongs["Age"] # The true observations y
xs = dugongs["Length"] # Needed for linear predictions
theta_1_hat = np.corrcoef(xs, yobs)[0, 1] * np.std(yobs) / np.std(xs)
theta_0_hat = np.mean(yobs) - theta_1_hat * np.mean(xs)
yhats_linear = theta_0_hat + theta_1_hat * xs
np.corrcoef(xs, yobs)[0, 1]
0.8296474554905716
The correlation coefficient is pretty high...but there's an issue.
Let's first plot the dugong linear fit again. It doesn't look so bad when viewed at this scale.
# In case we're in a weird style state
sns.set_theme()
adjust_fontsize(size=16)
%matplotlib inline
sns.scatterplot(x=xs, y=yobs)
# plt.savefig('dugong_scatter.pdf', bbox_inches = 'tight');
plt.show()
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_linear, color='red', lw=1)
# plt.savefig('dugong_line.pdf', bbox_inches = 'tight');
plt.show()
Let's further inspect by plotting residuals.
residuals = yobs - yhats_linear
sns.scatterplot(x=xs, y=residuals, color='red', lw=1)
plt.axhline(0, color='black')
plt.ylabel(r"$Age - \widehat{Age}$")
# plt.savefig('dugong_residuals.pdf', bbox_inches = 'tight');
plt.show()
Log transformation of y¶
We could fit a linear model that relates $z = \log(y)$ to $x$:
$$ \Large \hat{z} = \theta_0 + \theta_1 x $$
dugongs["Log(Age)"] = np.log(dugongs["Age"])
zobs = dugongs["Log(Age)"] # The LOG of true observations y
xs = dugongs["Length"] # Needed for linear predictions
ztheta_1_hat = np.corrcoef(xs, zobs)[0, 1] * np.std(zobs) / np.std(xs)
ztheta_0_hat = np.mean(zobs) - ztheta_1_hat * np.mean(xs)
zhats_linear = ztheta_0_hat + ztheta_1_hat * xs
sns.scatterplot(x=xs, y=zobs)
# plt.savefig('dugong_zscatter.pdf', bbox_inches = 'tight');
sns.scatterplot(x=xs, y=zobs)
plt.plot(xs, zhats_linear, color='red', lw=1)
# plt.savefig('dugong_zline.pdf', bbox_inches = 'tight');
plt.show()
zresiduals = zobs - zhats_linear
sns.scatterplot(x=xs, y=zresiduals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$\log(Age) - \widehat{\log(Age)}$")
# plt.savefig('dugong_zresiduals.pdf', bbox_inches = 'tight');
plt.show()
Map back to the original coordinates¶
$$ \begin{align*} \hat{z} &= \theta_0 + \theta_1 x\\ \widehat{\log(y)}&= \theta_0 + \theta_1 x\\ e^{\widehat{\log(y)}}&= e^{\theta_0 + \theta_1 x}\\ \hat{y}&=e^{\theta_0 + \theta_1 x}\ \end{align*} $$
ypred = np.exp(zhats_linear)
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, ypred, color='red', lw=1)
# plt.savefig('dugong_zcurve.pdf', bbox_inches = 'tight');
plt.show()
Squaring transformation of x¶
We could fit a linear model that relates $y$ to $x^2$:
$$ \Large \hat{y} = \theta_0 + \theta_1 x^2 $$
Note: This is a bad way to include a quadratic term. To allow extra flexibility in the quadratic fit, we should model this with multiple regression as $y = \theta_0 + \theta_1 x + \theta_2 x^2$. So, this section is just for exposition.
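For comparison, here is a minimal sketch of fitting that more flexible quadratic model $y = \theta_0 + \theta_1 x + \theta_2 x^2$ with ordinary least squares via `np.linalg.lstsq` (not something we rely on below, just to show what the multiple-regression version would look like):

```python
# Fit y = theta_0 + theta_1*x + theta_2*x^2 by least squares
X = np.column_stack([np.ones(len(dugongs)),    # intercept column
                     dugongs["Length"],        # linear term
                     dugongs["Length"] ** 2])  # quadratic term
theta_quad, *_ = np.linalg.lstsq(X, dugongs["Age"], rcond=None)
print(theta_quad)  # [theta_0_hat, theta_1_hat, theta_2_hat]
```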
dugongs["Length^2"] = dugongs["Length"] ** 2
xs = dugongs["Length^2"] # Needed for linear predictions
quad_theta_1_hat = np.corrcoef(xs, yobs)[0, 1] * np.std(yobs) / np.std(xs)
quad_theta_0_hat = np.mean(yobs) - quad_theta_1_hat * np.mean(xs)
yhats_quad = quad_theta_0_hat + quad_theta_1_hat * xs
sns.scatterplot(x=xs, y=yobs)
plt.xlabel(r"$Length^2$")
# plt.savefig('dugong_quad_scatter.pdf', bbox_inches = 'tight');
sns.scatterplot(x=xs, y=yobs)
# Need more granularity or plotted line looks curved
xrange = np.linspace(xs.min(), xs.max(), 1000)
quad_preds_range = quad_theta_0_hat + quad_theta_1_hat * xrange
plt.plot(xrange, quad_preds_range, color='red', lw=1)
plt.xlabel(r"$Length^2$")
# plt.savefig('dugong_quad_line.pdf', bbox_inches = 'tight');
plt.show()
quad_residuals = yobs - yhats_quad
sns.scatterplot(x=xs, y=quad_residuals, color='red', lw=4)
plt.axhline(0, color='black')
plt.ylabel(r"$Age - \widehat{Age}$")
plt.xlabel(r"$Length^2$")
# plt.savefig('dugong_quad_residuals.pdf', bbox_inches = 'tight');
plt.show()
xs = dugongs["Length"]
sns.scatterplot(x=xs, y=yobs)
plt.plot(xs, yhats_quad, color='red', lw=1)
# plt.savefig('dugong_quad_curve.pdf', bbox_inches = 'tight');
plt.show()