import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'

# Load in the penguins dataset provided by seaborn package
df = sns.load_dataset("penguins")

# Filter to only include Adelie species and remove any rows with missing values
df = df[df["species"] == "Adelie"].dropna()

df

df = df[["bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
df

# Construct the design matrix X as a dataframe and add a bias column.
# Taking an explicit .copy() makes X its own dataframe, so adding the "bias"
# column is an ordinary assignment rather than a write into a selection of df
# (the situation pandas' chained-assignment warning exists to flag).
X = df[["flipper_length_mm", "body_mass_g"]].copy()
X["bias"] = 1
X

# Construct the outcome vector y
y = df["bill_depth_mm"]
y

0      18.7
1      17.4
2      18.0
4      19.3
5      20.6
       ... 
147    18.4
148    17.8
149    18.1
150    17.1
151    18.5
Name: bill_depth_mm, Length: 146, dtype: float64

# M.T transposes a matrix M
# X @ Y multiplies matrices X and Y
# np.linalg.inv(M) finds the inverse of a matrix M
# By default, the dataframe X will be silently converted to a matrix
theta_using_normal_equation = np.linalg.inv(X.T @ X) @ X.T @ y
theta_using_normal_equation

0     0.009828
1     0.001477
2    11.002995
dtype: float64

# Solves the normal equation (X^T X) θ = X^T y for θ as a system of linear
# equations, instead of explicitly forming the inverse of X^T X.
np.linalg.solve(X.T @ X, X.T @ y)

array([9.82848689e-03, 1.47749591e-03, 1.10029953e+01])

# Convert the dataframe representation of the design matrix to a numpy array, 
# and then multiply it by the theta vector to get the predicted outcomes.
# Yhat = Xθ
df["pred_bill_depth_mm"] = X.to_numpy() @ theta_using_normal_equation
df

# Import the LinearRegression class
from sklearn.linear_model import LinearRegression

# Construct a blank LinearRegression model object
model = LinearRegression()
model

LinearRegression()

LinearRegression()

# Fit the linear regression model to the data
model.fit(
    X=df[["flipper_length_mm", "body_mass_g"]], 
    y=df["bill_depth_mm"]
  )

LinearRegression()

LinearRegression()

# Generate a prediction (yhat) by providing the X values to the model.
# The model was fitted on a dataframe, so the input to .predict() is a dataframe
# with the same column names; each row holds the X values for one observation.
# (A bare list of lists such as [[185, 3750.0]] is also accepted, but sklearn
# then warns that X does not have valid feature names.)
model.predict(pd.DataFrame({"flipper_length_mm": [185], "body_mass_g": [3750.0]}))

array([18.36187501])

# Can also provide a design matrix or dataframe to .predict()
df["sklearn_preds"] = model.predict(df[["flipper_length_mm", "body_mass_g"]])
df

# There is one intercept term in the model
model.intercept_

11.002995277447074

# There are two slope terms in the model
model.coef_

array([0.00982849, 0.0014775 ])

# vs. analytical solutions
theta_using_normal_equation

0     0.009828
1     0.001477
2    11.002995
dtype: float64

fig = px.scatter_3d(df, x="flipper_length_mm", y="body_mass_g", z="bill_depth_mm")

# Create a grid of points to evaluate plane
grid_resolution = 2
(u,v) = np.meshgrid(
    np.linspace(df["flipper_length_mm"].min(), df["flipper_length_mm"].max(), grid_resolution),
    np.linspace(df["body_mass_g"].min(), df["body_mass_g"].max(), grid_resolution))
features = pd.DataFrame({"flipper_length_mm": u.flatten(),
                         "body_mass_g": v.flatten()})
# Make predictions at every point on the grid
zs = model.predict(features)

# create the Surface
fig.add_trace(go.Surface(x=u, y=v, z= zs.reshape(u.shape), opacity=0.9, showscale=False))
fig.update_layout(autosize=False, width=800, height=600)

# Calculate the MSE for a prediction vector and a ground truth outcome vector
from sklearn.metrics import mean_squared_error
mean_squared_error(df["bill_depth_mm"], df["sklearn_preds"])

0.9764070438843998

# The sklearn approach is flexible to different model types, like trees! 
from sklearn.tree import DecisionTreeRegressor

tree_model = DecisionTreeRegressor()

tree_model.fit(
    X=df[["flipper_length_mm", "body_mass_g"]], 
    y=df["bill_depth_mm"])

DecisionTreeRegressor()

DecisionTreeRegressor()

df["sklearn_dt_preds"] = tree_model.predict(df[["flipper_length_mm", "body_mass_g"]])

mean_squared_error(df["bill_depth_mm"], df["sklearn_dt_preds"])

0.051107305936073065

fig = px.scatter_3d(df, x="flipper_length_mm", y="body_mass_g", z="bill_depth_mm")

# Create a grid of points to evaluate plane
grid_resolution = 20
(u,v) = np.meshgrid(
    np.linspace(df["flipper_length_mm"].min(), df["flipper_length_mm"].max(), grid_resolution),
    np.linspace(df["body_mass_g"].min(), df["body_mass_g"].max(), grid_resolution))
features = pd.DataFrame({"flipper_length_mm": u.flatten(),
                         "body_mass_g": v.flatten()})

# Make predictions at every point on the grid
zs = tree_model.predict(features) #<------------------ Only change

# create the Surface
fig.add_trace(go.Surface(x=u, y=v, z= zs.reshape(u.shape), opacity=0.9, showscale=False))
fig.update_layout(autosize=False, width=800, height=600)

def f(x):
    """Evaluate the quartic (x^4 - 15x^3 + 80x^2 - 180x + 144) / 10 at x.

    Only arithmetic operators are applied to x, so x may be a single number
    or a numpy array (evaluated elementwise).
    """
    numerator = x**4 - 15*x**3 + 80*x**2 - 180*x + 144
    return numerator/10

x = np.linspace(1, 6.75, 200)
fig = px.line(y = f(x), x = x)
fig.update_layout(font_size = 16)
fig.update_layout(autosize=False, width=800, height=600)

def simple_minimize(f, xs):
    """Return the element of xs at which f takes its smallest value.

    f is evaluated once at every element of xs; np.argmin then gives the index
    of the smallest of those values, and the x stored at that index is returned.
    """
    values = list(map(f, xs))
    best_index = np.argmin(values)
    return xs[best_index]

guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]
simple_minimize(f, guesses)

5.33

# A dense grid for drawing the curve and a coarse grid of 5 sample points.
xs = np.linspace(1, 7, 200)
sparse_xs = np.linspace(1.5, 6.5, 5)

# Evaluate f once per grid and reuse the values in the traces below.
ys = f(xs)
sparse_ys = f(sparse_xs)

fig = px.line(x = xs, y = ys)
fig.add_scatter(x = sparse_xs, y = sparse_ys, mode = "markers", marker_size=16)
fig.update_layout(showlegend= False)
fig.update_layout(autosize=False, width=800, height=600)
fig.show()

from scipy.optimize import minimize

# x0 is a starting guess for the value of x that minimizes f.
# minimize() will iteratively try to improve this guess.
# Don't worry about all of the details of the output.
# The important part is "x", the value of x that minimizes f.
minimize(f, x0 = 3.5)

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: -0.13827491292945523
        x: [ 2.393e+00]
      nit: 3
      jac: [ 6.484e-06]
 hess_inv: [[ 7.385e-01]]
     nfev: 20
     njev: 10

def deriv_f(x):
    """Evaluate (4x^3 - 45x^2 + 160x - 180) / 10, the derivative of f, at x.

    Only arithmetic operators are applied to x, so x may be a single number
    or a numpy array (evaluated elementwise).
    """
    cubic = 4*x**3 - 45*x**2 + 160*x - 180
    return (1/10) * cubic

# Traces for f and its derivative f', both evaluated on the xs grid.
f_line = go.Scatter(x = xs, y = f(xs), mode = "lines", name = "f")
derivative_line = go.Scatter(x = xs, y = deriv_f(xs), 
                             mode = "lines", name = "f'", line = {"dash": "dash"})
# Horizontal reference line at height 0. It is labelled "y = 0" because it is
# neither f nor f'; it only marks where a curve crosses zero.
zero_line = go.Scatter(x = xs, y = 0*xs, mode = "lines", name='y = 0')
roots = np.array([2.3927, 3.5309, 5.3263]) # computed using algorithm
# Mark the x values where f'(x) = 0, drawn on the zero line.
root_markers = go.Scatter(x = roots, y = 0*roots, 
                         mode = "markers", name = "f'(x) = 0", marker_size = 12)

fig = go.Figure()
fig.add_traces([f_line, derivative_line, zero_line, root_markers])
fig.update_layout(font_size = 20, yaxis_range=[-1, 3])
fig.update_layout(autosize=False, width=800, height=600)
fig.show()

# Current guess x0 for the minimizer of f.
x = 4.3
fig = go.Figure()
fig.add_trace(f_line)
# Adding an arrow from x0 to x0 - f'(x0), i.e. in the direction of the
#  NEGATIVE derivative (the direction a descent step moves in).
#  Note the arrow is just a direction along the x dimension 
#  (the y position is held at f(x0) purely for illustrative purposes)
fig.add_trace(go.Scatter(
    x=[x, x - deriv_f(x)], y=[f(x), f(x)],  
    marker= dict(size=10,symbol= "arrow-bar-up", angleref="previous"),
    name="-f'(x0)"
    ))
# Add the green marker for our guess x0 at the point (x0, f(x0))
fig.add_trace(go.Scatter(x=[x],y=[f(x)], 
                         marker_color="green", marker_size=12,
                         mode="markers", name="x0"))
fig.update_layout(font_size = 20, yaxis_range=[-1, 3])
fig.update_layout(autosize=False, width=800, height=600)
fig

def take_one_step(x, derivative):
    """Move the current guess `x` one full derivative-length downhill.

    `x` is our current guess for the minimum.
    `derivative` is a function that takes in a single value and returns the
    value of the derivative of f at the given input value.
    Returns the new guess, x - derivative(x).
    """
    return x - derivative(x)

x = 4.0
steps = [x]
for i in range(10):
    x = take_one_step(x, deriv_f)
    steps.append(x) 

print(steps)

[4.0, 4.4, 5.0464000000000055, 5.496730601062393, 5.0808624852305115, 5.489980392167775, 5.092824872119241, 5.486755386070718, 5.0984728528436225, 5.485072693208349, 5.101402551267881]

# This code is out-of-scope for data-100 but could be fun to learn.
def plot_steps(steps, f = f, f_line = f_line):
    """Draw the sequence of guesses in `steps` on top of a curve.

    `steps` holds the x values visited, in order; `f` maps each of them to the
    height at which it is drawn, and `f_line` is the trace of the curve itself.
    Returns the resulting figure.
    """
    # Height of every visited point; both path traces share these values.
    heights = [f(s) for s in steps]

    # Dashed red path with arrow markers oriented relative to the previous point.
    path_arrows = go.Scatter(
        x = steps, y = heights,
        mode = "lines+markers",
        name = "Path",
        line = {"dash": "dash", "color": "red"},
        marker_symbol="arrow",
        marker_angleref="previous",
        marker_standoff=4,
        marker_size = 16)
    # Small red markers on the visited points, kept out of the legend.
    path_dots = go.Scatter(
        x = steps, y = heights,
        mode = "markers",
        name = "Path",
        marker_color="red",
        showlegend=False,
        marker_size = 8)

    fig = go.Figure()
    for trace in (f_line, path_arrows, path_dots):
        fig.add_trace(trace)
    fig.update_layout(font_size = 20, autosize=False, width=800, height=600)

    return fig

plot_steps(steps)

def take_one_step_lr(x, alpha, derivative):
    """Take one descent step from `x`, scaled by the learning rate `alpha`.

    `derivative` is a function returning the derivative at a given value.
    Returns the new guess from the recurrence x - alpha * derivative(x).
    """
    step = alpha * derivative(x)
    return x - step

x = 4.0
steps = [x]
for i in range(15):
    x = take_one_step_lr(x, alpha=0.3, derivative = deriv_f)
    print(x)
    steps.append(x) 

plot_steps(steps)

4.12
4.267296639999997
4.442725838159953
4.640926244829146
4.846183704850335
5.032118544823421
5.17201478493924
5.2564844894138165
5.297911492494514
5.315427176589101
5.322260602055931
5.324832983472768
5.325787650752968
5.3261400404400865
5.326269854338316

def gradient_descent(deriv_f, initial_guess, alpha, T):
    """Run gradient descent with learning rate alpha, starting at initial_guess.

    deriv_f is called on the current guess and returns the derivative (or
    gradient) there. The returned numpy array holds T guesses in total:
    initial_guess followed by the result of each of the T - 1 update steps.
    """
    current_guess = initial_guess
    guesses = [current_guess]
    for _ in range(1, T):
        step = alpha * deriv_f(current_guess)
        # Rebind rather than update in place, so earlier guesses stay intact.
        current_guess = current_guess - step
        guesses.append(current_guess)

    return np.array(guesses)

trajectory = gradient_descent(deriv_f, initial_guess=1.6, alpha=0.75, T=20)
print(trajectory)
plot_steps(trajectory)

[1.6        3.3112     3.18920918 3.01472352 2.79207742 2.56776716
 2.42826486 2.39421613 2.39274816 2.39274798 2.39274798 2.39274798
 2.39274798 2.39274798 2.39274798 2.39274798 2.39274798 2.39274798
 2.39274798 2.39274798]

trajectory = gradient_descent(deriv_f, initial_guess=6, alpha=0.75, T=20)
print(trajectory)
plot_steps(trajectory)

[6.         4.2        4.6086     5.12279483 5.38817984 5.28497822
 5.34793725 5.31315502 5.33375146 5.32197109 5.32885604 5.32488006
 5.32719254 5.32585303 5.32663079 5.32617982 5.32644152 5.32628973
 5.32637779 5.32632671]

df = sns.load_dataset("tips")
df.head()

x = df["total_bill"]
y_obs = df["tip"]

def mse_single_arg(theta_1):
    """Return the MSE on our data for the model y_hat = theta_1 * x.

    Reads the module-level `x` (inputs) and `y_obs` (observed outcomes).
    """
    residual = theta_1 * x - y_obs
    return np.mean(residual ** 2)

thetas = np.linspace(-1.5, 1, 100)

mse_line = go.Scatter(x = thetas, y = [mse_single_arg(theta_1) for theta_1 in thetas], mode = "lines", name = "MSE")
fig = go.Figure()
fig.add_trace(mse_line)
fig.update_layout(autosize=False, width=800, height=600, xaxis_title="theta_1", yaxis_title="MSE")

def deriv_mse_single_arg(theta_1):
    """Return the derivative, with respect to theta_1, of the MSE of y_hat = theta_1 * x.

    Reads the module-level `x` (inputs) and `y_obs` (observed outcomes).
    """
    residual = y_obs - theta_1 * x
    return np.mean(-2 * residual * x)

trajectory = gradient_descent(deriv_mse_single_arg, initial_guess=-0.5, alpha=0.0001, T=100)
print(f"Final guess for theta_1: {trajectory[-1]}")
plot_steps(trajectory,  mse_single_arg,  mse_line)

Final guess for theta_1: 0.14369554654231262

tips_with_bias = df.copy()
tips_with_bias["bias"] = 1
tips_with_bias = tips_with_bias[["bias", "total_bill"]]
tips_with_bias.head()

X = tips_with_bias
y = df["tip"]

def mse_loss(theta):
    """Return the MSE of the linear model X @ theta against y.

    Reads the module-level design matrix `X` and outcome vector `y`.
    """
    residual = y - X @ theta
    return np.mean(residual ** 2)

import plotly.graph_objects as go

uvalues = np.linspace(-1, 5, 20)
vvalues = np.linspace(-0.1, 0.35, 20)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))


MSE = np.array([mse_loss(t) for t in thetas.T])

loss_surface = go.Surface(x=u, 
    y=v, z=np.reshape(MSE, u.shape),
    contours = {"z": {"show": True, "start": 0, "end": 50, "size": 2, "color": "white"}})

# This is an approximate guess for the optimal point.
ind = np.argmin(MSE)
optimal_point = go.Scatter3d(name = "Optimal Point",
    x = [thetas.T[ind,0]], y = [thetas.T[ind,1]], 
    z = [MSE[ind]],
    marker=dict(size=10, color="red"))

fig = go.Figure(data=[loss_surface, optimal_point])
fig.update_layout(scene = dict(
    xaxis_title = "theta0",
    yaxis_title = "theta1",
    zaxis_title = "MSE"), autosize=False, width=800, height=600)

fig.show()

contour = go.Contour(x=u[0], y=v[:, 0], z=np.reshape(MSE, u.shape), 
                     contours=dict(start=0, end=70,size=2))
fig = go.Figure(contour)
fig.update_layout(
    xaxis_title = "theta0",
    yaxis_title = "theta1", autosize=False, width=800, height=600)

fig.show()

def mse_gradient(theta):
    """Return the gradient of the MSE on our data for the given theta.

    The model is theta[0] + theta[1] * x1, where x1 is the second column of
    the module-level design matrix `X`; outcomes come from the module-level `y`.
    The result is a length-2 array: [d/d theta[0], d/d theta[1]].
    """
    x1 = X.iloc[:, 1]
    # -2 * (y - y_hat) appears in both partial derivatives, so compute it once.
    scaled_residual = -2 * (y - (theta[0] + theta[1]*x1))
    return np.array([np.mean(scaled_residual), np.mean(scaled_residual * x1)])

# Note that `initial_guess` is now an array of size 2.
guesses = gradient_descent(mse_gradient, initial_guess=np.array([1, .5]), alpha=0.001, T=10000)

pd.DataFrame(guesses, columns=["theta_0", "theta_1"]).tail(10)

minimize(mse_loss, x0 = [0,0])

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.0360194420115247
        x: [ 9.203e-01  1.050e-01]
      nit: 3
      jac: [ 1.490e-08 -1.490e-08]
 hess_inv: [[ 2.980e+00 -1.253e-01]
            [-1.253e-01  6.335e-03]]
     nfev: 15
     njev: 5

minimize(mse_loss, x0 = [0,0], jac=mse_gradient)

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.036019442011377
        x: [ 9.203e-01  1.050e-01]
      nit: 3
      jac: [ 3.494e-16 -6.989e-16]
 hess_inv: [[ 2.980e+00 -1.253e-01]
            [-1.253e-01  6.335e-03]]
     nfev: 5
     njev: 5

def mse_gradient(theta, X, y):
    """Return the gradient of the MSE on the given data for the given theta.

    The model is theta[0] * X[:, 0] + theta[1] * X[:, 1], using the first two
    columns of the dataframe X (selected by position); y holds the observed
    outcomes. The result is a length-2 array: [d/d theta[0], d/d theta[1]].
    """
    col0, col1 = X.iloc[:, 0], X.iloc[:, 1]
    # The residual is shared by both partial derivatives.
    residual = y - theta[0]*col0 - theta[1]*col1
    return np.array([np.mean(-2 * residual * col) for col in (col0, col1)])

# Run stochastic gradient descent (SGD)
def sgd(grad, X, y, initial_theta, eta = 0.3, max_iter = 5000, batch_size=50 ):
    """Run mini-batch stochastic gradient descent and return every iterate.

    grad(theta, X_batch, y_batch) must return the gradient on one batch.
    X is a dataframe and y a series indexed by the same labels as X.
    eta is the initial learning rate, which decays as a function of the time
    step: the update at time step t uses a learning rate of eta / t.

    Returns a list of thetas: initial_theta followed by one entry for each of
    the max_iter - 1 updates (time steps 1, ..., max_iter - 1).
    """
    theta = initial_theta
    thetas = [theta]
    for t in range(1, max_iter):
        # Draw batch_size random rows of X, then pick the y values that carry
        # the same index labels so each row stays paired with its outcome.
        X_sample = X.sample(batch_size)
        y_sample = y.loc[X_sample.index]
        theta = theta - eta/t * grad(theta, X_sample, y_sample)
        thetas.append(theta)
    return thetas

thetas = sgd(mse_gradient, X, y, 
             initial_theta = np.array([1, .5]), 
             eta = 0.001, 
             max_iter = 10000,
             batch_size=1)
thetas[-5:]

[array([0.98110353, 0.10248307]),
 array([0.98110341, 0.10248119]),
 array([0.98110332, 0.1024799 ]),
 array([0.98110344, 0.10248167]),
 array([0.98110334, 0.10248054])]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male
...	...	...	...	...	...	...	...
147	Adelie	Dream	36.6	18.4	184.0	3475.0	Female
148	Adelie	Dream	36.0	17.8	195.0	3450.0	Female
149	Adelie	Dream	37.8	18.1	193.0	3750.0	Male
150	Adelie	Dream	36.0	17.1	187.0	3700.0	Female
151	Adelie	Dream	41.5	18.5	201.0	4000.0	Male

	flipper_length_mm	body_mass_g	bias
0	181.0	3750.0	1
1	186.0	3800.0	1
2	195.0	3250.0	1
4	193.0	3450.0	1
5	190.0	3650.0	1
...	...	...	...
147	184.0	3475.0	1
148	195.0	3450.0	1
149	193.0	3750.0	1
150	187.0	3700.0	1
151	201.0	4000.0	1

	bill_depth_mm	flipper_length_mm	body_mass_g	pred_bill_depth_mm
0	18.7	181.0	3750.0	18.322561
1	17.4	186.0	3800.0	18.445578
2	18.0	195.0	3250.0	17.721412
4	19.3	193.0	3450.0	17.997254
5	20.6	190.0	3650.0	18.263268
...	...	...	...	...
147	18.4	184.0	3475.0	17.945735
148	17.8	195.0	3450.0	18.016911
149	18.1	193.0	3750.0	18.440503
150	17.1	187.0	3700.0	18.307657
151	18.5	201.0	4000.0	18.888505

	bill_depth_mm	flipper_length_mm	body_mass_g	pred_bill_depth_mm	sklearn_preds
0	18.7	181.0	3750.0	18.322561	18.322561
1	17.4	186.0	3800.0	18.445578	18.445578
2	18.0	195.0	3250.0	17.721412	17.721412
4	19.3	193.0	3450.0	17.997254	17.997254
5	20.6	190.0	3650.0	18.263268	18.263268
...	...	...	...	...	...
147	18.4	184.0	3475.0	17.945735	17.945735
148	17.8	195.0	3450.0	18.016911	18.016911
149	18.1	193.0	3750.0	18.440503	18.440503
150	17.1	187.0	3700.0	18.307657	18.307657
151	18.5	201.0	4000.0	18.888505	18.888505

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

⛰️ Lecture 13 – Data 100, Spring 2025¶

Fitting our Multiple Linear Regression Model¶

OLS Approach 1: Use Solution to Normal Equation¶

Make Predictions¶

Using `sklearn` to fit our Multiple Linear Regression Model¶

Visualize the Fit¶

Bonus Demo¶

Minimizing an Arbitrary 1D Function¶

The Naive Approach: Guess and Check¶

`scipy.optimize.minimize`¶

Descending with Derivatives¶

Manually Descending with Derivatives¶

Starting with an initial guess of 4.0 and taking 10 steps:¶

Visualizing the optimization steps¶

The Gradient Descent Algorithm¶

Gradient Descent on a 1D Model¶

Gradient Descent on Multi-Dimensional Models¶

Defining a 2D MSE Function¶

Applying Gradient Descent in 2D¶

Stochastic Gradient Descent on Multi-Dimensional Models¶

	theta_0	theta_1
9990	0.922487	0.104931
9991	0.922486	0.104931
9992	0.922485	0.104931
9993	0.922484	0.104931
9994	0.922484	0.104931
9995	0.922483	0.104931
9996	0.922482	0.104931
9997	0.922481	0.104931
9998	0.922481	0.104931
9999	0.922480	0.104932

⛰️ Lecture 13 – Data 100, Spring 2025¶

Fitting our Multiple Linear Regression Model¶

OLS Approach 1: Use Solution to Normal Equation¶

Make Predictions¶

Using sklearn to fit our Multiple Linear Regression Model¶

Visualize the Fit¶

Bonus Demo¶

Minimizing an Arbitrary 1D Function¶

The Naive Approach: Guess and Check¶

scipy.optimize.minimize¶

Descending with Derivatives¶

Manually Descending with Derivatives¶

Starting with an initial guess of 4.0 and taking 10 steps:¶

Visualizing the optimization steps¶

The Gradient Descent Algorithm¶

Gradient Descent on a 1D Model¶

Gradient Descent on Multi-Dimensional Models¶

Defining a 2D MSE Function¶

Applying Gradient Descent in 2D¶

Stochastic Gradient Descent on Multi-Dimensional Models¶

Using `sklearn` to fit our Multiple Linear Regression Model¶

`scipy.optimize.minimize`¶