import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import numpy as np
# pd.options.mode.chained_assignment = None  # default='warn'

# Load the penguins dataset and keep only Adelie rows that have no missing values.
df = sns.load_dataset("penguins")
df = df[df["species"] == "Adelie"].dropna()
df

# Reload from scratch, apply the same filter, and narrow the table to the
# three numeric columns used by the regression below.
df = sns.load_dataset("penguins")
df = df[df["species"] == "Adelie"].dropna()
df = df[["bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
df

# Design matrix: the two features plus a constant "bias" column for the intercept.
# Take an explicit copy so that adding the column does not write into a
# selection of `df` (which is what triggers pandas' SettingWithCopyWarning).
X = df[["flipper_length_mm", "body_mass_g"]].copy()
X["bias"] = 1
X

# Response vector: the column the model is asked to predict.
y = df["bill_depth_mm"]
y

0      18.7
1      17.4
2      18.0
4      19.3
5      20.6
       ... 
147    18.4
148    17.8
149    18.1
150    17.1
151    18.5
Name: bill_depth_mm, Length: 146, dtype: float64

# Closed-form least squares via the normal equation: theta = (X^T X)^{-1} X^T y.
theta_using_normal_equation = np.linalg.inv(X.T @ X) @ X.T @ y
theta_using_normal_equation

0     0.009828
1     0.001477
2    11.002995
dtype: float64

# Same linear system, (X^T X) theta = X^T y, solved without forming the inverse explicitly.
np.linalg.solve(X.T @ X, X.T @ y)

array([9.82848689e-03, 1.47749591e-03, 1.10029953e+01])

# Recompute the normal-equation coefficients for use in the prediction below.
theta_using_normal_equation =  np.linalg.inv(X.T @ X) @ X.T @ y

# Predictions are the design matrix times the coefficient vector.
df["pred_bill_depth_mm"] = X.to_numpy() @ theta_using_normal_equation
df

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import LinearRegression
# Create an (as yet unfitted) linear regression estimator.
model = LinearRegression()
model

LinearRegression()

LinearRegression()

# Fit on the two feature columns only; no bias column is passed here
# (the fitted intercept is read from model.intercept_ further down).
model.fit(
    X=df[["flipper_length_mm", "body_mass_g"]], 
    y=df["bill_depth_mm"])

LinearRegression()

LinearRegression()

# Predict for a single penguin. predict() takes a 2-D input with one row per
# sample and one column per feature, hence a one-row table rather than a flat
# list. Naming the columns exactly as at fit time avoids scikit-learn's
# "X does not have valid feature names" UserWarning.
model.predict(pd.DataFrame({"flipper_length_mm": [185], "body_mass_g": [3750.0]}))

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

array([18.36187501])

# Store the fitted model's predictions for every row next to the hand-computed ones.
df["sklearn_preds"] = model.predict(df[["flipper_length_mm", "body_mass_g"]])
df

# Fitted intercept term of the model.
model.intercept_      # why is this a scalar?

11.002995277447074

# Fitted slopes, in the column order passed to fit() (flipper_length_mm, body_mass_g).
model.coef_           # why is this an array?

array([0.00982849, 0.0014775 ])

# vs. analytical solution from the normal equation
# (entries follow the columns of X: flipper_length_mm, body_mass_g, bias)
theta_using_normal_equation

0     0.009828
1     0.001477
2    11.002995
dtype: float64

# 3-D scatter of the observed data: two features on x/y, the response on z.
fig = px.scatter_3d(df, x="flipper_length_mm", y="body_mass_g", z="bill_depth_mm")

# Create a grid of points to evaluate plane
# (the grid spans the observed min..max of each feature, grid_resolution points per axis)
grid_resolution = 2
(u,v) = np.meshgrid(
    np.linspace(df["flipper_length_mm"].min(), df["flipper_length_mm"].max(), grid_resolution),
    np.linspace(df["body_mass_g"].min(), df["body_mass_g"].max(), grid_resolution))
# Flatten the grid into one row per (flipper_length_mm, body_mass_g) pair,
# using the same column names the model was fitted with.
features = pd.DataFrame({"flipper_length_mm": u.flatten(),
                         "body_mass_g": v.flatten()})
# Make predictions at every point on the grid
zs = model.predict(features)

# create the Surface
# (the flat prediction vector is reshaped back to the grid's 2-D shape)
fig.add_trace(go.Surface(x=u, y=v, z= zs.reshape(u.shape), opacity=0.9, showscale=False))
fig.update_layout(autosize=False, width=800, height=600)

from sklearn.metrics import  mean_squared_error
# MSE of the linear model's predictions on the same rows it was fitted on.
mean_squared_error(df["bill_depth_mm"], df["sklearn_preds"])

0.9764070438843998

from sklearn.tree import DecisionTreeRegressor

# Swap in a decision tree regressor with default settings.
tree_model = DecisionTreeRegressor()

# Fit on the same two features and the same response as the linear model.
tree_model.fit(
    X=df[["flipper_length_mm", "body_mass_g"]], 
    y=df["bill_depth_mm"])

DecisionTreeRegressor()

DecisionTreeRegressor()

# Tree predictions for every row the tree was fitted on.
df["sklearn_dt_preds"] = tree_model.predict(df[["flipper_length_mm", "body_mass_g"]])

# MSE of the tree on those same rows, for comparison with the linear model's MSE.
mean_squared_error(df["bill_depth_mm"], df["sklearn_dt_preds"])

0.051107305936073065

# 3-D scatter of the observed data: two features on x/y, the response on z.
fig = px.scatter_3d(df, x="flipper_length_mm", y="body_mass_g", z="bill_depth_mm")

# Create a grid of points to evaluate plane
# (20 points per axis here, spanning the observed min..max of each feature)
grid_resolution = 20
(u,v) = np.meshgrid(
    np.linspace(df["flipper_length_mm"].min(), df["flipper_length_mm"].max(), grid_resolution),
    np.linspace(df["body_mass_g"].min(), df["body_mass_g"].max(), grid_resolution))
# One row per grid point, with the column names used at fit time.
features = pd.DataFrame({"flipper_length_mm": u.flatten(),
                         "body_mass_g": v.flatten()})

# Make predictions at every point on the grid
zs = tree_model.predict(features) #<------------------ Only change

# create the Surface
# (the flat prediction vector is reshaped back to the grid's 2-D shape)
fig.add_trace(go.Surface(x=u, y=v, z= zs.reshape(u.shape), opacity=0.9, showscale=False))
fig.update_layout(autosize=False, width=800, height=600)

def f(x):
    """Evaluate the example quartic (x^4 - 15x^3 + 80x^2 - 180x + 144) / 10.

    Works elementwise on anything supporting ``**``, ``*``, ``+``, ``-`` and
    ``/`` (plain numbers or numpy arrays).
    """
    quartic = x**4 - 15*x**3 + 80*x**2 - 180*x + 144
    return quartic / 10

# Draw f on 200 evenly spaced points between 1 and 6.75.
x = np.linspace(1, 6.75, 200)
fig = px.line(y = f(x), x = x)
fig.update_layout(font_size = 16)
fig.update_layout(autosize=False, width=800, height=600)

def simple_minimize(f, xs):
    """Return the element of ``xs`` at which ``f`` is smallest.

    ``f`` is evaluated once per candidate; if several candidates tie, the
    earliest one in ``xs`` is returned.
    """
    costs = [f(candidate) for candidate in xs]
    best_position = np.asarray(costs).argmin()
    return xs[best_position]

# Guess-and-check: the answer can only be as good as the candidate list supplied.
guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]
simple_minimize(f, guesses)

5.33

# Dense grid for drawing the curve, and a sparse grid of five candidate guesses.
xs = np.linspace(1, 7, 200)
sparse_xs = np.linspace(1.5, 6.5, 5)

# Evaluate f once per grid and reuse the values in the traces below.
ys = f(xs)
sparse_ys = f(sparse_xs)

fig = px.line(x = xs, y = ys)
fig.add_scatter(x = sparse_xs, y = sparse_ys, mode = "markers", marker_size=16)
fig.update_layout(showlegend= False)
fig.update_layout(autosize=False, width=800, height=600)
fig.show()

from scipy.optimize import minimize

# Numerically minimize f starting the search from x0 = 3.5.
minimize(f, x0 = 3.5)

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: -0.13827491292945523
        x: [ 2.393e+00]
      nit: 3
      jac: [ 6.484e-06]
 hess_inv: [[ 7.385e-01]]
     nfev: 20
     njev: 10

def grad_f(x):
    """Evaluate (4x^3 - 45x^2 + 160x - 180) / 10, the derivative of the quartic
    (x^4 - 15x^3 + 80x^2 - 180x + 144) / 10.

    Works elementwise on plain numbers or numpy arrays.
    """
    scale = 1/10
    cubic = 4*x**3 - 45*x**2 + 160*x - 180
    return scale * cubic

# Traces for f and for its derivative over the same x grid.
f_line = go.Scatter(x=xs, y=f(xs), mode="lines", name="f")
derivative_line = go.Scatter(
    x=xs,
    y=grad_f(xs),
    mode="lines",
    name="df",
    line=dict(dash="dash"),
)

# Precomputed locations where the derivative is zero, marked on the y = 0 line.
roots = np.array([2.3927, 3.5309, 5.3263]) # computed using algorithm
root_markers = go.Scatter(
    x=roots,
    y=roots * 0,
    mode="markers",
    name="df = zero",
    marker_size=12,
)

fig = go.Figure(data=[f_line, derivative_line, root_markers])
fig.update_layout(
    font_size=20,
    yaxis_range=[-1, 3],
    autosize=False,
    width=800,
    height=600,
)
fig.show()

# Current guess at which the descent direction is illustrated.
x = 4.3
fig = go.Figure()
fig.add_trace(f_line)
# Adding an arrow from x to x - grad_f(x), i.e. pointing along the negative gradient.
#  Note the arrow is just a direction along the x dimension 
#  (the y position is just for illustrative purposes)
fig.add_trace(go.Scatter(
    x=[x, x - grad_f(x)], y=[f(x), f(x)],  
    marker= dict(size=10,symbol= "arrow-bar-up", angleref="previous"),
    name="Negative Gradient"
    ))
# Add the Green circle for our guess
fig.add_trace(go.Scatter(x=[x],y=[f(x)], 
                         marker_color="green", marker_size=12,
                         mode="markers", name="x0"))
fig.update_layout(font_size = 20, yaxis_range=[-1, 3])
fig.update_layout(autosize=False, width=800, height=600)
fig

def take_one_step(x, derivative):
    """Return the next guess ``x - derivative(x)`` (a full, unscaled gradient step)."""
    return x - derivative(x)

# Start at 4.0 and record the guess after each of 10 unscaled gradient steps.
x = 4.0
steps = [x]
for i in range(10):
    x = take_one_step(x, grad_f)
    steps.append(x) 

print(steps)

[4.0, 4.4, 5.0464000000000055, 5.496730601062393, 5.0808624852305115, 5.489980392167775, 5.092824872119241, 5.486755386070718, 5.0984728528436225, 5.485072693208349, 5.101402551267881]

# This code is out-of-scope for data-100 but could be fun to learn.
def plot_steps(steps, f = f, f_line = f_line):
    """Draw an optimizer's path on top of a pre-built curve trace.

    Args:
        steps: Sequence of guesses, in the order they were visited.
        f: Function evaluated at each guess to get its height on the plot.
            Defaults to the module-level ``f`` as bound when this function
            was defined.
        f_line: Plotly trace of the curve drawn underneath the path.
            Defaults to the module-level ``f_line`` as bound at definition time.

    Returns:
        The plotly Figure containing the curve, a dashed red path with
        arrow markers, and a red dot at every guess.
    """
    # Evaluate the function once and share the heights between both path traces.
    path_heights = [f(s) for s in steps]

    fig = go.Figure()
    fig.add_trace(f_line)
    # Dashed path whose arrow markers are oriented relative to the previous point.
    fig.add_trace(go.Scatter(x = steps, y = path_heights,
                             mode = "lines+markers", line = {"dash": "dash", "color": "red"},
                             name = "Path",
                             marker_symbol="arrow",
                             marker_angleref="previous",
                             marker_standoff=4,
                             marker_size = 16))
    # Plain dots at the same positions; kept out of the legend to avoid a duplicate entry.
    fig.add_trace(go.Scatter(x = steps, y = path_heights,
                             mode = "markers",
                             name = "Path",
                             marker_color="red",
                             showlegend=False,
                             marker_size = 8))
    fig.update_layout(font_size = 20)
    fig.update_layout(autosize=False, width=800, height=600)

    return fig

# Show the recorded guesses on top of the curve of f.
plot_steps(steps)

def take_one_step_lr(x, alpha, derivative):
    """Return the next guess ``x - alpha * derivative(x)``.

    ``alpha`` is the learning rate that scales the gradient step.
    """
    scaled_step = alpha * derivative(x)
    return x - scaled_step

# Same starting point, but every step is scaled by a learning rate of 0.3.
x = 4.0
steps = [x]
for i in range(15):
    x = take_one_step_lr(x, alpha=0.3, derivative = grad_f)
    print(x)
    steps.append(x) 

plot_steps(steps)

4.12
4.267296639999997
4.442725838159953
4.640926244829146
4.846183704850335
5.032118544823421
5.17201478493924
5.2564844894138165
5.297911492494514
5.315427176589101
5.322260602055931
5.324832983472768
5.325787650752968
5.3261400404400865
5.326269854338316

def gradient_descent(grad_f, initial_guess, alpha, n):
    """Run gradient descent and return every guess visited.

    Starting from ``initial_guess``, repeatedly applies the update
    ``guess <- guess - alpha * grad_f(guess)``.

    Args:
        grad_f: Callable returning the gradient at a given guess.
        initial_guess: Starting point of the search.
        alpha: Learning rate multiplying each gradient.
        n: Total number of guesses to return, counting ``initial_guess``
            itself, so ``n - 1`` updates are performed. The initial guess is
            always included, even when ``n`` is less than 1.

    Returns:
        A numpy array of the guesses in the order they were visited.
    """
    guesses = [initial_guess]
    # Keep extending from the most recent guess until n guesses are recorded.
    while len(guesses) < n:
        latest = guesses[-1]
        guesses.append(latest - alpha * grad_f(latest))

    return np.array(guesses)

# 20 guesses starting from 1.6 with learning rate 0.75.
trajectory = gradient_descent(grad_f, 1.6, 0.75, 20)
print(trajectory)
plot_steps(trajectory)

[1.6        3.3112     3.18920918 3.01472352 2.79207742 2.56776716
 2.42826486 2.39421613 2.39274816 2.39274798 2.39274798 2.39274798
 2.39274798 2.39274798 2.39274798 2.39274798 2.39274798 2.39274798
 2.39274798 2.39274798]

# Same settings, but starting from 6 instead of 1.6.
trajectory = gradient_descent(grad_f, 6, 0.75, 20)
print(trajectory)
plot_steps(trajectory)

[6.         4.2        4.6086     5.12279483 5.38817984 5.28497822
 5.34793725 5.31315502 5.33375146 5.32197109 5.32885604 5.32488006
 5.32719254 5.32585303 5.32663079 5.32617982 5.32644152 5.32628973
 5.32637779 5.32632671]

# Switch to the tips dataset; `df` is rebound from here on.
df = sns.load_dataset("tips")
df.head()

# Feature and observed response read by the single-parameter loss functions below.
x = df["total_bill"]
y_obs = df["tip"]

def mse_single_arg(theta_1):
    """Mean squared error of the no-intercept model ``y_hat = theta_1 * x``.

    Reads the module-level feature ``x`` and observations ``y_obs``.
    """
    errors = theta_1 * x - y_obs
    return np.mean(errors ** 2)

# Candidate slopes at which the loss curve is evaluated.
thetas = np.linspace(-1.5, 1, 100)

# One MSE value per candidate slope; this trace is reused when plotting the descent path.
mse_line = go.Scatter(x = thetas, y = [mse_single_arg(theta_1) for theta_1 in thetas], mode = "lines", name = "MSE")
fig = go.Figure()
fig.add_trace(mse_line)
fig.update_layout(autosize=False, width=800, height=600, xaxis_title="theta_1", yaxis_title="MSE")

def grad_mse_single_arg(theta_1):
    """Derivative with respect to ``theta_1`` of the MSE of ``y_hat = theta_1 * x``.

    Reads the module-level feature ``x`` and observations ``y_obs``.
    """
    predictions = theta_1 * x
    residuals = y_obs - predictions
    return np.mean(-2 * residuals * x)

# 100 guesses for theta_1, starting from -0.5 with learning rate 0.0001.
trajectory = gradient_descent(grad_mse_single_arg, -0.5, 0.0001, 100)
print(f"Final guess for theta_1: {trajectory[-1]}")
# Draw the path on the MSE curve instead of on f.
plot_steps(trajectory,  mse_single_arg,  mse_line)

Final guess for theta_1: 0.14369554654231262

# Design matrix for the tips model: a constant "bias" column followed by total_bill.
# assign() returns a new frame, so `df` itself is left unchanged.
tips_with_bias = df.assign(bias=1)[["bias", "total_bill"]]
tips_with_bias.head()

# Module-level data read by mse_loss and mse_gradient.
X = tips_with_bias
y = df["tip"]

def mse_loss(theta):
    """Mean squared error of the linear model ``X @ theta`` against ``y``.

    Reads the module-level design matrix ``X`` and response ``y``.
    """
    residuals = y - X @ theta
    return np.mean(residuals ** 2)

import plotly.graph_objects as go

# 20 x 20 grid of candidate (theta0, theta1) pairs.
uvalues = np.linspace(-1, 5, 20)
vvalues = np.linspace(-0.1, 0.35, 20)
(u,v) = np.meshgrid(uvalues, vvalues)
# Shape (2, 400): row 0 holds the theta0 values, row 1 the theta1 values.
thetas = np.vstack((u.flatten(),v.flatten()))


# Loss at every grid point (each column of `thetas` is one candidate).
MSE = np.array([mse_loss(t) for t in thetas.T])

loss_surface = go.Surface(x=u, 
    y=v, z=np.reshape(MSE, u.shape),
    contours = {"z": {"show": True, "start": 0, "end": 50, "size": 2, "color": "white"}})

# This is an approximate guess for the optimal point.
# (it is the best point on the grid, not the exact minimizer)
ind = np.argmin(MSE)
optimal_point = go.Scatter3d(name = "Optimal Point",
    x = [thetas.T[ind,0]], y = [thetas.T[ind,1]], 
    z = [MSE[ind]],
    marker=dict(size=10, color="red"))

fig = go.Figure(data=[loss_surface, optimal_point])
fig.update_layout(scene = dict(
    xaxis_title = "theta0",
    yaxis_title = "theta1",
    zaxis_title = "MSE"), autosize=False, width=800, height=600)

fig.show()

# Same loss values as a 2-D contour plot; u[0] and v[:, 0] are the axis values of the grid.
contour = go.Contour(x=u[0], y=v[:, 0], z=np.reshape(MSE, u.shape), 
                     contours=dict(start=0, end=70,size=2))
fig = go.Figure(contour)
fig.update_layout(
    xaxis_title = "theta0",
    yaxis_title = "theta1", autosize=False, width=800, height=600)

fig.show()

def mse_gradient(theta):
    """Gradient of the MSE with respect to ``(theta[0], theta[1])``.

    The model is ``theta[0] + theta[1] * x1``, where ``x1`` is the second
    column of the module-level design matrix ``X`` and the targets are the
    module-level ``y``.
    """
    intercept, slope = theta[0], theta[1]
    # Column 0 of X is the constant term (1), so only column 1 is needed.
    feature = X.iloc[:, 1]
    # Factor shared by both partial derivatives: -2 * (y - y_hat).
    neg_two_residual = -2 * (y - (intercept + slope*feature))
    return np.array([np.mean(neg_two_residual),
                     np.mean(neg_two_residual * feature)])

# Gradient descent on both parameters: 10000 guesses from (1, 0.5) with learning rate 0.001.
guesses = gradient_descent(mse_gradient, np.array([1, .5]), 0.001, 10000)

# Last ten guesses, one row per iteration.
pd.DataFrame(guesses, columns=["theta_0", "theta_1"]).tail(10)

# Minimize the same loss with scipy, supplying only the loss function.
minimize(mse_loss, x0 = [0,0])

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.0360194420115247
        x: [ 9.203e-01  1.050e-01]
      nit: 3
      jac: [ 1.490e-08 -1.490e-08]
 hess_inv: [[ 2.980e+00 -1.253e-01]
            [-1.253e-01  6.335e-03]]
     nfev: 15
     njev: 5

# Same minimization, now also supplying the analytic gradient through `jac`.
minimize(mse_loss, x0 = [0,0],jac=mse_gradient)

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.036019442011377
        x: [ 9.203e-01  1.050e-01]
      nit: 3
      jac: [ 3.494e-16 -6.989e-16]
 hess_inv: [[ 2.980e+00 -1.253e-01]
            [-1.253e-01  6.335e-03]]
     nfev: 5
     njev: 5

def mse_gradient(theta, X, y):
    """Gradient of the MSE of ``theta[0]*x0 + theta[1]*x1`` against ``y``.

    ``x0`` and ``x1`` are the first and second columns of ``X`` (taken by
    position). Returns a length-2 numpy array of partial derivatives.
    """
    first_col, second_col = X.iloc[:, 0], X.iloc[:, 1]
    # Factor shared by both partial derivatives: -2 * (y - y_hat).
    neg_two_residual = -2 * (y - theta[0]*first_col - theta[1]*second_col)
    partials = [np.mean(neg_two_residual * column)
                for column in (first_col, second_col)]
    return np.array(partials)

def sgd(grad, X, y, initial_theta, eta = 0.3, max_iter = 5000, batch_size=50 ):
    """Stochastic gradient descent with a step size that decays as ``eta / t``.

    Args:
        grad: Callable ``grad(theta, X_batch, y_batch)`` returning the gradient
            of the loss on a batch.
        X: DataFrame of features; batches are drawn with ``X.sample``.
        y: Targets, looked up with ``y.loc`` using the sampled rows' index
            labels (so ``y`` must share ``X``'s index).
        initial_theta: Starting parameter value.
        eta: Base learning rate; the step at iteration ``t`` uses ``eta / t``.
        max_iter: Upper bound (exclusive) of the iteration counter, which
            starts at 1, so ``max_iter - 1`` updates are performed.
        batch_size: Number of rows sampled per update.

    Returns:
        A list of every parameter value visited, beginning with
        ``initial_theta``.
    """
    theta = initial_theta
    thetas = [theta]
    for t in range(1, max_iter):
        # Draw a random batch of rows and pick the matching targets by index label.
        X_sample = X.sample(batch_size)
        y_sample = y.loc[X_sample.index]
        # t starts at 1, so the first update uses the full eta and later ones shrink.
        theta = theta - eta/t * grad(theta, X_sample, y_sample)
        thetas.append(theta)
    return thetas

# Run SGD from (1, 0.5) using one randomly sampled row per update (batch_size=1).
thetas = sgd(mse_gradient, X, y, 
             initial_theta = np.array([1, .5]), 
             eta = 0.001, 
             max_iter = 10000,
             batch_size=1)
# Last five parameter values visited.
thetas[-5:]

[array([1.03058486, 0.10140384]),
 array([1.0305848 , 0.10140215]),
 array([1.03058478, 0.10140173]),
 array([1.03058465, 0.10140063]),
 array([1.03058457, 0.10139992])]

	species	island	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	sex
0	Adelie	Torgersen	39.1	18.7	181.0	3750.0	Male
1	Adelie	Torgersen	39.5	17.4	186.0	3800.0	Female
2	Adelie	Torgersen	40.3	18.0	195.0	3250.0	Female
4	Adelie	Torgersen	36.7	19.3	193.0	3450.0	Female
5	Adelie	Torgersen	39.3	20.6	190.0	3650.0	Male
...	...	...	...	...	...	...	...
147	Adelie	Dream	36.6	18.4	184.0	3475.0	Female
148	Adelie	Dream	36.0	17.8	195.0	3450.0	Female
149	Adelie	Dream	37.8	18.1	193.0	3750.0	Male
150	Adelie	Dream	36.0	17.1	187.0	3700.0	Female
151	Adelie	Dream	41.5	18.5	201.0	4000.0	Male

	flipper_length_mm	body_mass_g	bias
0	181.0	3750.0	1
1	186.0	3800.0	1
2	195.0	3250.0	1
4	193.0	3450.0	1
5	190.0	3650.0	1
...	...	...	...
147	184.0	3475.0	1
148	195.0	3450.0	1
149	193.0	3750.0	1
150	187.0	3700.0	1
151	201.0	4000.0	1

	bill_depth_mm	flipper_length_mm	body_mass_g	pred_bill_depth_mm
0	18.7	181.0	3750.0	18.322561
1	17.4	186.0	3800.0	18.445578
2	18.0	195.0	3250.0	17.721412
4	19.3	193.0	3450.0	17.997254
5	20.6	190.0	3650.0	18.263268
...	...	...	...	...
147	18.4	184.0	3475.0	17.945735
148	17.8	195.0	3450.0	18.016911
149	18.1	193.0	3750.0	18.440503
150	17.1	187.0	3700.0	18.307657
151	18.5	201.0	4000.0	18.888505

	bill_depth_mm	flipper_length_mm	body_mass_g	pred_bill_depth_mm	sklearn_preds
0	18.7	181.0	3750.0	18.322561	18.322561
1	17.4	186.0	3800.0	18.445578	18.445578
2	18.0	195.0	3250.0	17.721412	17.721412
4	19.3	193.0	3450.0	17.997254	17.997254
5	20.6	190.0	3650.0	18.263268	18.263268
...	...	...	...	...	...
147	18.4	184.0	3475.0	17.945735	17.945735
148	17.8	195.0	3450.0	18.016911	18.016911
149	18.1	193.0	3750.0	18.440503	18.440503
150	17.1	187.0	3700.0	18.307657	18.307657
151	18.5	201.0	4000.0	18.888505	18.888505

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 13 – Data 100, Fall 2024¶

Using scikit-learn to fit our Multiple Linear Regression Model¶

OLS Approach 1: Use Solution to Normal Equation¶

Make Predictions¶

Using SKLearn to fit our Multiple Linear Regression Model¶

Visualize the Fit¶

Minimizing an Arbitrary 1D Function¶

The Naive Approach: Guess and Check¶

`scipy.optimize.minimize`¶

Gradient Descent¶

Manually Descending the Gradient¶

Starting with an initial guess of 4.0 and taking 10 steps:¶

Visualizing the optimization steps¶

The Gradient Descent Algorithm¶

Gradient Descent on a 1D Model¶

Gradient Descent on Multi-Dimensional Models¶

Defining a 2D MSE Function¶

Applying Gradient Descent in 2D¶

Stochastic Gradient Descent on Multi-Dimensional Models¶

	theta_0	theta_1
9990	0.922487	0.104931
9991	0.922486	0.104931
9992	0.922485	0.104931
9993	0.922484	0.104931
9994	0.922484	0.104931
9995	0.922483	0.104931
9996	0.922482	0.104931
9997	0.922481	0.104931
9998	0.922481	0.104931
9999	0.922480	0.104932

Lecture 13 – Data 100, Fall 2024¶

Using scikit-learn to fit our Multiple Linear Regression Model¶

OLS Approach 1: Use Solution to Normal Equation¶

Make Predictions¶

Using SKLearn to fit our Multiple Linear Regression Model¶

Visualize the Fit¶

Minimizing an Arbitrary 1D Function¶

The Naive Approach: Guess and Check¶

`scipy.optimize.minimize`¶

Gradient Descent¶

Manually Descending the Gradient¶

Starting with an initial guess of 4.0 and taking 10 steps:¶

Visualizing the optimization steps¶

The Gradient Descent Algorithm¶

Gradient Descent on a 1D Model¶

Gradient Descent on Multi-Dimensional Models¶

Defining a 2D MSE Function¶

Applying Gradient Descent in 2D¶

Stochastic Gradient Descent on Multi-Dimensional Models¶

`scipy.optimize.minimize`¶