import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
Consider the penguins dataset.
df = sns.load_dataset("penguins")
df = df[df["species"] == "Adelie"].dropna()
df
 | species | island | bill_length_mm | bill_depth_mm | flipper_length_mm | body_mass_g | sex |
---|---|---|---|---|---|---|---|
0 | Adelie | Torgersen | 39.1 | 18.7 | 181.0 | 3750.0 | Male |
1 | Adelie | Torgersen | 39.5 | 17.4 | 186.0 | 3800.0 | Female |
2 | Adelie | Torgersen | 40.3 | 18.0 | 195.0 | 3250.0 | Female |
4 | Adelie | Torgersen | 36.7 | 19.3 | 193.0 | 3450.0 | Female |
5 | Adelie | Torgersen | 39.3 | 20.6 | 190.0 | 3650.0 | Male |
... | ... | ... | ... | ... | ... | ... | ... |
147 | Adelie | Dream | 36.6 | 18.4 | 184.0 | 3475.0 | Female |
148 | Adelie | Dream | 36.0 | 17.8 | 195.0 | 3450.0 | Female |
149 | Adelie | Dream | 37.8 | 18.1 | 193.0 | 3750.0 | Male |
150 | Adelie | Dream | 36.0 | 17.1 | 187.0 | 3700.0 | Female |
151 | Adelie | Dream | 41.5 | 18.5 | 201.0 | 4000.0 | Male |
146 rows × 7 columns
Suppose we could measure flippers and weight easily, but not bill dimensions. How can we predict bill depth from flipper length and/or body mass?
For demo purposes, we'll drop all columns except the variables whose relationships we're interested in modeling.
df = df[["bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
df
 | bill_depth_mm | flipper_length_mm | body_mass_g |
---|---|---|---|
0 | 18.7 | 181.0 | 3750.0 |
1 | 17.4 | 186.0 | 3800.0 |
2 | 18.0 | 195.0 | 3250.0 |
4 | 19.3 | 193.0 | 3450.0 |
5 | 20.6 | 190.0 | 3650.0 |
... | ... | ... | ... |
147 | 18.4 | 184.0 | 3475.0 |
148 | 17.8 | 195.0 | 3450.0 |
149 | 18.1 | 193.0 | 3750.0 |
150 | 17.1 | 187.0 | 3700.0 |
151 | 18.5 | 201.0 | 4000.0 |
146 rows × 3 columns
Suppose we want to create a linear regression model that predicts a penguin's bill depth $y$ using just their flipper length $x$ (plus an intercept term).
Based on our EDA, there is a linear relationship, though it is somewhat weak.
px.scatter(df, x = "flipper_length_mm", y = "bill_depth_mm")
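To put a number on "somewhat weak," we can also compute the correlation coefficient between the two columns. This is a small sketch using pandas' built-in corr (the exact value depends on the Adelie subset loaded above); it is the same r that appears in the analytic formulas below.
# Pearson correlation between flipper length and bill depth for the Adelie subset.
df["flipper_length_mm"].corr(df["bill_depth_mm"])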
Let $\hat{\theta}_0$ and $\hat{\theta}_1$ be the choices that minimize the Mean Squared Error.
One approach to compute $\hat{\theta}_0$ and $\hat{\theta}_1$ is analytically, using the equations we derived in a previous lecture:
$$\hat{\theta}_0 = \bar{y} - \hat{\theta}_1 \bar{x}$$
$$\hat{\theta}_1 = r \frac{\sigma_y}{\sigma_x}$$
$$r = \frac{1}{n} \sum_{i=1}^{n}\left( \frac{x_i - \bar{x}}{\sigma_x} \right) \left( \frac{y_i - \bar{y}}{\sigma_y} \right)$$
x = df["flipper_length_mm"]
y = df["bill_depth_mm"]
x_bar, sigma_x = np.mean(x), np.std(x)
y_bar, sigma_y = np.mean(y), np.std(y)
r = np.sum((x - x_bar) / sigma_x * (y - y_bar) / sigma_y) / len(x)
theta1_hat = r * sigma_y / sigma_x
theta1_hat
0.05812622369506766
theta0_hat = y_bar - theta1_hat * x_bar
theta0_hat
7.297305899612306
df["pred_bill_depth_mm"] = theta0_hat + theta1_hat * df["flipper_length_mm"]
df["residual"] = df["bill_depth_mm"] - df["pred_bill_depth_mm"]
df
 | bill_depth_mm | flipper_length_mm | body_mass_g | pred_bill_depth_mm | residual |
---|---|---|---|---|---|
0 | 18.7 | 181.0 | 3750.0 | 17.818152 | 0.881848 |
1 | 17.4 | 186.0 | 3800.0 | 18.108784 | -0.708784 |
2 | 18.0 | 195.0 | 3250.0 | 18.631920 | -0.631920 |
4 | 19.3 | 193.0 | 3450.0 | 18.515667 | 0.784333 |
5 | 20.6 | 190.0 | 3650.0 | 18.341288 | 2.258712 |
... | ... | ... | ... | ... | ... |
147 | 18.4 | 184.0 | 3475.0 | 17.992531 | 0.407469 |
148 | 17.8 | 195.0 | 3450.0 | 18.631920 | -0.831920 |
149 | 18.1 | 193.0 | 3750.0 | 18.515667 | -0.415667 |
150 | 17.1 | 187.0 | 3700.0 | 18.166910 | -1.066910 |
151 | 18.5 | 201.0 | 4000.0 | 18.980677 | -0.480677 |
146 rows × 5 columns
Mean Squared Error: We can compute this explicitly by averaging the square of the residuals $e_i$:
$$\large MSE = \frac{1}{n}\sum_{i=1}^n (y_i - \hat{y}_i)^2 = \frac{1}{n}\sum_{i=1}^n e_i^2$$
np.mean(df["residual"]**2)
1.3338778799806374
We can also visualize the output of the model:
sns.scatterplot(data = df, x = "flipper_length_mm", y = "bill_depth_mm")
plt.plot(df["flipper_length_mm"], df["pred_bill_depth_mm"], 'r') # a line
plt.legend([r'$\hat{y}$', '$y$']);
Or in plotly:
import plotly.graph_objects as go
scatter_plot = px.scatter(df, x="flipper_length_mm", y="bill_depth_mm")
line_plot = px.line(df, x="flipper_length_mm", y="pred_bill_depth_mm")
line_plot.update_traces(line=dict(color = 'rgba(255, 0, 0)'))
go.Figure(data=scatter_plot.data + line_plot.data)
An alternate approach to optimize our model is to use the sklearn.linear_model.LinearRegression class.
from sklearn.linear_model import LinearRegression
Create an sklearn object.
First we create a model. At this point it is like a baby, knowing nothing.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
fit the object to data.
Then we "fit" the model, which means computing the parameters that minimize the loss function. The LinearRegression
class is hard coded to use the MSE as its loss function. The first argument of the fit function should be a matrix (or DataFrame), and the second should be the observations we're trying to predict.
Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
X = df[["flipper_length_mm"]]
y = df["bill_depth_mm"]
model.fit(X, y)
Analyze fit or call predict.
Now that our model is trained, we can ask it questions. The code below asks the model to estimate the bill depth (in mm) for a penguin with a 185 mm flipper length.
model.predict([[185]]) # why the double brackets?
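Why the double brackets? predict, like fit, expects a two-dimensional input: one row per sample and one column per feature. A quick illustration of the shapes involved (a sketch using numpy, which we imported above):
# [[185]] is a 2-D input with 1 sample and 1 feature, matching the shape of
# the DataFrame we passed to fit. A flat [185] would be treated as 1-D.
np.array([[185]]).shape   # (1, 1)
np.array([185]).shape     # (1,)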
We can also ask the model to generate a series of predictions:
df["sklearn_preds"] = model.predict(df[["flipper_length_mm"]])
df
Looking at the predictions, we see that they are exactly the same as we got using our analytic formula!!
Analyze parameters.
We can ask the model for its intercept and slope with intercept_ and coef_, respectively.
model.intercept_ # why is this a scalar?
model.coef_ # why is this an array?
They are the same as with our analytic formula.
# vs. analytical solutions
theta0_hat, theta1_hat
Analyze performance.
The sklearn package also provides a function that computes the MSE from a list of observations and predictions. This saves us from having to compute the MSE manually by first computing residuals.
from sklearn.metrics import mean_squared_error
mean_squared_error(df["bill_depth_mm"], df["sklearn_preds"])
Let's start by reloading our data to remove the columns we added earlier.
df = sns.load_dataset("penguins")
df = df[df["species"] == "Adelie"].dropna()
df = df[["bill_depth_mm", "flipper_length_mm", "body_mass_g"]]
df
Suppose we want to create a linear regression model that predicts a penguin's bill depth $y$ using both their flipper length $x_1$ and body mass $x_2$, plus an intercept term.
We saw in last lecture that we can model the above multiple linear regression equation using matrix multiplication:
$$ \large \hat{\mathbb{Y}} = \mathbb{X} \theta$$
The optimal $\hat{\theta}$ that minimizes MSE also solves the normal equation:
$$ \left( \mathbb{X}^T \mathbb{X} \right) \hat{\theta} = \mathbb{X}^T \mathbb{Y}$$
If $\mathbb{X}$ is full column rank, then there is a unique solution for $\hat{\theta}$:
$$\large \hat{\theta} = \left( \mathbb{X}^T \mathbb{X} \right)^{-1} \mathbb{X}^T \mathbb{Y}$$
Note that this derivation is one of the most challenging in the course, especially for those of you who are learning linear algebra during the same semester.
Let's try this in code. We will add a bias term, both so that we actually represent the linear equation we proposed (with an intercept) and so that our residuals sum to zero.
X = df[["flipper_length_mm", "body_mass_g"]]
X["bias"] = 1
X
y = df["bill_depth_mm"]
y
Recall the solution to the normal equation if $\mathbb{X}$ is full column rank:
$$\hat{\theta} = \left( \mathbb{X}^T \mathbb{X} \right)^{-1} \mathbb{X}^T \mathbb{Y}$$
theta_using_normal_equation = np.linalg.inv(X.T @ X) @ X.T @ y
theta_using_normal_equation.values
Note: The code above is inefficient. We won't go into this in our class, but it's better to use np.linalg.solve rather than computing the explicit matrix inverse. This approach also fails if $\mathbb{X}^T \mathbb{X}$ is not invertible (i.e., if $\mathbb{X}$ is not full column rank). You will explore this in lab.
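As a rough sketch of the more efficient approach mentioned above, we can solve the normal equation directly instead of forming the explicit inverse (this reuses the X and y defined above; the result should match theta_using_normal_equation up to floating point error):
# Solve (X^T X) theta = X^T y without explicitly inverting X^T X.
theta_using_solve = np.linalg.solve(X.T @ X, X.T @ y)
theta_using_solve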
Predictions: We'll save the predictions in a column so we can compare them against the ones we get using sklearn.
df["pred_bill_depth_mm"] = X.to_numpy() @ theta_using_normal_equation
df
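As a quick check of the earlier claim that including a bias term makes the residuals sum to zero, we can add them up (a sketch reusing the columns we just computed):
# With a bias column in the design matrix, the least squares residuals
# sum to zero (up to floating point error).
(df["bill_depth_mm"] - df["pred_bill_depth_mm"]).sum()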
We can actually compute the optimal parameters very easily using sklearn, using the exact code that we wrote earlier.
Note: sklearn's LinearRegression does not apply the normal equation directly; under the hood it uses a numerical least-squares solver. Much of the rest of machine learning, however, relies on a procedure called gradient descent, which can minimize nearly any differentiable function, not just the MSE. The rest of the lecture will explain how gradient descent works.
# Creating our design matrix.
# Note:
# - no bias column needed here, because sklearn fits the intercept for us
# - now the double brackets make sense: we're selecting a 2-D (n x 2) feature matrix!
X_2d = df[["flipper_length_mm", "body_mass_g"]]
y = df["bill_depth_mm"]
model_2d = LinearRegression() # note fit_intercept=True by default
model_2d.fit(X_2d, y)
Now that we have this model, we can make predictions. For example, we can ask our model about a penguin's bill depth if it has a 185 mm flipper length and a 3750 g body mass.
df["sklearn_predictions"] = model_2d.predict(X_2d)
model_2d.predict([[185, 3750]]) # why double brackets?
Note that this prediction is slightly different from our first 1-D model (flipper length only):
model.predict([[185]]) # why double brackets?
Compare analytical and sklearn predictions
df
Get the coefficients from sklearn.
$$\hat{y} = \theta_0 + \theta_1 x_1 + \theta_2 x_2$$
We can get $\theta_0$ with intercept_ and $\theta_1$ and $\theta_2$ with coef_:
print(f"theta0: {model_2d.intercept_}")
print(f"theta1 and theta2: {model_2d.coef_}")
theta_using_sklearn = [model_2d.intercept_, model_2d.coef_[0], model_2d.coef_[1]]
theta_using_sklearn
theta_using_normal_equation
They look the same, but why are they out of order...?
Remember the order of columns we used for the normal equation!
X
from mpl_toolkits.mplot3d import Axes3D # noqa: F401 unused import
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df["flipper_length_mm"], df["body_mass_g"], df["bill_depth_mm"])
plt.xlabel('flipper_length_mm')
plt.ylabel('body_mass_g')
df[["flipper_length_mm", "body_mass_g"]]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df["flipper_length_mm"], df["body_mass_g"], df["bill_depth_mm"])
xx, yy = np.meshgrid(range(170, 220, 10), range(2500, 4500, 100))
zz = (11.0029 + 0.00982 * xx + 0.001477 * yy)  # hard-coded values of theta_using_sklearn
ax.plot_surface(xx, yy, zz, alpha=0.2)
plt.xlabel('flipper_length_mm')
plt.ylabel('body_mass_g')
plt.gcf().savefig("plane.png", dpi = 300, bbox_inches = "tight")
We see that the predictions all lie in a plane. In higher dimensions, they all lie in a "hyperplane".
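As a sanity check that the predictions really do lie on the plane $\hat{y} = \theta_0 + \theta_1 x_1 + \theta_2 x_2$, we can recompute them directly from the fitted parameters (a sketch reusing model_2d and X_2d from above):
# The fitted values are an affine function of the two features,
# i.e. they lie on the plane defined by intercept_ and coef_.
manual_preds = model_2d.intercept_ + X_2d @ model_2d.coef_
np.allclose(manual_preds, df["sklearn_predictions"])   # True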
Suppose we want to find the minimum of the arbitrary function given below:
def arbitrary(x):
return (x**4 - 15*x**3 + 80*x**2 - 180*x + 144)/10
x = np.linspace(1, 6.75, 200)
fig = px.line(y = arbitrary(x), x = x)
fig.update_layout(font_size = 16)
One way to minimize this mathematical function is to use the scipy.optimize.minimize function. It takes a function and a starting guess and tries to find the minimum.
from scipy.optimize import minimize
minimize(arbitrary, x0 = 3.5)
fun: -0.13827491292966557
hess_inv: array([[0.73848255]])
jac: array([6.48573041e-06])
message: 'Optimization terminated successfully.'
nfev: 20
nit: 3
njev: 10
status: 0
success: True
x: array([2.39275266])
Our choice of start point can affect the outcome. For example, if we start to the left, we get stuck in the local minimum on the left side.
minimize(arbitrary, x0 = 1)
fun: -0.13827491294422317
hess_inv: array([[0.74751575]])
jac: array([-3.7997961e-07])
message: 'Optimization terminated successfully.'
nfev: 16
nit: 7
njev: 8
status: 0
success: True
x: array([2.3927478])
scipy.optimize.minimize is great. It may also seem a bit magical. How could you write a function that can find the minimum of any mathematical function? There are a number of ways to do this, which we'll explore in today's lecture, eventually arriving at the important idea of gradient descent, which is closely related to what scipy.optimize.minimize does under the hood.
Gradient descent (and its variants) is also how much of machine learning is fit in practice, including even advanced neural network models. (sklearn's LinearRegression happens to have a closed-form solution and uses a direct least-squares solver, but most models do not, and gradient descent is the workhorse.)
In DS100, the gradient descent process will usually be invisible to us, hidden beneath an abstraction layer. However, to be good data scientists, it's important that we understand the basic principles behind the optimization routines that we harness to find optimal parameters.
Above, we saw that the minimum is somewhere around 5.3ish. Let's see if we can figure out how to find the exact minimum algorithmically from scratch.
One very slow and terrible way would be manual guess-and-check.
arbitrary(6)
0.0
A somewhat better approach is to use brute force to try out a bunch of x values and return the one that yields the lowest loss.
def simple_minimize(f, xs):
y = [f(x) for x in xs]
return xs[np.argmin(y)]
simple_minimize(arbitrary, np.linspace(1, 7, 20))
5.421052631578947
This process is essentially the same as before, when we plotted the whole function; it's just that we only evaluate the function at a few selected points.
xs = np.linspace(1, 7, 200)
sparse_xs = np.linspace(1, 7, 5)
ys = arbitrary(xs)
sparse_ys = arbitrary(sparse_xs)
fig = px.line(x = xs, y = arbitrary(xs))
fig.add_scatter(x = sparse_xs, y = arbitrary(sparse_xs), mode = "markers")
fig.update_layout(showlegend= False)
fig.show()
This basic approach suffers from three major flaws:
1. If the minimum lies outside our range of guesses, the answer will be completely wrong.
2. Even if the minimum lies inside our range of guesses, the guesses form a coarse grid, so the answer will only be approximate.
3. It is computationally wasteful: we evaluate the function at many points that turn out to be nowhere near the minimum.
Instead of choosing all of our guesses ahead of time, we can start from a single guess and try to iteratively improve on our choice.
The key insight is this: if the derivative of the function is negative, the function is decreasing, so we should go to the right (i.e., pick a bigger x). If the derivative of the function is positive, the function is increasing, so we should go to the left (i.e., pick a smaller x).
Thus, the derivative tells us which way to go.
#desmos demo: https://www.desmos.com/calculator/twpnylu4lr
import plotly.graph_objects as go
def derivative_arbitrary(x):
return (4*x**3 - 45*x**2 + 160*x - 180)/10
fig = go.Figure()
roots = np.array([2.3927, 3.5309, 5.3263])
fig.add_trace(go.Scatter(x = xs, y = arbitrary(xs),
mode = "lines", name = "f"))
fig.add_trace(go.Scatter(x = xs, y = derivative_arbitrary(xs),
mode = "lines", name = "df", line = {"dash": "dash"}))
fig.add_trace(go.Scatter(x = np.array(roots), y = 0*roots,
mode = "markers", name = "df = zero", marker_size = 12))
fig.update_layout(font_size = 20, yaxis_range=[-1, 3])
fig.show()
import matplotlib.pyplot as plt
def plot_arbitrary():
x = np.linspace(1, 7, 100)
plt.plot(x, arbitrary(x))
axes = plt.gca()
axes.set_ylim([-1, 3])
def plot_x_on_f(f, x):
y = f(x)
default_args = dict(label=r'$ \theta $', zorder=2,
s=200, c=sns.xkcd_rgb['green'])
plt.scatter([x], [y], **default_args)
def plot_x_on_f_empty(f, x):
y = f(x)
default_args = dict(label=r'$ \theta $', zorder=2,
s=200, c = 'none', edgecolor=sns.xkcd_rgb['green'])
plt.scatter([x], [y], **default_args)
def plot_tangent_on_f(f, x, eps=1e-6):
slope = ((f(x + eps) - f(x - eps))
/ (2 * eps))
xs = np.arange(x - 1, x + 1, 0.05)
ys = f(x) + slope * (xs - x)
plt.plot(xs, ys, zorder=3, c=sns.xkcd_rgb['green'], linestyle='--')
plot_arbitrary()
plot_x_on_f(arbitrary, 2)
plot_tangent_on_f(arbitrary, 2)
plt.xlabel('x')
plt.ylabel('f(x)');
plot_arbitrary()
plot_x_on_f(arbitrary, 4.4)
plot_tangent_on_f(arbitrary, 4.4)
Armed with this knowledge, let's try to see if we can use the derivative to optimize the function.
guess = 4
print(f"x: {guess}, f(x): {arbitrary(guess)}, derivative f'(x): {derivative_arbitrary(guess)}")
x: 4, f(x): 0.0, derivative f'(x): -0.4
guess = 4 + 0.4
print(f"x: {guess}, f(x): {arbitrary(guess)}, derivative f'(x): {derivative_arbitrary(guess)}")
x: 4.4, f(x): -0.21504000000003315, derivative f'(x): -0.6464000000000055
def plot_one_step(x):
new_x = x - derivative_arbitrary(x)
plot_arbitrary()
plot_x_on_f(arbitrary, new_x)
plot_x_on_f_empty(arbitrary, x)
print(f'old x: {x}')
print(f'new x: {new_x}')
plot_one_step(4)
old x: 4 new x: 4.4
plot_one_step(4.4)
old x: 4.4 new x: 5.0464000000000055
plot_one_step(5.0464)
old x: 5.0464 new x: 5.49673060106241
plot_one_step(5.4967)
old x: 5.4967 new x: 5.080917145374805
plot_one_step(5.080917145374805)
old x: 5.080917145374805 new x: 5.489966698640582
plot_one_step(5.489966698640582)
old x: 5.489966698640582 new x: 5.092848945470474
def plot_one_step_better(x):
new_x = x - 0.3 * derivative_arbitrary(x)
plot_arbitrary()
plot_x_on_f(arbitrary, new_x)
plot_x_on_f_empty(arbitrary, x)
print(f'old x: {x}')
print(f'new x: {new_x}')
plot_one_step_better(4)
old x: 4 new x: 4.12
plot_one_step_better(4.12)
old x: 4.12 new x: 4.267296639999997
plot_one_step_better(5.17180969114245)
old x: 5.17180969114245 new x: 5.256374838146257
plot_one_step_better(5.323)
old x: 5.323 new x: 5.325108157959999
Written as a recurrence relation, the process we've described above is:
$$ x^{(t+1)} = x^{(t)} - 0.3 \frac{d}{dx} f(x^{(t)}) $$
This algorithm is also known as "gradient descent".
Given a current $x$, gradient descent creates its next guess for $x$ based on the sign and magnitude of the derivative.
Our choice of 0.3 above was totally arbitrary. Naturally, we can generalize by replacing it with a parameter, typically represented by $\alpha$, and often called the learning rate.
We can also write up this procedure in code as given below:
def gradient_descent(df, initial_guess, alpha, n):
"""Performs n steps of gradient descent on df using learning rate alpha starting
from initial_guess. Returns a numpy array of all guesses over time."""
guesses = [initial_guess]
current_guess = initial_guess
while len(guesses) < n:
current_guess = current_guess - alpha * df(current_guess)
guesses.append(current_guess)
return np.array(guesses)
trajectory = gradient_descent(derivative_arbitrary, 4, 0.3, 20)
trajectory
array([4. , 4.12 , 4.26729664, 4.44272584, 4.64092624, 4.8461837 , 5.03211854, 5.17201478, 5.25648449, 5.29791149, 5.31542718, 5.3222606 , 5.32483298, 5.32578765, 5.32614004, 5.32626985, 5.32631764, 5.32633523, 5.3263417 , 5.32634408])
trajectory = gradient_descent(derivative_arbitrary, 4, 1, 20)
trajectory
array([4. , 4.4 , 5.0464 , 5.4967306 , 5.08086249, 5.48998039, 5.09282487, 5.48675539, 5.09847285, 5.48507269, 5.10140255, 5.48415922, 5.10298805, 5.48365325, 5.10386474, 5.48336998, 5.1043551 , 5.48321045, 5.10463112, 5.48312031])
Below, we see a visualization of the trajectory taken by this algorithm.
trajectory = gradient_descent(derivative_arbitrary, 1.6, 0.6, 20)
plot_arbitrary()
plt.plot(trajectory, arbitrary(trajectory));
Above, we've simply run our algorithm a fixed number of times. More sophisticated implementations stop based on a variety of stopping criteria, e.g., the improvement between iterations becoming very small, or the error blowing up (diverging). We will not discuss these in detail in our course; a small sketch of one such rule is given below.
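For illustration only, here is a minimal sketch of one such stopping rule: a hypothetical variant of our gradient_descent function (the name gradient_descent_tol and the parameters tol and max_steps are made up for this sketch) that stops once the update becomes smaller than a tolerance.
def gradient_descent_tol(df, initial_guess, alpha, tol=1e-8, max_steps=10000):
    """Like gradient_descent, but stops when the step size drops below tol
    (or after max_steps updates). df is the derivative of the function to minimize."""
    guesses = [initial_guess]
    current_guess = initial_guess
    for _ in range(max_steps):
        step = alpha * df(current_guess)
        current_guess = current_guess - step
        guesses.append(current_guess)
        if abs(step) < tol:  # converged: the guesses have effectively stopped moving
            break
    return np.array(guesses)
For example, gradient_descent_tol(derivative_arbitrary, 4, 0.3) should settle near the same minimum around 5.33 without us having to choose the number of steps in advance.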
In the next part, we'll return to the world of real data and see how this procedure might be useful for optimizing models.
Let's consider a case where we have a linear model with no offset.
$$\hat{y} = \theta_1 x$$
We want to find the parameter $\theta_1$ that minimizes the average L2 loss, i.e. the MSE. In sklearn, this is easy: to avoid fitting an intercept, we set fit_intercept to False.
model = LinearRegression(fit_intercept = False)
df = sns.load_dataset("tips")
model.fit(df[["total_bill"]], df["tip"])
model.coef_
array([0.1437319])
In other words, the MSE-minimizing tip rate is about 14.37% of the total bill.
To employ gradient descent and do this ourselves, we need to define a function upon which we can use gradient descent.
Suppose we select the L2 loss as our loss function. In this case, our goal will be to minimize the mean squared error.
Let's start by writing a function that computes the MSE for a given choice of $\theta_1$ on our dataset.
def mse_loss(theta1, x, y_obs):
y_hat = theta1 * x
return np.mean((y_hat - y_obs) ** 2)
x = df["total_bill"]
y_obs = df["tip"]
mse_loss(0.147, x, y_obs)
1.1831403511422127
Since x and y_obs are fixed, the only variable is theta1.
For clarity, let's define a python function that returns the MSE as a function of a single argument theta1.
def mse_single_arg(theta1):
"""Returns the MSE on our data for the given theta1"""
x = df["total_bill"]
y_obs = df["tip"]
y_hat = theta1 * x
return np.mean((y_hat - y_obs) ** 2)
mse_single_arg(0.1437)
1.1781165940051925
Thus we can plot the MSE as a function of theta1. It turns out to look pretty smooth, and quite similar to a parabola.
theta1s = np.linspace(0, 0.2, 200)
x = df["total_bill"]
y_obs = df["tip"]
MSEs = [mse_single_arg(theta1) for theta1 in theta1s]
plt.plot(theta1s, MSEs)
plt.xlabel(r"Choice for $\theta_1$")
plt.ylabel(r"MSE");
The minimum appears to be around $\theta_1 = 0.14$.
Recall our simple_minimize function from earlier, redefined below for your convenience.
def simple_minimize(f, xs):
y = [f(x) for x in xs]
return xs[np.argmin(y)]
simple_minimize(mse_single_arg, np.linspace(0, 0.2, 21))
0.14
As before, what we're doing is computing all the starred values below and then returning the $\theta_1$ that goes with the minimum value.
theta1s = np.linspace(0, 0.2, 200)
sparse_theta1s = np.linspace(0, 0.2, 21)
loss = [mse_single_arg(theta1) for theta1 in theta1s]
sparse_loss = [mse_single_arg(theta1) for theta1 in sparse_theta1s]
plt.plot(theta1s, loss)
plt.plot(sparse_theta1s, sparse_loss, 'r*')
plt.xlabel(r"Choice for $\theta_1$")
plt.ylabel(r"MSE");
import scipy.optimize
from scipy.optimize import minimize
minimize(mse_single_arg, x0 = 0)
fun: 1.178116115451325
hess_inv: array([[1]])
jac: array([3.20374966e-06])
message: 'Optimization terminated successfully.'
nfev: 6
nit: 1
njev: 3
status: 0
success: True
x: array([0.14373189])
Another approach is to use our 1D gradient descent algorithm from earlier. This is the exact same function as earlier, but copy and pasted to remind you what it looks like.
def gradient_descent(df, initial_guess, alpha, n):
"""Performs n steps of gradient descent on df using learning rate alpha starting
from initial_guess. Returns a numpy array of all guesses over time."""
guesses = [initial_guess]
current_guess = initial_guess
while len(guesses) < n:
current_guess = current_guess - alpha * df(current_guess)
guesses.append(current_guess)
return np.array(guesses)
To use this function, we need to compute the derivative of the MSE. The MSE is repeated below for convenience.
def mse_single_arg(theta1):
"""Returns the MSE on our data for the given theta1"""
x = df["total_bill"]
y_obs = df["tip"]
y_hat = theta1 * x
return np.mean((y_hat - y_obs) ** 2)
Since $MSE(\theta_1) = \frac{1}{n}\sum_{i=1}^n (\theta_1 x_i - y_i)^2$, its derivative with respect to $\theta_1$ is $\frac{d}{d\theta_1} MSE(\theta_1) = \frac{1}{n}\sum_{i=1}^n 2(\theta_1 x_i - y_i)\,x_i$, which in code is:
def mse_loss_derivative_single_arg(theta_1):
"""Returns the derivative of the MSE on our data for the given theta1"""
x = df["total_bill"]
y_obs = df["tip"]
y_hat = theta_1 * x
return np.mean(2 * (y_hat - y_obs) * x)
gradient_descent(mse_loss_derivative_single_arg, 0.05, 0.0001, 100)
array([0.05 , 0.05881852, 0.06680736, 0.0740446 , 0.08060095, 0.08654045, 0.09192116, 0.09679563, 0.10121151, 0.10521192, 0.10883597, 0.11211906, 0.11509327, 0.11778766, 0.12022855, 0.1224398 , 0.12444301, 0.12625776, 0.12790176, 0.1293911 , 0.13074031, 0.13196259, 0.13306988, 0.13407298, 0.13498172, 0.13580495, 0.13655074, 0.13722636, 0.13783841, 0.13839289, 0.13889519, 0.13935024, 0.13976248, 0.14013593, 0.14047425, 0.14078073, 0.14105839, 0.14130992, 0.14153778, 0.14174421, 0.14193122, 0.14210063, 0.1422541 , 0.14239314, 0.14251909, 0.14263319, 0.14273656, 0.1428302 , 0.14291504, 0.14299189, 0.14306151, 0.14312458, 0.14318172, 0.14323348, 0.14328037, 0.14332285, 0.14336134, 0.1433962 , 0.14342778, 0.14345639, 0.14348231, 0.1435058 , 0.14352707, 0.14354634, 0.1435638 , 0.14357961, 0.14359394, 0.14360692, 0.14361868, 0.14362933, 0.14363898, 0.14364772, 0.14365564, 0.14366281, 0.14366931, 0.1436752 , 0.14368053, 0.14368537, 0.14368974, 0.14369371, 0.1436973 , 0.14370056, 0.14370351, 0.14370618, 0.1437086 , 0.14371079, 0.14371277, 0.14371457, 0.1437162 , 0.14371768, 0.14371902, 0.14372023, 0.14372133, 0.14372232, 0.14372322, 0.14372404, 0.14372478, 0.14372545, 0.14372605, 0.1437266 ])
In the context of minimizing loss, we can write out the gradient descent rule for generating the next $\theta_1$ as:
$$ \theta_1^{(t+1)} = \theta_1^{(t)} - \alpha \frac{\partial}{\partial \theta_1} L(\theta^{(t)}, \mathbb{X}, \vec{y}) $$
Here $L$ is our chosen loss function, $\mathbb{X}$ are the features of our input, and $\vec{y}$ are our observations. During the gradient descent algorithm, we treat $\mathbb{X}$ and $\vec{y}$ as constants.
Now suppose we improve our model so that we want to predict the tip from the total_bill plus a constant offset, in other words:
$$\textrm{tip} = \theta_0 + \theta_1 \cdot \textrm{bill}$$
We know how to do this in sklearn:
model = LinearRegression(fit_intercept = True)
X = df[["total_bill"]]
y = df["tip"]
model.fit(X, y)
LinearRegression()
model.coef_
array([0.10502452])
model.intercept_
0.9202696135546731
To make our lives easier, let's reframe the problem slightly: we'll add an explicit bias column of all 1s to the design matrix, so the intercept $\theta_0$ becomes just another coefficient:
tips_with_bias = df.copy()
tips_with_bias["bias"] = 1
X = tips_with_bias[["bias", "total_bill"]]
X.head(5)
 | bias | total_bill |
---|---|---|
0 | 1 | 16.99 |
1 | 1 | 10.34 |
2 | 1 | 21.01 |
3 | 1 | 23.68 |
4 | 1 | 24.59 |
model = LinearRegression(fit_intercept = False)
X = tips_with_bias[["bias", "total_bill"]]
y = df["tip"]
model.fit(X, y)
LinearRegression(fit_intercept=False)
model.coef_
array([0.92026961, 0.10502452])
Now, we can give our predictions as $$\hat{y} = \theta_0 + \theta_1 \cdot \textrm{bill}$$
For example, the predictions below correspond to assuming every table leaves a tip of \$1.50 plus 5% of their total bill.
X @ np.array([1.5, 0.05])
0 2.3495 1 2.0170 2 2.5505 3 2.6840 4 2.7295 ... 239 2.9515 240 2.8590 241 2.6335 242 2.3910 243 2.4390 Length: 244, dtype: float64
Throughout this problem, we'll assume we want to minimize the mean squared error of our predictions, i.e.
$$MSE(\theta) = \frac{1}{n} \sum_{i=1}^n \left(y_i - (\mathbb{X}\theta)_i\right)^2$$
In code:
def mse_loss(theta, X, y_obs):
y_hat = X @ theta
return np.mean((y_hat - y_obs) ** 2)
For example, the loss assuming the model described above is:
mse_loss(np.array([1.5, 0.05]), X, y_obs)
1.5340521752049179
Using this function, we can create a 3D plot of the loss surface. This uses plotting syntax we haven't covered.
import plotly.graph_objects as go
uvalues = np.linspace(0, 2, 10)
vvalues = np.linspace(0, 0.2, 10)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
def mse_loss_single_arg(theta):
return mse_loss(theta, X, y_obs)
MSE = np.array([mse_loss_single_arg(t) for t in thetas.T])
loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))
ind = np.argmin(MSE)
optimal_point = go.Scatter3d(name = "Optimal Point",
x = [thetas.T[ind,0]], y = [thetas.T[ind,1]],
z = [MSE[ind]],
marker=dict(size=10, color="red"))
fig = go.Figure(data=[loss_surface, optimal_point])
fig.update_layout(scene = dict(
xaxis_title = "theta0",
yaxis_title = "theta1",
zaxis_title = "MSE"))
fig.show()
Below is the gradient function we just derived, written in code. For our two-parameter model (bias $x_{i,0} = 1$ and total bill $x_{i,1}$), the partial derivatives are:
$$\frac{\partial}{\partial \theta_0} MSE = \frac{1}{n}\sum_{i=1}^n -2\left(y_i - \theta_0 x_{i,0} - \theta_1 x_{i,1}\right) x_{i,0} \qquad \frac{\partial}{\partial \theta_1} MSE = \frac{1}{n}\sum_{i=1}^n -2\left(y_i - \theta_0 x_{i,0} - \theta_1 x_{i,1}\right) x_{i,1}$$
def mse_gradient(theta, X, y_obs):
"""Returns the gradient of the MSE on our data for the given theta"""
x0 = X.iloc[:, 0]
x1 = X.iloc[:, 1]
dth0 = np.mean(-2 * (y_obs - theta[0]*x0 - theta[1]*x1) * x0)
dth1 = np.mean(-2 * (y_obs - theta[0]*x0 - theta[1]*x1) * x1)
return np.array([dth0, dth1])
X = tips_with_bias[["bias", "total_bill"]]
y_obs = tips_with_bias["tip"]
mse_gradient(np.array([0, 0]), X, y_obs)
array([ -5.99655738, -135.22631803])
def mse_gradient_single_arg(theta):
"""Returns the gradient of the MSE on our data for the given theta"""
X = tips_with_bias[["bias", "total_bill"]]
y_obs = tips_with_bias["tip"]
return mse_gradient(theta, X, y_obs)
mse_gradient_single_arg(np.array([0, 0]))
array([ -5.99655738, -135.22631803])
guesses = gradient_descent(mse_gradient_single_arg, np.array([0, 0]), 0.001, 10000)
pd.DataFrame(guesses).tail(10)
If you play around with the code above, you'll see that it's pretty finicky about the start point and learning rate.
For reference, the general matrix form of the gradient is given below. We have not discussed how to derive this in class.
def mse_gradient(theta, X, y_obs):
"""Returns the gradient of the MSE on our data for the given theta"""
n = len(X)
return -2 / n * (X.T @ y_obs - X.T @ X @ theta)
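As a quick sanity check (a sketch reusing the X and y_obs defined above), this matrix form should agree with the elementwise version we evaluated earlier at $\theta = [0, 0]$:
# Should print approximately [-5.99655738, -135.22631803], matching the
# earlier elementwise mse_gradient output.
mse_gradient(np.array([0.0, 0.0]), X, y_obs)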