%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import lec_util
In this notebook we work through a complete derivation of gradient descent. We will not use any machine learning packages; everything is implemented with basic Python tools (NumPy, pandas, and Plotly for visualization).
For this simple example we will use the following toy dataset:
(x, y) = lec_util.load_data()
print("Number of data points:", x.shape[0])
The data is plotted below:
px.scatter(x=x, y=y)
This data looks sort of like a sine function. Let's build a model to fit the data.
Based on the above picture we propose the following model:
$$ \hat{y} = f_w(x) = \sin\left(w_0 x + w_1\right) $$
In the following we implement this model in Python:
def model(w, x):
    return np.sin(w[0] * x + w[1])
Given a value for $x$, this model makes a prediction for $y$ that depends on our choice of parameters $w$. Note that $w = \left[w_0, w_1\right]$ is a vector of parameters.
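As a quick sanity check (an added illustration; the particular numbers are arbitrary), we can evaluate the model at a single point:
# with w = [1, 1], the prediction at x = 0.5 is sin(1 * 0.5 + 1) = sin(1.5) ≈ 0.997
print(model(np.array([1.0, 1.0]), 0.5))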
Let's try a few different parameter values and see how this model looks on our data. We will try the following three possible models:
\begin{align} f_{\left[1,1\right]}(x) & = \sin\left(x + 1\right) \\ f_{\left[2,1\right]}(x) & = \sin\left(2 x + 1\right) \\ f_{\left[1,2\right]}(x) & = \sin\left(x + 2\right) \end{align}
# I will make 100 evenly spaced test points so that I can plot
# smooth curves
xtest = np.linspace(x.min()-.1, x.max()+.1, 100)
# Make three different predictions based on
# three different parameter values
yhat1 = model([1, 1], xtest)
yhat2 = model([2, 1], xtest)
yhat3 = model([1, 2], xtest)
fig = px.scatter(x=x, y=y)
fig.add_trace(go.Scatter(x=xtest, y=yhat1, name="$f_{[1,1]}(x) = \sin(x + 1)$"))
fig.add_trace(go.Scatter(x=xtest, y=yhat2, name="$f_{[2,1]}(x) = \sin(2x + 1)$"))
fig.add_trace(go.Scatter(x=xtest, y=yhat3, name="$f_{[1,2]}(x) = \sin(x + 2)$"))
None of the above really match the data. We would like to find a parameterization that is closer to the data. To do this we need a loss function.
To really fit the data we need a measure of how well our model matches the data: this is a loss function. For this exercise we will use the average squared loss, which is often just called the squared loss.
$$ L(w) = L\left(f_w, \mathcal{D} = \left\{(x_i, y_i)\right\}_{i=1}^n\right) = \frac{1}{n} \sum_{i=1}^n\left(y_i - f_w\left(x_i\right)\right)^2 = \frac{1}{n} \sum_{i=1}^n\left(y_i - \sin\left(w_0 x_i + w_1 \right)\right)^2 $$
We can implement the loss function in Python:
def avg_sq_loss(w):
    return np.mean((y - model(w, x))**2)
How did our three model guesses compare?
print("Squared Loss of w = [1,1]", avg_sq_loss([1,1]))
Visualizing the loss for all three possible models:
fig = px.bar(x=["$w=[1,1]$", "$w=[2,1]$", "$w=[1,2]$"],
y=[avg_sq_loss([1,1]), avg_sq_loss([2,1]), avg_sq_loss([1,2])])
fig.update_yaxes(title="Average Squared Loss")
We can try a bunch of possible values for $w_0$ and $w_1$ using the following for loops:
loss_trials = []
w0values = np.linspace(0, 3, 30)
w1values = np.linspace(0, 3, 30)
for w0 in w0values:
    for w1 in w1values:
        loss_trials.append([w0, w1, avg_sq_loss([w0, w1])])
loss_df = pd.DataFrame(loss_trials, columns=["w0", "w1", "loss"])
loss_df
We can use our brute force search over values of $w$ to select the best combination.
loss_df.sort_values("loss").head()
We can use pandas to extract the parameter values with the smallest loss:
w_best_brute = loss_df.loc[loss_df["loss"].idxmin(), ["w0", "w1"]].to_numpy()
print(w_best_brute)
Visualizing the brute force best-fit model:
yhat_brute = model(w_best_brute, xtest)
fig = px.scatter(x=x, y=y)
fig.add_trace(go.Scatter(x=xtest, y=yhat_brute, name="Brute Force"))
Not a bad fit! Let's see if we can do better using gradient descent. To get a sense of the optimization landscape, let's first visualize the loss surface over the grid of parameter values we just evaluated.
fig = go.Figure()
# Reshape the flat loss column back onto the (w0, w1) grid; Plotly places z[i][j]
# at (x[j], y[i]), so transpose to put w0 on the x-axis and w1 on the y-axis.
fig.add_trace(go.Surface(x=w0values, y=w1values,
                         z=loss_df["loss"].to_numpy().reshape((len(w0values), len(w1values))).T))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=600)
fig
fig = go.Figure()
fig.add_trace(go.Contour(x=loss_df["w0"], y=loss_df["w1"], z=loss_df["loss"]))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=600, width=800)
fig
In order to do gradient descent we first need a function to compute the gradient. Since we won't be using PyTorch in this notebook we will have to do calculus by hand. Recall the loss function is:
$$ L(w) = \frac{1}{n} \sum_{i=1}^n\left(y_i - \sin\left(w_0 x_i + w_1 \right)\right)^2 $$
We need to compute the gradient, denoted $\nabla_w$, of $L(w)$:
$$ \nabla_w L(w) = \left[ \frac{\partial}{\partial w_0} L(w), \frac{\partial}{\partial w_1} L(w) \right] $$
We can compute the answer in parts:
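Applying the chain rule to each term of the sum gives the two partial derivatives (these should match the implementation below):
\begin{align} \frac{\partial}{\partial w_0} L(w) & = -\frac{2}{n} \sum_{i=1}^n \left(y_i - \sin\left(w_0 x_i + w_1\right)\right) \cos\left(w_0 x_i + w_1\right) x_i \\ \frac{\partial}{\partial w_1} L(w) & = -\frac{2}{n} \sum_{i=1}^n \left(y_i - \sin\left(w_0 x_i + w_1\right)\right) \cos\left(w_0 x_i + w_1\right) \end{align}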
We can now implement a gradient function:
def gradient(w):
    # partial derivatives of the average squared loss with respect to w0 and w1
    g0 = -np.mean(2 * (y - np.sin(w[0] * x + w[1])) * np.cos(w[0] * x + w[1]) * x)
    g1 = -np.mean(2 * (y - np.sin(w[0] * x + w[1])) * np.cos(w[0] * x + w[1]))
    return np.array([g0, g1])
gradient([1., 1.])
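As a quick sanity check on the hand-derived gradient, we can compare it against a numerical approximation (this helper, numerical_gradient, is an added sketch using central finite differences on avg_sq_loss, not part of the original derivation):
def numerical_gradient(w, eps=1e-6):
    # approximate each partial derivative with a central finite difference
    w = np.asarray(w, dtype=float)
    g = np.zeros_like(w)
    for i in range(len(w)):
        w_hi, w_lo = w.copy(), w.copy()
        w_hi[i] += eps
        w_lo[i] -= eps
        g[i] = (avg_sq_loss(w_hi) - avg_sq_loss(w_lo)) / (2 * eps)
    return g

# should closely match gradient([1., 1.]) above
print(numerical_gradient([1., 1.]))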
Now that we have implemented the gradient we can compute the value of the gradient at each point in our above plot:
loss_grad_df = loss_df.join(loss_df[['w0', 'w1']]
.apply(lambda w: gradient(w), axis=1, result_type="expand")
.rename(columns={0:"g0", 1:"g1"}))
loss_grad_df
The following plots the gradient field.
fig = ff.create_quiver(x=loss_grad_df['w0'], y=loss_grad_df['w1'],
u=loss_grad_df['g0'], v=loss_grad_df['g1'],
line_width=2, line_color="white",
scale = 0.1, arrow_scale=.2)
fig.add_trace(go.Contour(x=loss_grad_df['w0'], y=loss_grad_df['w1'], z=loss_grad_df['loss']))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=600, width=800)
fig.update_layout(xaxis_range=[w0values.min(), w0values.max()])
fig.update_layout(yaxis_range=[w1values.min(), w1values.max()])
fig
The following implements a simple version of gradient descent for this dataset.
def gradient_descent(w_0, lr=lambda t: 1./(t+1.), nepochs=10):
    w = w_0.copy()
    values = [w]
    for t in range(nepochs):
        # step in the direction of the negative gradient, scaled by the learning rate
        w = w - lr(t) * gradient(w)
        values.append(w)
    return np.array(values)
In the following we run gradient descent for 100 epochs, starting at $w=[3.0, 0.0]$ and using the decaying learning rate $1/\sqrt{t+1}$.
values = gradient_descent(np.array([3.0, 0.0]),
nepochs=100,
lr =lambda t: 1./np.sqrt(t+1.))
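Before plotting the path, we can compare the loss at the final gradient descent iterate with the brute-force result (a small added check that reuses avg_sq_loss and w_best_brute from above):
w_gd = values[-1]
print("Gradient descent: w =", w_gd, " loss =", avg_sq_loss(w_gd))
print("Brute force:      w =", w_best_brute, " loss =", avg_sq_loss(w_best_brute))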
We plot the path taken by gradient descent through the parameter space.
fig = go.Figure()
fig.add_trace(go.Contour(x=loss_grad_df['w0'], y=loss_grad_df['w1'], z=loss_grad_df['loss']))
fig.add_trace(go.Scatter(x=values[:,0], y=values[:,1], name="Path", mode="markers+lines",
line=go.scatter.Line(color='white')))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=600, width=800)
fig.update_layout(xaxis_range=[w0values.min(), w0values.max()])
fig.update_layout(yaxis_range=[w1values.min(), w1values.max()])
fig
fig = go.Figure()
fig.add_trace(
    # transpose for the same reason as the surface plot above (w0 on the x-axis)
    go.Surface(x=w0values, y=w1values,
               z=loss_df["loss"].to_numpy().reshape((len(w0values), len(w1values))).T))
fig.add_trace(
    go.Scatter3d(x=values[:,0], y=values[:,1], z=[avg_sq_loss(w) for w in values],
line=dict(color='white')))
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0),
height=600)
fig
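Finally, as an added visualization, we can overlay the model fit found by gradient descent on the data, just as we did for the brute-force search:
w_gd = values[-1]
fig = px.scatter(x=x, y=y)
fig.add_trace(go.Scatter(x=xtest, y=model(w_gd, xtest), name="Gradient Descent Fit"))
fig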