import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
import plotly.io as pio
pio.renderers.default = "notebook_connected"
Suppose we want to find the value of $x$ that minimizes the arbitrary function given below:
def arbitrary(x):
    return (x**4 - 15*x**3 + 80*x**2 - 180*x + 144)/10
x = np.linspace(1, 6.75, 200)
fig = px.line(y = arbitrary(x), x = x)
fig.update_layout(font_size = 16)
fig.update_layout(autosize=False, width=800, height=600)
Above, we saw that the minimum is somewhere around 5.3ish. Let's see if we can figure out how to find the exact minimum algorithmically from scratch.
One very slow and terrible way would be manual guess-and-check.
arbitrary(6)
0.0
A somewhat better (but still slow) approach is to use brute force to try out a bunch of x values and return the one that yields the lowest loss.
def simple_minimize(f, xs):
    y = [f(x) for x in xs]
    return xs[np.argmin(y)]
guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]
simple_minimize(arbitrary, guesses)
5.33
This process is essentially the same as the graphical approach above, except that we only evaluate the function at a handful of selected points.
xs = np.linspace(1, 7, 200)
sparse_xs = np.linspace(1, 7, 5)
ys = arbitrary(xs)
sparse_ys = arbitrary(sparse_xs)
fig = px.line(x = xs, y = arbitrary(xs))
fig.add_scatter(x = sparse_xs, y = arbitrary(sparse_xs), mode = "markers")
fig.update_layout(showlegend= False)
fig.update_layout(autosize=False, width=800, height=600)
fig.show()
This basic approach suffers from three major flaws:
1. If the minimum is outside our range of guesses, the answer will be completely wrong.
2. Even if our range of guesses is correct, the answer will be inaccurate if the guesses are too coarse.
3. It is computationally wasteful: we evaluate the function at many guesses that turn out to be nowhere near the minimum.
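To make the first flaw concrete, here is a small illustration using the simple_minimize and arbitrary functions defined above (the guess range is chosen just for this example): if every guess lies far from the true minimum near 5.3, brute force simply returns the best of a bad set.
bad_guesses = [1, 1.5, 2, 2.5, 3]  # all guesses far from the true minimum near x = 5.3
simple_minimize(arbitrary, bad_guesses)
2.5
The returned 2.5 is the best of our guesses, but it is nowhere near the actual minimizer.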
scipy.optimize.minimize
One way to minimize this mathematical function is to use the scipy.optimize.minimize function. It takes a function and a starting guess and tries to find the minimum.
from scipy.optimize import minimize
minimize(arbitrary, x0 = 3.5)
  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: -0.13827491292966557
        x: [ 2.393e+00]
      nit: 3
      jac: [ 6.486e-06]
 hess_inv: [[ 7.385e-01]]
     nfev: 20
     njev: 10
scipy.optimize.minimize is great. It may also seem a bit magical. How can this one line of code find the minimum of any mathematical function so quickly?
Behind the scenes, scipy.optimize.minimize uses a gradient-based technique closely related to gradient descent to compute the minimizing value of a function. In this lecture, we will learn the underlying theory behind gradient descent, then implement it ourselves.
Instead of choosing all of our guesses ahead of time, we can instead start from a single guess and try to iteratively improve on our choice.
The key insight is this: if the derivative of the function is negative, the function is decreasing, so we should go to the right (i.e., pick a bigger $x$). If the derivative of the function is positive, the function is increasing, so we should go to the left (i.e., pick a smaller $x$).
Thus, the derivative tells us which way to go.
Desmos demo: https://www.desmos.com/calculator/twpnylu4lr
import plotly.graph_objects as go
def derivative_arbitrary(x):
    return (4*x**3 - 45*x**2 + 160*x - 180)/10
fig = go.Figure()
roots = np.array([2.3927, 3.5309, 5.3263])
fig.add_trace(go.Scatter(x = xs, y = arbitrary(xs),
mode = "lines", name = "f"))
fig.add_trace(go.Scatter(x = xs, y = derivative_arbitrary(xs),
mode = "lines", name = "df", line = {"dash": "dash"}))
fig.add_trace(go.Scatter(x = np.array(roots), y = 0*roots,
mode = "markers", name = "df = zero", marker_size = 12))
fig.update_layout(font_size = 20, yaxis_range=[-1, 3])
fig.update_layout(autosize=False, width=800, height=600)
fig.show()
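As a quick numeric check of the sign rule above (a small sketch; the points $x = 4$ and $x = 6$ are chosen just for illustration), we can evaluate derivative_arbitrary on either side of the rightmost minimum:
print(derivative_arbitrary(4))  # -0.4: negative, the function is decreasing here, so move right
print(derivative_arbitrary(6))  # 2.4: positive, the function is increasing here, so move left
-0.4
2.4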
# Define some utility functions for the next example
import matplotlib.pyplot as plt
def plot_arbitrary():
    x = np.linspace(1, 7, 100)
    plt.plot(x, arbitrary(x))
    axes = plt.gca()
    axes.set_ylim([-1, 3])

def plot_x_on_f(f, x):
    y = f(x)
    default_args = dict(label=r'$ \theta $', zorder=2,
                        s=200, c=sns.xkcd_rgb['green'])
    plt.scatter([x], [y], **default_args)

def plot_x_on_f_empty(f, x):
    y = f(x)
    default_args = dict(label=r'$ \theta $', zorder=2,
                        s=200, c='none', edgecolor=sns.xkcd_rgb['green'])
    plt.scatter([x], [y], **default_args)

def plot_tangent_on_f(f, x, eps=1e-6):
    slope = ((f(x + eps) - f(x - eps))
             / (2 * eps))
    xs = np.arange(x - 1, x + 1, 0.05)
    ys = f(x) + slope * (xs - x)
    plt.plot(xs, ys, zorder=3, c=sns.xkcd_rgb['green'], linestyle='--')
plot_arbitrary()
plot_x_on_f(arbitrary, 2)
plot_tangent_on_f(arbitrary, 2)
plt.xlabel('x')
plt.ylabel('f(x)');
plot_arbitrary()
plot_x_on_f(arbitrary, 4.4)
plot_tangent_on_f(arbitrary, 4.4)
Armed with this knowledge, let's try to see if we can use the derivative to optimize the function.
We start by making some guess for the minimizing value of $x$. Then, we look at the derivative of the function at this value of $x$ and step in the opposite direction, which takes us downhill. We can express our new rule as a recurrence relation:
$$x^{(t+1)} = x^{(t)} - \frac{d}{dx} f(x^{(t)})$$

Translating this statement into English: we obtain our next guess for the minimizing value of $x$ at timestep $t+1$ ($x^{(t+1)}$) by taking our last guess ($x^{(t)}$) and subtracting the derivative of the function at that point ($\frac{d}{dx} f(x^{(t)})$).
arbitrary represents the function we are trying to minimize, $f$.
derivative_arbitrary represents the first derivative of this function, $\frac{df}{dx}$.
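As a quick worked check of the update rule before turning it into code (starting from the guess $x^{(0)} = 4$, the same starting point used in the cells below):

$$x^{(1)} = x^{(0)} - \frac{d}{dx} f(x^{(0)}) = 4 - \frac{4(4)^3 - 45(4)^2 + 160(4) - 180}{10} = 4 - (-0.4) = 4.4$$

This single step from 4 to 4.4 is exactly what the first call to plot_one_step below produces.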
def plot_one_step(x):
    # Find our new guess using the recurrence relation
    new_x = x - derivative_arbitrary(x)

    # Plot our old guess and our new guess on the function
    plot_arbitrary()
    plot_x_on_f(arbitrary, new_x)
    plot_x_on_f_empty(arbitrary, x)
    print(f'old x: {x}')
    print(f'new x: {new_x}')
plot_one_step(4)
old x: 4 new x: 4.4
plot_one_step(4.4)
old x: 4.4 new x: 5.0464000000000055
plot_one_step(5.0464)
old x: 5.0464 new x: 5.49673060106241
plot_one_step(5.4967)
old x: 5.4967 new x: 5.080917145374805
plot_one_step(5.080917145374805)
old x: 5.080917145374805 new x: 5.489966698640582
plot_one_step(5.489966698640582)
old x: 5.489966698640582 new x: 5.092848945470474
Looking pretty good! We do have a problem though – once we arrive close to the minimum value of the function, our guesses "bounce" back and forth past the minimum without ever reaching it.
In other words, each step we take when updating our guess moves us too far. We can address this by decreasing the size of each step.
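We can see why the full-size step overshoots with a quick check (the two $x$ values below are the guesses we kept bouncing between above; the printed values are approximate): near the minimum at roughly 5.33, the derivative still has magnitude of about 0.4, so each update jumps about 0.4 units, flying from one side of the minimum to the other.
print(derivative_arbitrary(5.489966698640582))  # roughly 0.40, so the next step moves about 0.4 to the left
print(derivative_arbitrary(5.092848945470474))  # roughly -0.40, so the next step moves about 0.4 to the right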
Let's update our algorithm to use a learning rate (also sometimes called the step size), which controls how far we move with each update. We represent the learning rate with $\alpha$.
$$x^{(t+1)} = x^{(t)} - \alpha \frac{d}{dx} f(x^{(t)})$$

A small $\alpha$ means that we will take small update steps; a large $\alpha$ means we will take large steps.
Let's update our function to use $\alpha=0.3$.
def plot_one_step_lr(x):
    # Implement our new algorithm with a learning rate
    new_x = x - 0.3 * derivative_arbitrary(x)

    # Plot the updated guesses
    plot_arbitrary()
    plot_x_on_f(arbitrary, new_x)
    plot_x_on_f_empty(arbitrary, x)
    print(f'old x: {x}')
    print(f'new x: {new_x}')
plot_one_step_lr(4)
old x: 4 new x: 4.12
plot_one_step_lr(4.12)
old x: 4.12 new x: 4.267296639999997
plot_one_step_lr(5.17180969114245)
old x: 5.17180969114245 new x: 5.256374838146257
plot_one_step_lr(5.323)
old x: 5.323 new x: 5.325108157959999
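Putting the pieces together, here is a minimal sketch of the full iterative procedure. The function name gradient_descent, the starting guess, the learning rate, and the number of iterations are all choices made for this example rather than fixed parts of the algorithm; each call to plot_one_step_lr above performs exactly one pass through this loop.
def gradient_descent(df, initial_guess, alpha, n):
    """Repeatedly apply the update x <- x - alpha * df(x), starting from
    initial_guess, until we have n guesses (including the starting point),
    and return the full sequence of guesses."""
    guesses = [initial_guess]
    current_guess = initial_guess
    while len(guesses) < n:
        current_guess = current_guess - alpha * df(current_guess)
        guesses.append(current_guess)
    return np.array(guesses)

# Starting from x = 4 with a learning rate of 0.3, the guesses settle near the minimum around 5.33.
trajectory = gradient_descent(derivative_arbitrary, 4, 0.3, 20)
trajectory[-1]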