import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'


def arbitrary(x):
    """Evaluate the example quartic used throughout this notebook.

    Computes (x^4 - 15x^3 + 80x^2 - 180x + 144) / 10. Works on anything that
    supports the arithmetic operators used, e.g. a Python number or a NumPy
    array (evaluated elementwise).
    """
    numerator = x**4 - 15*x**3 + 80*x**2 - 180*x + 144
    return numerator / 10

# Sample the function at 200 points on [1, 6.75] and draw it as a line chart.
x = np.linspace(1, 6.75, 200)
fig = px.line(y = arbitrary(x), x = x)


# Enlarge the figure's font.
fig.update_layout(font_size = 16)


# Spot-check the function at a single input.
arbitrary(6)

0.0


def simple_minimize(f, xs):
    """Return the candidate in `xs` at which `f` is smallest (guess and check).

    Parameters:
        f: callable taking one candidate and returning a comparable number.
        xs: iterable of candidate inputs. It is materialized once, so
            one-shot iterators/generators and label-indexed sequences (such
            as a pandas Series with a non-default index) are handled the
            same way as a plain list.

    Returns:
        The element of `xs` with the smallest `f` value; on ties, the first
        such element (np.argmin returns the first minimal index).

    Raises:
        ValueError: if `xs` is empty (raised by np.argmin).
    """
    # Materialize first: the original indexed `xs` directly, which fails for
    # iterators and does label (not positional) lookup on a pandas Series.
    candidates = list(xs)
    losses = [f(candidate) for candidate in candidates]
    return candidates[np.argmin(losses)]


# Guess and check: evaluate a handful of hand-picked candidates and keep the
# one with the smallest function value.
guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]
simple_minimize(arbitrary, guesses)

5.33


# Dense grid (200 points) for the curve, and a coarse 5-point grid for the
# marker overlay.
xs = np.linspace(1, 7, 200)
ys = arbitrary(xs)

sparse_xs = np.linspace(1, 7, 5)
sparse_ys = arbitrary(sparse_xs)

# Draw the curve from the dense samples and overlay the coarse samples as
# markers, reusing the values computed above.
fig = px.line(x=xs, y=ys)
fig.add_scatter(x=sparse_xs, y=sparse_ys, mode="markers")
fig.update_layout(showlegend=False)
fig.show()


from scipy.optimize import minimize

# Minimize numerically, starting the search at x0 = 3.5.
# NOTE(review): the recorded result is x ~= 2.39, while the guess-and-check
# cell earlier found a lower value near 5.33 -- the answer appears to depend
# on the starting point (a local rather than global minimum); confirm.
minimize(arbitrary, x0 = 3.5)

      fun: -0.13827491292966557
 hess_inv: array([[0.73848255]])
      jac: array([6.48573041e-06])
  message: 'Optimization terminated successfully.'
     nfev: 20
      nit: 3
     njev: 10
   status: 0
  success: True
        x: array([2.39275266])


import plotly.graph_objects as go

def derivative_arbitrary(x):
    """Evaluate (4x^3 - 45x^2 + 160x - 180) / 10, the derivative of `arbitrary`.

    Accepts anything supporting the arithmetic operators used, e.g. a Python
    number or a NumPy array (evaluated elementwise).
    """
    numerator = 4*x**3 - 45*x**2 + 160*x - 180
    return numerator / 10


# Hard-coded x positions marked on the plot as the points where df = 0.
roots = np.array([2.3927, 3.5309, 5.3263])

# One figure with three traces: the function, its derivative (dashed), and
# markers on the x-axis at the listed roots.
fig = go.Figure(data=[
    go.Scatter(x=xs, y=arbitrary(xs), mode="lines", name="f"),
    go.Scatter(x=xs, y=derivative_arbitrary(xs), mode="lines", name="df",
               line={"dash": "dash"}),
    go.Scatter(x=roots, y=0*roots, mode="markers", name="df = zero",
               marker_size=12),
])
fig.update_layout(font_size=20, yaxis_range=[-1, 3])
fig.show()


# Define some utility functions for the next example
import matplotlib.pyplot as plt

def plot_arbitrary():
    """Plot `arbitrary` at 100 points on [1, 7] with matplotlib.

    Draws onto the current axes and fixes the y-axis limits to [-1, 3].
    """
    grid = np.linspace(1, 7, 100)
    plt.plot(grid, arbitrary(grid))
    plt.gca().set_ylim([-1, 3])
    
def plot_x_on_f(f, x):
    """Mark the point (x, f(x)) with a large filled green dot.

    The marker is drawn with zorder=2 and carries the legend label theta.
    """
    plt.scatter(
        [x], [f(x)],
        label=r'$ \theta $', zorder=2,
        s=200, c=sns.xkcd_rgb['green'],
    )
    
def plot_x_on_f_empty(f, x):
    """Mark the point (x, f(x)) with a large hollow, green-edged dot.

    Same marker as `plot_x_on_f`, but with no fill colour.
    """
    plt.scatter(
        [x], [f(x)],
        label=r'$ \theta $', zorder=2,
        s=200, c='none', edgecolor=sns.xkcd_rgb['green'],
    )
    
def plot_tangent_on_f(f, x, eps=1e-6):
    """Draw a dashed green tangent line to `f` at `x`.

    The slope is estimated with a central difference of half-width `eps`;
    the line is drawn over [x - 1, x + 1) in steps of 0.05.
    """
    rise = f(x + eps) - f(x - eps)
    slope = rise / (2 * eps)
    tangent_x = np.arange(x - 1, x + 1, 0.05)
    tangent_y = f(x) + slope * (tangent_x - x)
    plt.plot(tangent_x, tangent_y, zorder=3,
             c=sns.xkcd_rgb['green'], linestyle='--')


# Show the function with the point x = 2 and its tangent line.
plot_arbitrary()
plot_x_on_f(arbitrary, 2)
plot_tangent_on_f(arbitrary, 2)
plt.xlabel('x')
# Trailing semicolon: keeps the notebook from echoing the returned Text object.
plt.ylabel('f(x)');


# Same picture at x = 4.4.
plot_arbitrary()
plot_x_on_f(arbitrary, 4.4)
plot_tangent_on_f(arbitrary, 4.4)


def plot_one_step(x):
    """Take one unscaled derivative step from `x` and visualise it.

    The new guess is x - f'(x). The function is redrawn with the new guess as
    a filled marker and the old guess as a hollow marker, and both values are
    printed.
    """
    # Recurrence relation: move against the derivative at the current guess.
    step = derivative_arbitrary(x)
    new_x = x - step

    # New guess filled, old guess hollow.
    plot_arbitrary()
    plot_x_on_f(arbitrary, new_x)
    plot_x_on_f_empty(arbitrary, x)

    print(f'old x: {x}')
    print(f'new x: {new_x}')


# First manual step, starting from the guess x = 4.
plot_one_step(4)

old x: 4
new x: 4.4


# Second step: continue from the previous result, 4.4.
plot_one_step(4.4)

old x: 4.4
new x: 5.0464000000000055


# Third step: continue from the previous result (rounded to 5.0464).
plot_one_step(5.0464)

old x: 5.0464
new x: 5.49673060106241


# Fourth step: continue from the previous result (rounded to 5.4967).
# Per the recorded output, this step moves back down to about 5.08.
plot_one_step(5.4967)

old x: 5.4967
new x: 5.080917145374805


# Fifth step: per the recorded output, the guess jumps back up to about 5.49.
plot_one_step(5.080917145374805)

old x: 5.080917145374805
new x: 5.489966698640582


# Sixth step: the recorded guesses keep alternating between roughly 5.09 and
# 5.49 rather than settling on a single value.
plot_one_step(5.489966698640582)

old x: 5.489966698640582
new x: 5.092848945470474


def plot_one_step_lr(x, alpha=0.3):
    """Take one derivative step scaled by a learning rate and visualise it.

    Parameters:
        x: current guess.
        alpha: learning rate multiplying the derivative. Defaults to 0.3, the
            value that was previously hard-coded, so existing one-argument
            calls behave exactly as before.

    The new guess is x - alpha * f'(x). The function is redrawn with the new
    guess as a filled marker and the old guess as a hollow marker, and both
    values are printed.
    """
    # Same update as plot_one_step, but damped by the learning rate.
    new_x = x - alpha * derivative_arbitrary(x)

    # Plot the updated guesses: new guess filled, old guess hollow.
    plot_arbitrary()
    plot_x_on_f(arbitrary, new_x)
    plot_x_on_f_empty(arbitrary, x)
    print(f'old x: {x}')
    print(f'new x: {new_x}')


# First step with the learning rate, again starting from x = 4.
plot_one_step_lr(4)

old x: 4
new x: 4.12


# Second step: continue from the previous result, 4.12.
plot_one_step_lr(4.12)

old x: 4.12
new x: 4.267296639999997


# NOTE(review): this input is not the previous cell's output (about 4.267);
# presumably several intermediate steps were run but are not shown -- confirm.
plot_one_step_lr(5.17180969114245)

old x: 5.17180969114245
new x: 5.256374838146257


# Near x = 5.323 the recorded step is tiny (about 0.002).
plot_one_step_lr(5.323)

old x: 5.323
new x: 5.325108157959999


def gradient_descent(df, initial_guess, alpha, n):
    """Run gradient descent and return every guess as a numpy array.

    Parameters:
        df: callable returning the derivative (or gradient) at a guess.
        initial_guess: starting point; it is the first recorded guess.
        alpha: learning rate multiplying the derivative at each update.
        n: number of guesses to record, including the initial one, so at
           most n - 1 updates are applied. For n <= 1 only the initial guess
           is returned.

    Each update is: guess <- guess - alpha * df(guess).
    """
    history = [initial_guess]
    while len(history) < n:
        previous = history[-1]
        history.append(previous - alpha * df(previous))

    return np.array(history)


# Descend the example function from x = 1.6 with learning rate 0.75; with
# n = 20 the returned array holds 20 guesses (the initial one included).
trajectory = gradient_descent(derivative_arbitrary, 1.6, 0.75, 20)

# Hollow markers: every guess along the way. Filled marker: the final guess.
plot_arbitrary()
plt.scatter(trajectory, arbitrary(trajectory), c="white", edgecolor="firebrick")
plt.scatter(trajectory[-1], arbitrary(trajectory[-1]), c="firebrick")
plt.xlabel(r"$x$")
plt.ylabel(r"$f(x)$");


# Load seaborn's "tips" example dataset; the later cells read its
# "total_bill" and "tip" columns from this module-level `df`.
df = sns.load_dataset("tips")
df.head()


def mse_single_arg(theta_1):
    """Mean squared error of the no-intercept model tip = theta_1 * total_bill.

    Reads the "total_bill" and "tip" columns of the module-level `df`.
    """
    residuals = theta_1 * df["total_bill"] - df["tip"]
    return np.mean(residuals ** 2)

# Tabulate the MSE on an evenly spaced grid of candidate slopes in [-1.5, 1],
# building the grid once and evaluating the loss at each of its values.
loss_df = pd.DataFrame({"theta_1": np.linspace(-1.5, 1)})
loss_df["MSE"] = [mse_single_arg(theta_1) for theta_1 in loss_df["theta_1"]]

px.line(loss_df, x="theta_1", y="MSE")


def gradient_descent(df, initial_guess, alpha, n):
    """Run gradient descent and return all recorded guesses as a numpy array.

    Parameters:
        df: callable returning the derivative (or gradient) at a guess.
        initial_guess: starting point, stored as the first guess.
        alpha: learning rate.
        n: total number of guesses to record (initial guess included), i.e.
           at most n - 1 updates. For n <= 1 only the initial guess is kept.
    """
    path = [initial_guess]
    latest = initial_guess
    while len(path) < n:
        gradient = df(latest)
        # Step against the gradient, scaled by the learning rate.
        latest = latest - alpha * gradient
        path.append(latest)

    return np.array(path)


def mse_single_arg(theta_1):
    """Return the MSE on the tips data for slope `theta_1` (no intercept).

    Predictions are theta_1 * total_bill, compared against the observed tip;
    both columns come from the module-level `df`.
    """
    predicted_tip = theta_1 * df["total_bill"]
    squared_errors = (predicted_tip - df["tip"]) ** 2
    return np.mean(squared_errors)


def mse_loss_derivative_single_arg(theta_1):
    """Derivative of the single-parameter MSE with respect to `theta_1`.

    Computes mean(2 * (theta_1 * total_bill - tip) * total_bill) using the
    "total_bill" and "tip" columns of the module-level `df`.
    """
    bill = df["total_bill"]
    residuals = theta_1 * bill - df["tip"]
    return np.mean(2 * residuals * bill)


# Descend the single-parameter MSE from theta_1 = -0.5 with learning rate
# 0.0001; with n = 100 the returned array holds 100 guesses.
trajectory = gradient_descent(mse_loss_derivative_single_arg, -0.5, 0.0001, 100)

# Loss curve, with every guess as a hollow marker and the final guess filled.
plt.plot(loss_df["theta_1"], loss_df["MSE"])
plt.scatter(trajectory, [mse_single_arg(guess) for guess in trajectory], c="white", edgecolor="firebrick")
plt.scatter(trajectory[-1], mse_single_arg(trajectory[-1]), c="firebrick")
plt.xlabel(r"$\theta_1$")
plt.ylabel(r"$L(\theta_1)$");

# Report the last guess reached.
print(f"Final guess for theta_1: {trajectory[-1]}")

Final guess for theta_1: 0.14369554654231262


# Two-column frame for the model with an intercept: a constant "bias" column
# of ones followed by "total_bill". `assign` returns a new frame, so `df` is
# left untouched.
tips_with_bias = df.assign(bias=1)[["bias", "total_bill"]]

tips_with_bias.head()


def mse_loss(theta, X, y_obs):
    """Mean squared error of the linear predictions `X @ theta` against `y_obs`.

    Parameters:
        theta: parameter vector.
        X: matrix-like object supporting `X @ theta`.
        y_obs: observed values, compatible with the predictions.
    """
    residuals = X @ theta - y_obs
    return np.mean(residuals ** 2)


import plotly.graph_objects as go

# 10 x 10 grid of candidate parameters: u spans [0, 2] and v spans
# [-0.1, 0.35] (plotted below as theta0 and theta1 respectively).
uvalues = np.linspace(0, 2, 10)
vvalues = np.linspace(-0.1, 0.35, 10)
(u,v) = np.meshgrid(uvalues, vvalues)
# Stack the flattened grids into a 2 x 100 array; each column is one (u, v) pair.
thetas = np.vstack((u.flatten(),v.flatten()))

def mse_loss_single_arg(theta):
    """Evaluate `mse_loss` for `theta` on the notebook's tips data.

    Uses the module-level `tips_with_bias` frame as the design matrix and
    the "tip" column of the module-level `df` as the observations.
    """
    design_matrix = tips_with_bias
    observed_tips = df["tip"]
    return mse_loss(theta, design_matrix, observed_tips)

# Evaluate the loss at every grid pair; the rows of thetas.T are the pairs.
MSE = np.array([mse_loss_single_arg(t) for t in thetas.T])

# Reshape the flat loss values back onto the meshgrid to draw the surface.
loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))

# Mark the grid point with the smallest loss. This is the best point on the
# grid, not necessarily the exact minimizer.
ind = np.argmin(MSE)
optimal_point = go.Scatter3d(name = "Optimal Point",
    x = [thetas.T[ind,0]], y = [thetas.T[ind,1]], 
    z = [MSE[ind]],
    marker=dict(size=10, color="red"))

# Combine the surface and the marker in one fixed-size 3D figure.
fig = go.Figure(data=[loss_surface, optimal_point])
fig.update_layout(scene = dict(
    xaxis_title = "theta0",
    yaxis_title = "theta1",
    zaxis_title = "MSE"), autosize=False, width=800, height=600)

fig.show()


# Same loss values as a 2D contour plot. u[0] and v[:, 0] are the first row
# and first column of the meshgrid arrays, used as the axis coordinates.
contour = go.Contour(x=u[0], y=v[:, 0], z=np.reshape(MSE, u.shape))
fig = go.Figure(contour)
fig.update_layout(
    xaxis_title = "theta0",
    yaxis_title = "theta1", autosize=False, width=800, height=600)

fig.show()


# Copy of the tips data with an added constant "bias" column of ones;
# `assign` returns a new frame, so `df` itself is not modified.
df_with_bias = df.assign(bias=1)

def mse_gradient(theta, X, y_obs):
    """Gradient of the MSE for a two-parameter linear model.

    Parameters:
        theta: indexable with two entries, theta[0] and theta[1].
        X: object with positional column access via `.iloc` (e.g. a pandas
           DataFrame); only its first two columns are used.
        y_obs: observed values aligned with the rows of X.

    Returns:
        numpy array [dL/dtheta0, dL/dtheta1].
    """
    col0, col1 = X.iloc[:, 0], X.iloc[:, 1]
    # Residual of the current fit; shared by both partial derivatives.
    residual = y_obs - theta[0]*col0 - theta[1]*col1
    partials = [np.mean(-2 * residual * column) for column in (col0, col1)]
    return np.array(partials)

def mse_gradient_single_arg(theta):
    """Evaluate `mse_gradient` for `theta` on the notebook's tips data.

    The design matrix is the "bias" and "total_bill" columns of the
    module-level `df_with_bias`; the observations are the "tip" column of
    the module-level `df`.
    """
    design_matrix = df_with_bias[["bias", "total_bill"]]
    return mse_gradient(theta, design_matrix, df["tip"])

def mse_single_arg(theta_1):
    """MSE on the tips data when predicting tip as theta_1 * total_bill.

    Both columns are read from the module-level `df`.
    """
    bill, observed_tip = df["total_bill"], df["tip"]
    errors = theta_1 * bill - observed_tip
    return np.mean(errors ** 2)


# Descend the two-parameter MSE from theta = (0, 0) with learning rate 0.001;
# with n = 10000 the returned array holds 10000 guesses, one row per guess.
guesses = gradient_descent(mse_gradient_single_arg, np.array([0, 0]), 0.001, 10000)

# Show the last ten guesses as a table.
pd.DataFrame(guesses, columns=["theta_0", "theta_1"]).tail(10)


def mse_gradient(theta, X, y_obs):
    """Gradient of the MSE in matrix form, for any number of parameters.

    Computes (-2 / n) * (X^T y_obs - X^T X theta), where n = len(X).

    Parameters:
        theta: parameter vector.
        X: matrix-like object supporting `len`, `.T` and `@`.
        y_obs: observed values.
    """
    scale = -2 / len(X)
    xt_y = X.T @ y_obs
    xt_x_theta = X.T @ X @ theta
    return scale * (xt_y - xt_x_theta)

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	bias	total_bill
0	1	16.99
1	1	10.34
2	1	21.01
3	1	23.68
4	1	24.59

	theta_0	theta_1
9990	0.888098	0.106378
9991	0.888108	0.106378
9992	0.888119	0.106377
9993	0.888130	0.106377
9994	0.888141	0.106376
9995	0.888151	0.106376
9996	0.888162	0.106375
9997	0.888173	0.106375
9998	0.888184	0.106375
9999	0.888194	0.106374

Gradient Descent

Minimizing an Arbitrary 1D Function

The Naive Approach: Guess and Check

`scipy.optimize.minimize`

Better Approach: Gradient Descent

Manually Descending the Gradient

Formalizing Gradient Descent

Gradient Descent on a 1D Model

Gradient Descent on Multi-Dimensional Models

Defining a 2D MSE Function

Applying Gradient Descent in 2D

Extra: The General Matrix Form of the Gradient

Gradient Descent

Minimizing an Arbitrary 1D Function

The Naive Approach: Guess and Check

`scipy.optimize.minimize`

Better Approach: Gradient Descent

Manually Descending the Gradient

Formalizing Gradient Descent

Gradient Descent on a 1D Model

Gradient Descent on Multi-Dimensional Models

Defining a 2D MSE Function

Applying Gradient Descent in 2D

Extra: The General Matrix Form of the Gradient

`scipy.optimize.minimize`