from scipy.optimize import minimize
minimize(loss, x0 = ...)
So... let's call it a day?
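In principle, yes. A minimal sketch (the quadratic loss below is a made-up stand-in, since the real $\ell$ has not been defined yet):

import numpy as np
from scipy.optimize import minimize
# Toy stand-in loss; any differentiable function of theta would do
loss = lambda theta: (theta[0] - 1)**2 + (theta[1] + 2)**2
res = minimize(loss, x0 = np.zeros(2))
print(res.x)   # ≈ [ 1., -2.], the minimizer

The rest of this section unpacks what such a routine is doing under the hood.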
<img src="nytrecs.png", width=50%>
Design a model: a vector of "weights" $\theta = (\theta_1, \theta_2)$ such that $r_{ui} \approx x_i \cdot \theta = x_{i1}\theta_1 + x_{i2}\theta_2$.
Notice that the right-hand side above does not depend on $u$, so this model is not personalized.
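A toy numeric example (made-up numbers): if item $i$ has features $x_i = (0.9, 0.1)$, say "action" and "romance" scores, then with weights $\theta = (4.0, 2.0)$ every user gets the same predicted rating:

import numpy as np
x_i = np.array([0.9, 0.1])      # hypothetical item features
theta = np.array([4.0, 2.0])    # hypothetical learned weights
print(x_i @ theta)              # 3.8, the same for every user u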
<img src="volinsky.png", width=50%>
(Koren, Bell & Volinsky 2009)
An unconstrained optimization problem is denoted
$$\min_x f(x),$$
where $f$ is the objective function to be minimized.
In our context, we have a loss function $\ell(\theta)$ and want to solve
$$\min_\theta \ell(\theta),$$
which is to say we want to find $\hat\theta$ such that $\ell(\hat\theta) \leq \ell(\theta)$ for all $\theta$.
Say $\ell(\theta) = (\theta - 1)^2$.
import numpy as np
import matplotlib.pyplot as plt

# Plot the quadratic loss and mark its unique minimizer at theta = 1
plt.figure(figsize=(8,5))
x = np.arange(-5, 5, .01)
plt.plot(x, (x-1)**2, 'b')
plt.plot(1, 0.0, '|', color = 'r',
         markersize = 20, markeredgewidth = 4)
plt.xlabel('$\\theta$')
plt.ylabel('$\\ell(\\theta)$')
plt.show()
Now $\ell(\theta) = (\theta - 1)^2 + (\theta + 2)^2$.
plt.figure(figsize=(8,5))
theta = np.arange(-4, 4, .01)
a, b = -2, 1
# The two quadratic pieces (dashed) and their sum (solid)
plt.plot(theta, (theta-a)**2, 'k--')
plt.plot(theta, (theta-b)**2, 'k--')
plt.plot(theta, (theta-a)**2+(theta-b)**2, 'b')
plt.plot([a,b], [0]*2, '|', color = 'k',
         markersize = 20, markeredgewidth = 4)
# The sum is minimized at the midpoint (a+b)/2, where it equals 4.5
plt.plot(0.5*(a+b), 4.5, '|', color = 'r',
         markersize = 20, markeredgewidth = 4)
plt.xlabel('$\\theta$')
plt.ylabel('$\\ell(\\theta)$')
plt.show()
Now $\ell(\theta) = \theta^2(\theta^2 - 4)^2$.
plt.figure(figsize=(8,5))
x = np.arange(-2.5, 2.5, .01)
plt.plot(x, x**2*(x**2-4)**2, 'b')
# Mark all five critical points: minima at -2, 0, 2 and local maxima in between
plt.plot([-2,-np.sqrt(4/3),0,np.sqrt(4/3),2],
         [0, 256/27, 0, 256/27, 0], '|', color='r',
         markersize = 20, markeredgewidth = 4)
plt.xlabel('$\\theta$')
plt.ylabel('$\\ell(\\theta)$')
plt.show()
Remember that $\frac{d\ell}{d\theta} = 0$ does not imply $\theta$ is a minimizer!
(One might try the second-derivative test.)
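A sketch of that test on $\ell(\theta) = \theta^2(\theta^2 - 4)^2$, using sympy (an assumption; it is not used elsewhere in this notebook) to classify the five critical points marked above:

import sympy as sp
t = sp.symbols('theta')
ell = t**2 * (t**2 - 4)**2
# Critical points: solutions of dl/dtheta = 0
crit = sp.solve(sp.diff(ell, t), t)
for c in crit:
    curv = sp.diff(ell, t, 2).subs(t, c)
    print(c, 'local min' if curv > 0 else 'local max' if curv < 0 else 'inconclusive')

This confirms minima at $\theta = -2, 0, 2$ and local maxima at $\theta = \pm\sqrt{4/3}$: a zero derivative alone is not enough.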
plt.figure(figsize=(20,10))
xx = np.linspace(-4,4,50)
yy = 3*(xx+1)**2 + 2
plt.subplot(1,2,1)
plt.plot(xx,yy)
# Chord between (-2, f(-2)) and (1, f(1)): the function lies below it
plt.plot(np.linspace(-2,1,50), 3*np.linspace(-2,1,50) + 11,'r-')
plt.plot([-2,1],[5,14],'kx',markersize=20,markeredgewidth=2)
plt.ylim([-20,80])
plt.title("zeroth order condition for convexity") # $f(\lambda x+(1-\lambda)y) \leq \lambda f(x) + (1-\lambda)f(y)$
plt.subplot(1,2,2)
plt.plot(xx,yy)
# Tangent line at x = 1: the function lies above it
plt.plot(xx,12*xx+2,'r-')
plt.plot(1,14,'kx',markersize=20,markeredgewidth=2)
plt.ylim([-20,80])
plt.title("first order condition for convexity") # $f(y) \geq f(x) + (y-x)\\frac{df}{dx}(x)$
plt.show()
Starting from the zeroth-order condition with $\lambda \in (0, 1]$,
$$\ell(\eta + \lambda(\theta - \eta)) \leq \ell(\eta) + \lambda\,(\ell(\theta) - \ell(\eta)).$$
Dividing both sides by $\lambda$,
$$\frac{\ell(\eta + \lambda(\theta - \eta)) - \ell(\eta)}{\lambda} \leq \ell(\theta) - \ell(\eta).$$
Rewriting the left side as a difference quotient and letting $\lambda \to 0$,
$$\underbrace{\frac{\ell(\eta + \lambda(\theta - \eta)) - \ell(\eta)}{\lambda(\theta - \eta)}}_{\to\ \ell'(\eta)} (\theta - \eta) \leq \ell(\theta) - \ell(\eta).$$
Hence
$$\ell(\eta) + \ell'(\eta)(\theta - \eta) \leq \ell(\theta).$$
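A quick numeric check of this first-order condition (a sketch, reusing the convex loss $\ell(\theta) = 3(\theta+1)^2 + 2$ from the plots above): the tangent line at any $\eta$ stays below the function everywhere.

ell = lambda t: 3*(t + 1)**2 + 2
dell = lambda t: 6*(t + 1)
theta = np.linspace(-4, 4, 200)
for eta in [-2.0, 0.0, 1.5]:
    # The tangent at eta never exceeds the function (up to rounding)
    assert np.all(ell(eta) + dell(eta)*(theta - eta) <= ell(theta) + 1e-12)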
a, b = -3, 3
x = np.arange(a, b, .05)
y = np.arange(a, b, .05)
X, Y = np.meshgrid(x, y)
# Quadratic bowl: l(theta1, theta2) = theta1^2 + 4*theta2^2
z = X**2 + 4*Y**2
p2 = Surface(name = "Loss Surface",
             x = x,
             y = y,
             z = z,
             colorscale = 'Viridis',
             reversescale = True,
             showscale = False)
paraboloid = Contour(name = "Loss Contour",
                     x = x,
                     y = y,
                     z = z,
                     colorscale = 'Viridis',
                     reversescale = True,
                     autocontour = True,
                     xaxis = 'x2',
                     yaxis = 'y2')
lay = Layout(xaxis = {'range' : [a, b],
                      'title' : '$\\theta_1$'},
             yaxis = {'range' : [a, b],
                      'title' : '$\\theta_2$'},
             scene = {"domain": {'x': [0, 0.48],
                                 'y': [0, 1]},
                      'xaxis' : {'title' : 'θ1'},
                      'yaxis' : {'title' : 'θ2'},
                      'zaxis' : {'title' : 'ℓ(θ1,θ2)'}},
             xaxis2 = {'domain' : [.52, 1],
                       'range' : [a, b],
                       'title' : '$\\theta_1$'},
             yaxis2 = {'anchor' : 'x2',
                       'range' : [a, b],
                       'title' : '$\\theta_2$'},
             autosize = False,
             width = 1000,
             height = 600)
iplot(Figure(data = Data([p2, paraboloid]), layout = lay))
Gradient: steepest ascent, perpendicular to level curves (contours)
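A numeric sanity check of the perpendicularity claim (a sketch for $\ell(\theta) = \theta_1^2 + 4\theta_2^2$, the bowl plotted above): parametrize the level curve $\theta_1^2 + 4\theta_2^2 = c$ and verify its tangent is orthogonal to the gradient $(2\theta_1, 8\theta_2)$ at every point.

c = 4.0
t = np.linspace(0, 2*np.pi, 100)
x1, x2 = np.sqrt(c)*np.cos(t), (np.sqrt(c)/2)*np.sin(t)    # points on the contour
d1, d2 = -np.sqrt(c)*np.sin(t), (np.sqrt(c)/2)*np.cos(t)   # tangent directions
print(np.max(np.abs(2*x1*d1 + 8*x2*d2)))                   # gradient . tangent ≈ 0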
import plotly.figure_factory as ff
a, b = -3, 3
x = np.arange(a, b, .05)
y = np.arange(a, b, .05)
trace1 = Contour(name = "Loss Contour",
                 x = x,
                 y = y,
                 z = z,
                 colorscale = 'Viridis',
                 reversescale = True,
                 showscale = False,
                 autocontour = True)
lay = Layout(xaxis = {'range' : [a, b],
                      'title' : '$\\theta_1$'},
             yaxis = {'range' : [a, b],
                      'title' : '$\\theta_2$'},
             width = 600,
             height = 600)
xx = np.linspace(a, b, 7)
yy = np.linspace(a, b, 7)
X, Y = np.meshgrid(xx, yy)
# Negative gradient of l(theta1, theta2) = theta1^2 + 4*theta2^2 is (-2*theta1, -8*theta2)
trace2 = ff.create_quiver(X, Y, -2*X, -8*Y,
                          scale = .025,
                          arrow_scale = .15)['data'][0]
trace2['name'] = "Negative Gradient"
trace2['marker']['color'] = 'black'
trace2['marker']['line']['width'] = 8
trace2['marker']['size'] = 8
iplot(Figure(data = Data([trace1, trace2]), layout = lay))
xx = np.linspace(-4,4,50)
f = lambda x: 3*(x+1)**2 + 2
df = lambda x: 6*(x+1)
yy = f(xx)
alpha = .2              # step size (learning rate)
x0 = 1                  # initial guess
x1 = x0 - alpha*df(x0)  # first gradient descent step
x2 = x1 - alpha*df(x1)  # second gradient descent step
plt.figure(figsize=(10,10))
plt.plot(xx,yy)
plt.plot(xx,12*xx+2,'r-')
plt.plot(x0,-19,'|k',
         markersize = 20, markeredgewidth = 4)
plt.plot(x0,f(x0),'.k',
         markersize = 15, markeredgewidth = 4)
plt.annotate('$\\theta^{(0)}$', [x0 - .1, -17])
plt.arrow(x0, -19, -alpha*df(x0), 0,
          width = .5, head_length = 0.5, length_includes_head = True)
plt.annotate('$-0.2\\ell\'(\\theta^{(0)})$', [x1-1, -17])
plt.axvline(x=-1, ls='--')
plt.ylim([-20,80])
plt.title('$\\ell(\\theta) = 3(\\theta+1)^2+2$')
plt.show()
plt.figure(figsize=(10,10))
plt.plot(xx,yy)
plt.plot(xx,12*xx+2,'r-')
plt.plot(x1,-19,'|k',
         markersize = 20, markeredgewidth = 4)
plt.plot(x1,f(x1),'.k',
         markersize = 15, markeredgewidth = 4)
plt.annotate('$\\theta^{(1)}$', [x1 - .1, -17])
plt.arrow(x1, -19, -alpha*df(x1), 0,
          width = .25, head_length = 0.25, length_includes_head = True)
plt.annotate('$-0.2\\ell\'(\\theta^{(1)})$', [x1+1, -17])
plt.axvline(x = -1, ls='--')
plt.ylim([-20, 80])
plt.title('$\\ell(\\theta) = 3(\\theta+1)^2+2$')
plt.show()
plt.figure(figsize=(10,10))
plt.plot(xx,yy)
plt.plot(xx,12*xx+2,'r-')
plt.plot(x2,-19,'|k',
         markersize = 20, markeredgewidth = 4)
plt.plot(x2,f(x2),'.k',
         markersize = 15, markeredgewidth = 4)
plt.annotate('$\\theta^{(2)}$', [x2 - .1, -17])
plt.arrow(x2, -19, -alpha*df(x2), 0,
          width = .05, head_length = 0.05, length_includes_head = True)
plt.axvline(x = -1, ls='--')
plt.ylim([-20, 80])
plt.title('$\\ell(\\theta) = 3(\\theta+1)^2+2$')
plt.show()
Initialize $\theta^{(0)}$ (zeros, a random guess, or the output of another method).
For $t$ from 0 until convergence, update
$$\theta^{(t+1)} \leftarrow \theta^{(t)} - \alpha \nabla \ell(\theta^{(t)}).$$
def gradient_descent(theta0, grad, alpha, tol = 1e-4, max_iter = 1e5):
    """Run gradient descent from theta0, returning the full path of iterates."""
    theta_path = [theta0]
    i = 0
    # Stop when the gradient is small (near a stationary point) or after max_iter steps
    while (np.linalg.norm(grad(theta_path[-1])) > tol) and (i < max_iter):
        i += 1
        theta_t = theta_path[-1]
        theta_path.append(theta_t - alpha*grad(theta_t))
    return np.array(theta_path)
import plotly.figure_factory as ff
a, b = -3, 3
x = np.arange(a, b, .05)
y = np.arange(a, b, .05)
trace1 = Contour(name = "Loss Contour",
                 x = x,
                 y = y,
                 z = z,
                 colorscale = 'Viridis',
                 reversescale = True,
                 showscale = False,
                 autocontour = True)
lay = Layout(xaxis = {'range' : [a, b],
                      'title' : '$\\theta_1$'},
             yaxis = {'range' : [a, b],
                      'title' : '$\\theta_2$'},
             width = 600,
             height = 600)
def trace2(theta0 = np.array([2.3, 2.5]), alpha = 0.01):
    # Descend on l(theta) = theta1^2 + 4*theta2^2, whose gradient is (2*theta1, 8*theta2)
    theta_path = gradient_descent(theta0,
                                  lambda theta: np.array([2*theta[0], 8*theta[1]]),
                                  alpha,
                                  tol = 1e-2)
    return Scatter(name = "Theta Path",
                   x = theta_path[:,0],
                   y = theta_path[:,1],
                   mode = "lines+markers")
title = lambda a : '$\\alpha = ' + str(a) + '$'
iplot(Figure(data = [trace1, trace2(alpha = 0.01)], layout = {**lay, **{'title': title(.01)}}))
iplot(Figure(data = [trace1, trace2(alpha = 0.1)], layout = {**lay, **{'title': title(.1)}}))
iplot(Figure(data = [trace1, trace2(alpha = 0.23)], layout = {**lay, **{'title': title(.23)}}))
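For this bowl the Hessian eigenvalues are 2 and 8, so gradient descent converges only when $\alpha < 2/8 = 0.25$; the $\alpha = 0.23$ path above barely squeaks in. A sketch of what happens just past the threshold:

# Just past alpha = 0.25 the iterates oscillate and blow up along theta_2
path = gradient_descent(np.array([2.3, 2.5]),
                        lambda theta: np.array([2*theta[0], 8*theta[1]]),
                        alpha = 0.26,
                        max_iter = 50)
print(path[-5:, 1])   # theta_2 alternates sign with growing magnitude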
import autograd.numpy as np
from autograd import grad

def loss(theta):
    ...

grad(loss)  # returns a function that computes the gradient of loss
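A concrete sketch: for $\ell(\theta) = (\theta - 1)^2$, autograd returns $\ell'(\theta) = 2(\theta - 1)$ without our ever writing the derivative.

import autograd.numpy as np
from autograd import grad

def loss(theta):
    return (theta - 1.0)**2

dloss = grad(loss)   # a function computing dl/dtheta
print(dloss(3.0))    # 4.0 == 2*(3 - 1)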
!pip install surprise
rnames = ['uid', 'iid', 'rating', 'time']
ratings = pd.read_csv('./ml-100k/u.data',
                      sep = '\t',
                      header = None,
                      names = rnames)\
            .drop(columns = ['time'])
inames = ['iid', 'title', 'release date', 'video release date', 'IMDb URL',
          'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s',
          'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
          'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
          'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('./ml-100k/u.item',
                    sep = '|',
                    header = None,
                    names = inames,
                    encoding = 'iso8859_2')\
          .drop(columns = ['release date', 'video release date',
                           'IMDb URL', 'unknown'])
item_names = items.iloc[:,[0,1]].set_index('iid')
# Total and average rating per item (aggregating over users)
tot_rating = ratings.groupby('iid').sum().drop(columns = ['uid'])
avg_rating = ratings.groupby('iid').mean().drop(columns = ['uid'])
# Keep items that are both popular (large rating total) and well liked
top_titles = avg_rating[(tot_rating > 1300) & (avg_rating > 3.82)].dropna().join(item_names)
inds = list(top_titles.index - 1)   # 0-based positions into the item-factor matrix
top_titles.sort_values('rating', ascending = False)
In this model, both user vectors $y_u$ and item vectors $x_i$ are unobserved: $r_{ui} \approx x_i \cdot y_u$.
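The factor plot below reads item vectors from a fitted model called `algo`, whose training is not shown in this section. A minimal sketch of how such a model might be fit with surprise's SVD (two factors so the vectors can be plotted; the exact settings are an assumption):

from surprise import Dataset, Reader, SVD
reader = Reader(rating_scale = (1, 5))
data = Dataset.load_from_df(ratings[['uid', 'iid', 'rating']], reader)
algo = SVD(n_factors = 2, biased = False)   # assumed settings, not from the source
algo.fit(data.build_full_trainset())
# Note: algo.qi is indexed by surprise's inner item ids; indexing it with
# iid - 1 (as below) assumes those line up with the raw ids.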
plt.figure(figsize = (17, 17))
plt.plot(algo.qi[inds, 0], algo.qi[inds, 1], 'k.')
for i in inds:
    title = item_names.loc[i+1,'title'][:-6]   # drop the trailing " (year)"
    if title.strip().endswith(', The'):
        title = "The " + title[:-6]            # move ", The" to the front
    plt.annotate(title, algo.qi[i])
plt.xticks([])
plt.yticks([])
plt.show()
"Since all models are wrong the scientist
must be alert to what is importantly wrong.
It is inappropriate to be concerned about
mice when there are tigers abroad."
- George E. P. Box, Science and Statistics