import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Configure nice plotting defaults - (this must be done in a cell separate
# from %matplotlib call)
plt.style.use('seaborn')
sns.set_context('talk', font_scale=1.4)
plt.rcParams['figure.figsize'] = (10, 7)
We're also going to use plotly for interactive plots.
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')
A waiter maintained a record of his tips, total bill, and information about the person who paid the bill.
data = sns.load_dataset("tips")
print("Number of Records:", len(data))
data.head()
Perhaps the waiter was interested in estimating the percentage tip he typically receives, or in predicting the tip from information about the bill and the party. We build models to help answer both of these questions.
sns.distplot(data['tip'], rug=True)
plt.xlabel("Tip in Dollars")
plt.savefig("tip_in_dollars.pdf")
sns.distplot(data['total_bill'], rug=True)
plt.xlabel("Total Bill in Dollars")
plt.savefig("bill.pdf")
In many countries it is common to tip a percentage of the bill.
$$\Large \texttt{pcttip} = \frac{\texttt{tip}}{\texttt{totalbill}} * 100 $$
However, the exact percentage varies by region, dining environment, and service quality.
Question: What is the percentage tip that this waiter receives? This will be our first estimation goal.
We will extend our data with an additional derived variable containing the percent tip:
data['pcttip'] = data['tip']/data['total_bill'] * 100
Examining its distribution we see:
sns.distplot(data['pcttip'], rug=True)
plt.xlabel("Percent Tip")
plt.savefig("percent_tip.pdf")
There are some large outliers that are skewing the plot. Why? What might be an explanation?
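One quick way to investigate (a sketch using only the data itself) is to look at the records with the largest percent tips and how they relate to the size of the total bill:
# Inspect the records with the largest percent tips
data.sort_values('pcttip', ascending=False).head()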
text = ["bill $" + str(bill) + ", tip $" + str(tip) for
(bill,tip) in data[['total_bill', 'tip']].itertuples(index=False)]
py.iplot(ff.create_distplot([data['pcttip']], group_labels=["Percent Tip"],
rug_text=[text]))
We can try to remove the outliers to get a better picture.
pcttips = data['pcttip']
# Removing data points that are greater than the 99% quantile
sns.distplot(pcttips[pcttips < pcttips.quantile(0.99)], rug=True)
plt.xlabel("Percent Tip")
plt.savefig("percent_tip.pdf")
pcttips = data['pcttip']
# Removing data points that are greater than the 99% quantile
# sns.distplot(pcttips[pcttips < pcttips.quantile(0.99)], rug=True)
py.iplot(ff.create_distplot([pcttips[pcttips < pcttips.quantile(0.99)]], group_labels=["Percent Tip"],
bin_size=.5))
Let's begin with the goal of estimating the percentage tip that this waiter receives. The percentage tip could depend on many factors, including the time of day, the characteristics of the patrons, and the quality of the service.
However, let's start simple by ignoring these factors and defining the percentage tip as a constant value:
$$\Large \text{percentage tip} = \theta^* $$
Here the parameter $\theta^*$ is the true tip rate. We don't know this value, but we believe it exists. The $*$ indicates that this is the unknown quantity that the universe determines, not a value we choose.
Is this model correct? Maybe ... probably not. However, this model probably provides reasonable predictions about future tips and might provide some insight into common tipping practices.
How do we estimate the value for $\theta^*$ from our data? This is the fundamental question of estimation.
There are actually many ways we could estimate this parameter.
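For instance, here is a quick sketch of a few simple candidate estimates computed directly from the data (not yet tied to any loss function):
# A few simple candidate estimates of the tip percentage
print("Mean:                 ", data['pcttip'].mean())
print("Median:               ", data['pcttip'].median())
print("Most common (rounded):", data['pcttip'].round().mode()[0])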
Which of these strategies is best? It depends on our goals.
One method to estimate the value of the parameter $\theta^*$ is to define an objective function which characterizes the quality of a particular choice of $\theta$ with respect to the data. We can then choose the best parameter $\theta$ with respect to this objective function.
How do we define an objective function?
Perhaps a natural objective function would penalize choices of $\theta$ that are inconsistent with the data.
At the core of most estimation procedures is a loss function which measures how well a model fits the data. Suppose we observe a single tip percentage $y$ and our model predicts $\theta$ as our guess for $\theta^*$; then any of the following might be an appropriate loss function:
$$\Large L\left(\theta, y \right) = \left( y - \theta \right)^2 \qquad \text{(squared loss)}$$
$$\Large L\left(\theta, y \right) = \left| y - \theta \right| \qquad \text{(absolute loss)}$$
$$\Large L_\alpha\left( \theta, y \right) = \begin{cases} \frac{1}{2}\left( y - \theta \right)^2 & \left| y - \theta \right| < \alpha \\ \alpha \left(\left| y - \theta \right| - \frac{\alpha}{2} \right) & \text{otherwise} \end{cases} \qquad \text{(Huber loss)}$$
These loss functions effectively measure the error in our estimate relative to the data. Let's examine how they work on real observations.
Suppose we observe a single tip percentage of $16\%$:
y_obs = 16
We will try a range of possible $\theta$ values
thetas = np.linspace(0, 50, 200)
$$\Large L\left(\theta, y \right) = \left( y - \theta \right)^2 $$
def squared_loss(est, y_obs):
return (est - y_obs)**2
thetas = np.linspace(0, 50, 200)
loss = squared_loss(thetas, y_obs)
plt.plot(thetas, loss, label="Squared Loss")
plt.vlines(y_obs, -100, 0, colors="r", label="Observation")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend(loc=4)
plt.savefig("l2_plt.pdf")
From the above plot we see that the value for $\theta$ that minimizes the loss corresponds to $\theta = y = 16\%$. Let's look at the other loss functions:
$$\Large L\left(\theta, y \right) = \left| y - \theta \right| $$
def abs_loss(est, y_obs):
return np.abs(y_obs - est)
thetas = np.linspace(0, 50, 200)
loss = abs_loss(thetas, y_obs)
plt.plot(thetas, loss, label="Abs Loss")
plt.vlines(y_obs, -5, -2,colors="r", label="Observation")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig("l1_plt.pdf")
Again we see that the loss function is minimized when $\theta = y = 16\%$. Finally, we consider the Huber loss, which behaves like the squared loss near the observation and like the absolute loss far from it:
$$\Large L_\alpha\left( \theta, y \right) = \begin{cases} \frac{1}{2}\left( y - \theta \right)^2 & \left| y - \theta \right| < \alpha \\ \alpha \left(\left| y - \theta \right| - \frac{\alpha}{2} \right) & \text{otherwise} \end{cases} $$
def huber_loss(est, y_obs, alpha = 1):
d = abs_loss(est, y_obs)
return np.where(d < alpha,
squared_loss(est, y_obs) / 2.0,
alpha * (d - alpha / 2.0))
thetas = np.linspace(0, 50, 200)
loss = huber_loss(thetas, y_obs, alpha=5)
plt.plot(thetas, loss, label="Huber Loss")
plt.vlines(y_obs, -20, -5,colors="r", label="Observation")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend()
plt.savefig('huber_loss.pdf')
Let's define huberf as just the loss function, fixing $\theta = 0$:
def huberf(y, α=1):
ay = abs(y)
return np.where(ay < α, (y**2) / 2, α * (ay - α/2))
from ipywidgets import interact
y = np.linspace(-10, 10)
h1 = huberf(y, α=1)
y2 = .5*y**2
@interact
def _(α=(0, 8, .25)):
plt.plot(y, h1, label=rf"$\alpha = 1$")
plt.plot(y, huberf(y, α=α), label=rf"$\alpha = {α:.2f}$")
plt.plot(y, y2, label=r"$\frac{1}{2} y^2$")
plt.legend(loc='upper center')
plt.axvline(-α, color='r', lw=1, ls='-.')
plt.axvline( α, color='r', lw=1, ls='-.')
plt.ylim(0, 20)
The following plot shows all three loss functions together.
thetas = np.linspace(0, 50, 200)
plt.plot(thetas, squared_loss(thetas, y_obs), label="Squared Loss")
plt.plot(thetas, abs_loss(thetas, y_obs), color='k', label="Abs Loss")
plt.plot(thetas, huber_loss(thetas, y_obs, alpha=2), color='m', label="Huber Loss")
plt.vlines(y_obs, -10, -5, colors="r", label="Observation")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.ylim([-20, 50])
plt.legend()
plt.savefig("loss_functions_compared.pdf")
The loss functions we have examined so far define the loss of the model with respect to a single data point. How can we extend this notion of loss to the entire dataset?
A simple and intuitive way to extend the loss to the entire dataset is to compute the average loss over all the data points. More formally, let the dataset $\mathcal{D}$ be the set of observations:
$$\Large \mathcal{D} = \{y_1, \ldots, y_n\} $$
where each $y_i$ corresponds to a single tip percentage and there are $n=244$ observations in our dataset.
Then we can define the average loss over the dataset as:
$$\Large L\left(\theta, \mathcal{D}\right) = \frac{1}{n} \sum_{i=1}^n L(\theta, y_i) $$
def avg_loss(loss, est, data):
return np.mean(np.array([loss(est, y_obs) for y_obs in data]), axis=0)
thetas = np.linspace(0, 50, 200)
loss = avg_loss(squared_loss, thetas, data['pcttip'])
plt.plot(thetas, loss, label="Squared Loss")
plt.vlines(data['pcttip'], -100, -30, colors="r", linewidth=0.4, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend(loc=5)
print("Minimizing Theta", thetas[np.argmin(loss)])
plt.savefig("l2_avg_loss.pdf")
thetas = np.linspace(0, 50, 200)
loss = avg_loss(abs_loss, thetas, data['pcttip'])
plt.plot(thetas, loss, label="Absolute Loss")
plt.vlines(data['pcttip'], -5, -2, colors="r", linewidth=0.4, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend(loc=5)
print("Minimizing Theta", thetas[np.argmin(loss)])
plt.savefig("l1_avg_loss.pdf")
thetas = np.linspace(0, 50, 200)
loss = avg_loss(huber_loss, thetas, data['pcttip'])
plt.plot(thetas, loss, label="Huber Loss")
plt.vlines(data['pcttip'], -5, -2, colors="r", linewidth=0.4, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend(loc=5)
print("Minimizing Theta", thetas[np.argmin(loss)])
plt.savefig("huber_avg_loss.pdf")
thetas = np.linspace(0, 50, 200)
a = plt.plot(thetas, avg_loss(squared_loss, thetas, data['pcttip'])/10, label="Squared Loss / 10 ")
plt.vlines(thetas[np.argmin(avg_loss(squared_loss, thetas, data['pcttip']))], 0, 20, linestyle='--',
color = a[0].get_color())
a = plt.plot(thetas, avg_loss(abs_loss, thetas, data['pcttip']), label="Absolute Loss")
plt.vlines(thetas[np.argmin(avg_loss(abs_loss, thetas, data['pcttip']))], 0, 20,
color = a[0].get_color(),linewidth=4)
a = plt.plot(thetas, avg_loss(huber_loss, thetas, data['pcttip']), label="Huber Loss")
plt.vlines(thetas[np.argmin(avg_loss(huber_loss, thetas, data['pcttip']))], 0, 20, linestyle='--',
color = a[0].get_color(),linewidth=2)
plt.vlines(data['pcttip'], -5, -2, colors="r", linewidth=0.4, label="Observations")
plt.xlabel(r"Choice for $\theta$")
plt.ylabel(r"Loss")
plt.legend(loc=5)
plt.ylim([-7,15])
plt.xlim([5,30])
plt.savefig("all_loss.pdf")
In the following plot we sample 5 data points at random and plot the various loss functions. A sample of 5 points is used to make it easier to visualize the corners in the absolute loss.
thetas = np.linspace(0, 50, 200)
np.random.seed(42) # seed the generator so plot doesn't change
tmp = data['pcttip'].sample(5)
loss_top = 20 # Need a top for the red lines
# Add the data points to the candidate thetas to enable visualization of corners
theta_sharp = np.sort(np.hstack((thetas, tmp)))
lines = [ # Plot the Loss functions
go.Scatter(x = theta_sharp, y = avg_loss(squared_loss, theta_sharp, data['pcttip'])-30, name='Squared Loss - 30'),
go.Scatter(x = theta_sharp, y = avg_loss(abs_loss, theta_sharp, tmp), name='Abs. Loss'),
go.Scatter(x = theta_sharp, y = avg_loss(huber_loss, theta_sharp, tmp), name='Huber Loss')
] + [ # Plot the red lines corresponding to data points
go.Scatter(x = [yobs, yobs], y=[0, loss_top],
mode='lines', showlegend=False,
line=dict(color='red', width = 0.5)) for yobs in tmp
]
# Assemble the plot
py.iplot(go.Figure(data = lines,
layout=dict(xaxis=dict(title="Theta Values"),
yaxis=dict(range=[0, loss_top]))))
In the above plot, observe that the squared and Huber losses remain smooth while the absolute loss has a corner at each sampled observation.
The differences in the above loss functions largely reduce to their sensitivity to outliers. Recall the distribution of tips (plotted below for interactive exploration).
py.iplot(ff.create_distplot([data['pcttip']], group_labels=["Percent Tip"],
rug_text=[["$" + str(s) for s in data['total_bill'].values]]))
Below we plot the contribution of each data point to the loss function (holding $\theta=15\%$)
yobs = np.sort(data['pcttip'])
theta_est = 15.0
plt.plot(yobs, squared_loss(yobs, theta_est) / np.sum(squared_loss(yobs, theta_est)), '-*', label='Squared Loss')
plt.plot(yobs, abs_loss(yobs, theta_est) / np.sum(abs_loss(yobs, theta_est)), '-*', label='Absolute Loss')
plt.plot(yobs, huber_loss(yobs, theta_est) / np.sum(huber_loss(yobs, theta_est)), '-*', label='Huber Loss')
plt.xlabel(r"Tip as % of total bill")
plt.ylabel(r"Proportion of the Average Loss")
plt.legend()
# plt.ylim([-7,15])
# plt.xlim([5,30])
plt.savefig("prop_loss.pdf")
yobs = np.sort(data['pcttip'])
theta_est = 15.0
lines = [ # Plot the Loss functions
go.Scatter(x = yobs, y = squared_loss(yobs, theta_est) / np.sum(squared_loss(yobs, theta_est)) ,
name='Squared Loss', mode='lines+markers'),
go.Scatter(x = yobs, y = abs_loss(yobs, theta_est) / np.sum(abs_loss(yobs, theta_est)) ,
name='Absolute Loss', mode='lines+markers'),
go.Scatter(x = yobs, y = huber_loss(yobs, theta_est) / np.sum(huber_loss(yobs, theta_est)) ,
name='Huber Loss', mode='lines+markers')
]
# Assemble the plot
py.iplot(go.Figure(data = lines,
layout=dict(xaxis=dict(title="Theta Values"),
yaxis=dict(title="Fraction of the Loss"))))
Notice that the squared loss is much more sensitive to the far right data point than the Huber or Absolute loss functions.
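To make this sensitivity concrete, here is a small sketch (reusing avg_loss and the loss functions defined above, and the same 0.99 quantile cutoff used earlier) comparing the minimizing $\theta$ on the full data against the data with the largest percent tips removed:
# Compare the minimizing theta for each loss, with and without the largest percent tips
thetas = np.linspace(0, 50, 200)
trimmed = data['pcttip'][data['pcttip'] < data['pcttip'].quantile(0.99)]
for name, loss in [("Squared", squared_loss), ("Absolute", abs_loss), ("Huber", huber_loss)]:
    full_min = thetas[np.argmin(avg_loss(loss, thetas, data['pcttip']))]
    trim_min = thetas[np.argmin(avg_loss(loss, thetas, trimmed))]
    print(f"{name:8s} loss: full data -> {full_min:.2f}, outliers removed -> {trim_min:.2f}")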
In some cases, it is possible to use calculus to analytically compute the parameters $\theta$ that minimize the loss function. In particular, the use of the squared loss can often lead to a simple closed form estimator.
Using our earlier definition for the squared loss and data we can state the formal estimator $\hat{\theta}$:
$$\Large \hat{\theta} = \arg \min_{\theta} \frac{1}{n}\sum_{i=1}^n \left( \theta - y_i \right)^2 $$
To solve for the minimum of the loss function we will need to use some basic concepts from calculus.
Suppose we wanted to minimize the function:
$$\Large f(\theta) = (\theta - 3)^2 $$
Before plotting $f$ itself, the next two plots show cases where a zero derivative does not indicate a minimum: a cubic, whose critical point is an inflection point, and a concave quadratic, whose critical point is a maximum:
plt.plot(thetas, (thetas-3)**3, 'k', label="f")
plt.plot(thetas, 3*(thetas-3)**2, '-.', label=r"$\partial f / \partial \theta$")
plt.plot([3], [0], 'ro', label="Critical Point")
plt.legend(); plt.ylim(-5,10); _= plt.xlim(0, 6)
plt.savefig("calc_cubic.pdf")
plt.plot(thetas, -(3-thetas)**2, 'k', label="f")
plt.plot(thetas, 2*(3-thetas), '-.', label=r"$\partial f / \partial \theta$")
plt.plot([3], [0], 'ro', label="Critical Point")
plt.legend(); plt.ylim(-10,4); _= plt.xlim(0, 6)
plt.savefig("calc_concave.pdf")
Here is $f(\theta) = (\theta - 3)^2$ itself, along with its derivative:
f = lambda theta: (theta - 3)**2
df = lambda theta: 2 * (theta - 3)
ddf = lambda theta: 2 * np.ones(theta.shape)
thetas = np.linspace(0, 6, 100)
plt.axhline(0)
plt.plot(thetas, f(thetas), 'k', label="f")
plt.plot(thetas, df(thetas), '-.', label=r"$\partial f / \partial \theta$")
# plt.plot(thetas, ddf(thetas), '--', label=r"$\partial^2 f / \partial \theta^2$")
plt.plot([3],[0], 'ro', label="Minimizer")
plt.legend(); plt.ylim(-5,10); _= plt.xlim(0, 6)
plt.savefig("calc_rev.pdf")
We can find extreme points (local maximum and minimum) of $f(\theta) = (\theta - 3)^2$ by taking the derivative:
$$\Large \frac{\partial}{\partial \theta} f(\theta) = 2 (\theta - 3) $$
and setting to zero and solving for $\theta$
\begin{align} \Large 2 (\theta - 3) &= \Large 0 \\ \Large \theta - 3 &= \Large 0 \\ \Large \theta &= \Large 3 \end{align}
Is this a minimum or a maximum?
To determine if the extreme point is a minimum or a maximum we examine the second derivative:
$$\Large \frac{\partial^2}{\partial \theta^2} f(\theta) = 2 $$
If the second derivative is positive at the extreme point (where the first derivative is zero) then the point is a minimum (the function is curved upwards).
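As a quick check of this calculation, here is a sketch using sympy (which is not used elsewhere in this notebook) to compute the derivatives symbolically:
import sympy
theta = sympy.symbols('theta')
f_sym = (theta - 3)**2
print("First derivative: ", sympy.diff(f_sym, theta))       # 2*theta - 6
print("Second derivative:", sympy.diff(f_sym, theta, 2))    # 2 > 0, so the critical point is a minimum
print("Critical point:   ", sympy.solve(sympy.diff(f_sym, theta), theta))  # [3]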
To simplify notation we define the function that we want to minimize:
$$\Large f(\theta) = \frac{1}{n}\sum_{i=1}^n \left(y_i - \theta\right)^2 $$
and take the first derivative:
$$\Large \frac{\partial}{\partial \theta} f(\theta) = -\frac{2}{n}\sum_{i=1}^n \left( y_i - \theta \right) $$
Setting the derivative equal to zero:
\begin{align} 0 &= -\frac{2}{n}\sum_{i=1}^n \left( y_i - \theta \right) \\ 0 &= \sum_{i=1}^n \left( y_i - \theta \right) \\ 0 &= \left(\sum_{i=1}^n y_i\right) - \left(\sum_{i=1}^n \theta\right) \\ 0 &= \left(\sum_{i=1}^n y_i\right) - n \theta \\ n \theta &= \sum_{i=1}^n y_i \\ \theta &= \frac{1}{n} \sum_{i=1}^n y_i \\ \end{align}
Thus the value for $\theta$ that minimizes the average squared loss is:
$$\Large \hat{\theta} = \frac{1}{n} \sum_{i=1}^n y_i $$
While we have already graphically seen that the squared loss is curved upwards we can double check by taking the second derivative:
$$\Large f(\theta) = \frac{1}{n}\sum_{i=1}^n \left( \theta - y_i \right)^2 $$
$$\Large \frac{\partial}{\partial \theta} f(\theta) = \frac{2}{n}\sum_{i=1}^n \left( \theta - y_i \right) $$
$$\Large \frac{\partial^2}{\partial \theta^2} f(\theta) = \frac{2}{n}\sum_{i=1}^n 1 = 2 $$
which is positive when the derivative is zero (and indeed everywhere) and therefore our extreme point is a minimizer.
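We can double-check this closed-form answer numerically; here is a quick sketch comparing the sample mean against a numerical minimizer (minimize_scalar from scipy is an assumption, not used elsewhere in this notebook):
from scipy.optimize import minimize_scalar
# The sample mean should match the numerical minimizer of the average squared loss
numerical = minimize_scalar(lambda theta: np.mean(squared_loss(theta, data['pcttip'])))
print("Sample mean:        ", data['pcttip'].mean())
print("Numerical minimizer:", numerical.x)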
Minimizing the absolute loss analytically can be a bit more challenging. However, for our simple constant model we can derive an expression for the optimal $\theta$.
Just as with the average squared loss, we want to minimize:
$$\Large f(\theta) = \frac{1}{n}\sum_{i=1}^n \left| y_i - \theta \right| $$
Just as before, we take the first and second derivative of the loss function with respect to $\theta$. First, a picture of the absolute loss for a single observation at $y = 3$, along with its derivative:
f = lambda theta: np.abs(3 - theta)
df = lambda theta: -np.sign(3 - theta)
ddf = lambda theta: np.zeros(theta.shape)
thetas = np.linspace(2, 4, 1000)
plt.plot(thetas, f(thetas), 'k', label="f")
plt.plot(thetas, df(thetas), '.', label=r"$\partial f / \partial \theta$")
# plt.plot(thetas, ddf(thetas), '--', label=r"$\partial^2 f / \partial \theta^2$")
plt.plot([3],[0], 'ro', label="Minimizer")
plt.legend();
plt.savefig("l1_deriv.pdf")
$$\Large \frac{\partial}{\partial \theta} f(\theta) = -\frac{1}{n}\sum_{i=1}^n \textbf{sign}(y_i -\theta) $$
$$\Large \frac{\partial^2}{\partial \theta^2} f(\theta) = 0 $$
It is important to note that neither of the above expressions is defined at $\theta = y_i$.
We will set the sign of 0 to be zero, so that terms with $y_i = \theta$ contribute nothing to the derivative:
$$\Large \textbf{sign}(0) = 0 $$
This is actually consistent with python:
np.sign(0)
While the derivative of the absolute value function is not defined at zero, we can define $\textbf{sign}(0) = 0$ and still attempt to find a $\theta$ value that sets the derivative to zero.
Notice that we can rewrite the derivative as:
$$\Large \frac{\partial}{\partial \theta} f(\theta) = -\frac{1}{n}\sum_{i=1}^n \textbf{sign}(y_i - \theta) = -\frac{1}{n}\left( \sum_{y_i < \theta} -1 + \sum_{y_i > \theta} +1 \right) $$
Setting this derivative to zero (the leading minus sign does not affect where it vanishes):
$$\Large \frac{1}{n}\left( \sum_{y_i < \theta} -1 + \sum_{y_i > \theta} +1 \right) = 0 $$
Notice that for the above equation to be zero, we need the number of data points to the left of $\theta$ to equal the number of data points to the right of $\theta$.
thetas = np.linspace(0, 50, 200)
x = np.array([5,10,30,40])
plt.plot(thetas, avg_loss(abs_loss, thetas, x)-10, '-k', label="Absolute Loss")
plt.plot(thetas, [np.mean(-np.sign(x - u)) for u in thetas], '.', label='derivative')
plt.vlines(x, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Percent Tip ($\theta$)')
plt.legend(loc=9)
plt.savefig("abslos_4.pdf")
thetas = np.linspace(0, 50, 200)
x = np.array([5,10,20, 30,40])
plt.plot(thetas, avg_loss(abs_loss, thetas, x)-10, '-k', label="Absolute Loss")
plt.plot(thetas, [np.mean(-np.sign(x - u)) for u in thetas], '.', label='derivative')
plt.vlines(x, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Percent Tip ($\theta$)')
plt.legend(loc=9)
plt.savefig("abslos_5.pdf")
Notice that $\frac{\partial}{\partial \theta} f(\theta)$ is zero precisely when half of the data points are above $\theta$ and half are below, which is the definition of the median:
$$\Large \hat{\theta} = \arg\min_\theta \frac{1}{n}\sum_{i=1}^n \left| y_i - \theta \right| = \textbf{median}(\mathcal{D}) $$
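We can again check this numerically against the grid of $\theta$ values used above (the grid spacing limits how closely the two numbers agree):
# The sample median should approximately minimize the average absolute loss
thetas = np.linspace(0, 50, 200)
loss = avg_loss(abs_loss, thetas, data['pcttip'])
print("Sample median:             ", data['pcttip'].median())
print("Grid minimizer (abs. loss):", thetas[np.argmin(loss)])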
Recall that the Huber loss has the form:
$$\Large f(\theta) =\frac{1}{n}\sum_{i=1}^n \begin{cases} \frac{1}{2}\left( y_i - \theta \right)^2 & \left| y_i - \theta \right| < \alpha \\ \alpha \left(\left| y_i - \theta \right| - \frac{\alpha}{2} \right) & \text{otherwise} \end{cases} $$
Taking the derivative we get:
$$\Large \frac{\partial}{\partial \theta} f(\theta) =\frac{1}{n}\sum_{i=1}^n \begin{cases} -\left(y_i - \theta \right) & \left| \theta - y_i \right| < \alpha \\ -\alpha \, \textbf{sign}(y_i - \theta) & \text{otherwise} \end{cases} $$
Unfortunately this is difficult to solve analytically. However, we can plot what the derivative looks like:
alpha =1.0
f = lambda theta: huber_loss(theta, 3, alpha = alpha)
def huber_loss_derivative(est, y_obs, alpha=1):
d = abs_loss(est, y_obs)
return np.where(d < alpha,
est - y_obs,
alpha * np.sign(est-y_obs))
df = lambda theta: huber_loss_derivative(theta, 3.0, alpha=alpha)
thetas = np.linspace(1, 5, 1000)
plt.plot(thetas, f(thetas), 'k', label="f")
plt.plot(thetas, df(thetas), '.', label=r"$\partial f / \partial \theta$")
# plt.plot(thetas, ddf(thetas), '--', label=r"$\partial^2 f / \partial \theta^2$")
plt.plot([3],[0], 'ro', label="Minimizer")
plt.legend();
Notice that for the default value $\alpha = 1$ the curves look similar to the absolute loss but with smooth corners.
thetas = np.linspace(0, 50, 200)
x = np.array([5,10,30,40])
plt.plot(thetas, avg_loss(huber_loss, thetas, x)-10, '-k', label="Huber Loss")
plt.plot(thetas, avg_loss(huber_loss_derivative, thetas, x), '.', label='derivative')
plt.vlines(x, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Percent Tip ($\theta$)')
plt.legend(loc=9)
plt.savefig("huberloss_4.pdf")
thetas = np.linspace(0, 50, 200)
x = np.array([5,10,20, 30,40])
plt.plot(thetas, avg_loss(huber_loss, thetas, x)-10, '-k', label="Huber Loss")
plt.plot(thetas, avg_loss(huber_loss_derivative, thetas, x), '.', label='derivative')
plt.vlines(x, -3, -2, colors="r", label="Observations")
plt.xlabel(r'Percent Tip ($\theta$)')
plt.legend(loc=9)
plt.savefig("huberloss_5.pdf")
Notice in the following figure that the larger $\alpha$ value results in a unique minimizer.
alpha = 10
huber_alpha = lambda a, b: huber_loss(a,b,alpha=alpha)
huber_derivative_alpha = lambda a, b: huber_loss_derivative(a,b,alpha)
thetas = np.linspace(0, 50, 200)
x = np.array([5,10,30,40])
plt.plot(thetas, avg_loss(huber_alpha, thetas, x)-80, '-k', label="Huber Loss")
plt.plot(thetas, avg_loss(huber_derivative_alpha, thetas, x), '.', label='derivative')
plt.vlines(x, -10, -2, colors="r", label="Observations")
plt.xlabel(r'Percent Tip ($\theta$)')
plt.ylim([-20,50])
plt.legend(loc=9)
plt.savefig("huberloss_4_alpha_10.pdf")
Because the Huber loss is smooth and convex, it can be efficiently minimized numerically; here we use scipy.optimize.minimize with the derivative we derived above.
from scipy.optimize import minimize
f = lambda theta: data['pcttip'].apply(
lambda y: huber_loss(theta, y)).mean()
df = lambda theta: data['pcttip'].apply(
lambda y: huber_loss_derivative(theta, y)).mean()
minimize(f,0.0, jac=df)
So far we have taken a very simple model that the percentage tip is constant:
$$\Large \text{percentage tip} = \theta $$
We then defined several loss functions and used those to estimate the value of $\theta$. How can we improve this model? Recall that we do have additional information:
data.head()
What might be some alternative models?
sns.lmplot(x = "total_bill", y = "pcttip", data = data, aspect=1.6)
plt.savefig("ptip_total_bill.pdf")
def f(theta, total_bill):
return theta[0] + theta[1] * total_bill
def l2(theta):
return np.mean(squared_loss(f(theta, data['total_bill']), data['pcttip']).values)
minimize(l2, np.array([0.0,0.0]))
def l1(theta):
return np.mean(abs_loss(f(theta, data['total_bill']), data['pcttip']).values)
minimize(l1, np.array([0.0,0.0]))
def huber(theta):
return np.mean(huber_loss(f(theta, data['total_bill']), data['pcttip']))
minimize(huber, np.array([0.0,0.0]))
We can visualize the surface of our three loss functions in 3d:
# Make range of values for thetas
uvalues = np.linspace(16,22,70)
vvalues = np.linspace(-.5,0,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
# res = {}
for loss_name, loss in [("L1", l1), ("L2", l2), ("Huber", huber)]:
loss_values = np.array([loss(t) for t in thetas.T])
loss_surface = go.Surface(name=loss_name,
x=u, y=v, z=np.reshape(loss_values, u.shape),
contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
)
scene=go.Scene(
xaxis=go.XAxis(title='w0'),
yaxis=go.YAxis(title='w1'),
aspectratio=dict(x=2.,y=2., z=1.),
# camera=dict(eye=dict(x=-2, y=-2, z=2))
)
layout = go.Layout(
autosize=True,
width=800,
height=600,
scene = scene
)
ind = np.argmin(loss_values)
optimal_point = go.Scatter3d(name = "Optimal Point for " + loss_name,
x = [thetas.T[ind,0]], y = [thetas.T[ind,1]],
z = [loss_values[ind]],
marker=dict(size=10, color="red"))
py.iplot(go.Figure(data=[loss_surface, optimal_point], layout = layout))
data['isSmall'] = data['size'] < 4
sns.boxplot(x = "isSmall", y = "pcttip", data = data)
plt.savefig("small_tables_model.pdf")
sns.boxplot(x = "time", y = "pcttip", data = data)
plt.savefig("isdinner_model.pdf")
data.head()
def f(theta, data):
return (
theta[0] +
theta[1] * (data['sex'] == 'Male') +
theta[2] * (data['smoker'] == "Yes") +
theta[3] * data['size']
)
def l2(theta):
return np.mean(squared_loss(f(theta, data), data['pcttip']).values)
minimize(l2, np.zeros(4))
def l1(theta):
return np.mean(abs_loss(f(theta, data), data['pcttip']).values)
minimize(l1, np.zeros(4))
def huber(theta):
return np.mean(huber_loss(f(theta, data), data['pcttip']))
minimize(huber, np.zeros(4))
We could use the same loss framework as before and compute the best parameters for each model and then choose the model with the smallest average loss.
Warning: There are some issues with this approach that we will revisit later.
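With that warning in mind, here is a minimal sketch of how such a comparison could look. It refits the constant model, the total_bill model, and the sex/smoker/size model from above under the squared loss and reports each model's average training loss (the model functions are redefined locally with hypothetical names so as not to clobber the f defined earlier):
# Compare the average (training) squared loss of three models.
# Caveat: lower training loss does not by itself mean a better model.
def constant_model(theta, data):
    return np.full(len(data), theta[0])
def bill_model(theta, data):
    return theta[0] + theta[1] * data['total_bill']
def table_model(theta, data):
    return (theta[0] + theta[1] * (data['sex'] == 'Male')
            + theta[2] * (data['smoker'] == 'Yes') + theta[3] * data['size'])
for name, model, nparams in [("Constant", constant_model, 1),
                             ("Total bill", bill_model, 2),
                             ("Sex/smoker/size", table_model, 4)]:
    objective = lambda theta: np.mean(squared_loss(model(theta, data), data['pcttip']))
    res = minimize(objective, np.zeros(nparams))
    print(f"{name:16s} average squared loss: {res.fun:.3f}")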