In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk")
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

Logistic Regression

This notebook accompanies the lecture on Logistic Regression.

Notebook created by Joseph E. Gonzalez for DS100.

Real Data

For this lecture we will use the famous Wisconsin Breast Cancer Dataset, which we can obtain from scikit-learn.

In [2]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# In data_dict['target'], 0 is malignant and 1 is benign
data['malignant'] = (data_dict['target'] == 0)

Plotting the raw 0/1 labels against the mean radius is a clear example of over-plotting. We can improve the plot by jittering the data:

In [3]:
jitter_y = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) -0.05
points = go.Scatter(x=data['mean radius'], y = jitter_y, mode="markers", marker=dict(opacity=0.5))
tumor_layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
py.iplot(go.Figure(data=[points], layout=tumor_layout))

Perhaps a better way to visualize the data is with overlaid histograms.

In [4]:
py.iplot(ff.create_distplot([data.loc[~data['malignant'], 'mean radius'],
                            data.loc[data['malignant'], 'mean radius']], 
                            group_labels=["Benign","Malignant"],
                           bin_size=0.5))

Always split your data into training and test groups.

Preparing the Data: Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.25, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Training Data Size:  426
Test Data Size:  143

We will define $X$ and $Y$ as variables containing the training features and labels.

In [6]:
X = data_tr[['mean radius']].values
Y = data_tr['malignant'].values.astype('float')








Empirical Probability

What does the empirical probability of being malignant look like for tumors of different sizes?

In [7]:
X_slices = np.linspace(8, 24, 10)
# For each slice location c, average the labels of all tumors whose mean radius
# falls within +/- 2.0 of c (a sliding-window estimate of P(malignant | radius)).
P_emp = np.array([
    np.mean(Y[np.squeeze(np.abs(X - c)) < 2.0]) for c in X_slices
])

Least Squares Model

In [8]:
import sklearn.linear_model as linear_model
least_squares_model = linear_model.LinearRegression()
least_squares_model.fit(X,Y)
Out[8]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Visualizing Least Squares and Empirical Averages

In [9]:
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
ind_mal = least_squares_model.predict(X) > 0.5
X_plt = np.linspace(np.min(X), np.max(X), 1000)
model_line = go.Scatter(name="Least Squares",
    x=X_plt, y=least_squares_model.predict(np.array([X_plt]).T), 
    mode="lines", line=dict(color="orange"))

mal_points = go.Scatter(name="Classified as Malignant", 
                    x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign", 
                    x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="blue"))
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
dec_line = go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="black", dash="dot"))

p_emp_line = go.Scatter(name="Empirical Probabilities",
    x=X_slices, y=P_emp, 
    mode="lines", line=dict(color="green", width=8))
py.iplot(go.Figure(data=[mal_points, ben_points, model_line, p_emp_line, dec_line], layout=tumor_layout))

Logistic Activation Function

The logistic function (sometimes called the sigmoid function) is commonly used to transform a real number into a value that can be interpreted as a probability. The logistic function has some interesting mathematical properties, but let's start with its form:

$$\large \sigma(t) = \frac{1}{1 + e^{-t}} $$

In [10]:
def sigmoid(t):
    return 1. / (1. + np.exp(-t))
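As a quick aside (not part of the original lecture code), we can numerically check two handy identities of the logistic function that we will implicitly rely on later: $\sigma(-t) = 1 - \sigma(t)$ and $\frac{d}{dt}\sigma(t) = \sigma(t)\left(1 - \sigma(t)\right)$. The sketch below simply verifies them with the sigmoid function defined above.

t_check = np.linspace(-5, 5, 11)
# sigma(-t) = 1 - sigma(t)
print(np.allclose(sigmoid(-t_check), 1 - sigmoid(t_check)))
# d/dt sigma(t) = sigma(t) * (1 - sigma(t)), checked with a central difference
dt = 1e-6
numeric_grad = (sigmoid(t_check + dt) - sigmoid(t_check - dt)) / (2 * dt)
print(np.allclose(numeric_grad, sigmoid(t_check) * (1 - sigmoid(t_check))))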
In [11]:
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
    x=t, y=sigmoid(t), mode="lines")
py.iplot([sigmoid_line])

Interactive Visualization

The following plots $\sigma(w t)$ for different $w$ values:

In [12]:
t = np.linspace(-5,5,100)

lines = []
for w in np.linspace(0.5,10,20):
    line = go.Scatter(name=str(w), x=t, y=sigmoid(w*t), mode="lines", visible=False)
    lines.append(line)

lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    
sliders = [dict(active = 0, currentvalue = {"prefix": "scale term: "}, steps = steps)]

# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]), 
                   yaxis=dict(range=[-0.5 , 1.5]),
                   showlegend=False,
                   sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))

The following plots $\sigma(5t + b)$ for different $b$ values:

In [13]:
t = np.linspace(-5,5,100)

lines = []
for b in np.linspace(-10,10,20):
    line = go.Scatter(name=str(b), x=t, y=sigmoid(5*t + b), mode="lines", visible=False)
    lines.append(line)

lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    
sliders = [dict(active = 0, currentvalue = {"prefix": "offset term: "}, steps = steps)]

# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]), 
                   yaxis=dict(range=[-0.5 , 1.5]),
                   showlegend=False,
                   sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))

Why not the squared loss

We have frequently used the squared loss. Below we plot the squared loss using the logistic regression model:

$$\Large L(\theta) = \frac{1}{n}\sum_{i=1}^n \left( y_i - \sigma\left( \phi(x_i)^T \theta\right) \right)^2 $$

Unfortunately, when composed with the logistic model the squared loss is not convex in $\theta$. We can see this by simply plotting it:

In [14]:
ytmp = np.array([0, 0,1,0,1, 1])
xtmp = np.array([-4, -2,-0.5,1,3,5])

plt.plot(xtmp, ytmp, 'o')
plt.xlabel("X")
plt.ylabel("Y");
plt.savefig("toydata.pdf", bbox_inches="tight");
In [15]:
def l2loss(theta):
    return ((ytmp - sigmoid(theta * xtmp))**2).mean()
thetas = np.linspace(-5,5,100)
losses = np.array([l2loss(t) for t in thetas])
plt.plot(thetas,losses)
plt.xlabel(r"$\theta$")
plt.ylabel(r"$L(\theta)$");
plt.savefig("squaredloss.pdf", bbox_inches="tight")

KL Divergence

The Kullback–Leibler (KL) Divergence between two distributions is given by:

$$ \Large \textbf{D}(P || Q) = \sum_{k=1}^K P(k) \log \left( \frac{P(k)}{Q(k)} \right) $$

The following computes the KL divergence between two categorical distributions:

In [16]:
def kl_divergence(p, q):
    return np.sum(p * np.log(p / q))

Evaluating two toy distributions

In [17]:
p = np.array([1,1,2,3,2,1,0,0,0,1,2,1,1,0,0]) + 0.1
p = p /p.sum()
q = np.array([0,0,1,1,3,2,2,5,2,1,1,1,0,1,1])  + 0.1
q = q /q.sum()

Visualizing the KL divergence:

In [18]:
plt.figure(figsize=(10,8))
plt.subplot(3,1,1)
classes = np.arange(len(p)) + 1
plt.bar(classes, p)
plt.ylabel("P")
plt.xticks(classes);

plt.subplot(3,1,2)
plt.bar(classes, q);
plt.ylabel("Q")
plt.xticks(classes);

plt.subplot(3,1,3)
plt.bar(classes, p * np.log(p/q), color='r')
plt.xticks(classes);
plt.ylabel(r"$p \, \log\left(\frac{p}{q}\right)$")
plt.savefig('pvq.pdf', bbox_inches='tight')

Notice that the contribution to the KL divergence is large where P is large and does not match Q. Where P is small (unlikely events), Q does not need to match P closely.
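Note also that the KL divergence is not symmetric in its arguments. A quick check on the toy distributions above (reusing p, q, and kl_divergence from the cells above) makes this concrete:

print("D(P||Q) =", kl_divergence(p, q))
print("D(Q||P) =", kl_divergence(q, p))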




Cross Entropy Loss

The cross entropy loss can be obtained by minimizing the KL divergence between the observed conditional probabilities and the predicted conditional probabilities:

$$\Large \arg \min_\theta -\frac{1}{n} \sum_{i=1}^n \sum_{k=1}^K \mathbf{P}\left(y_i = k \, | \, x_i\right)\log \left( \hat{\mathbf{P}}_\theta \left(y_i = k \, | \, x_i\right)\right) $$

For the Logistic binary classification model this reduces to:

$$\Large \arg \min_\theta -\frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i)^T \theta + \log \left(\sigma\left(-\phi(x_i)^T \theta\right) \right) \right) $$

Which we implement below:

In [19]:
def cross_entropy(theta):
    return np.mean(-ytmp * theta *xtmp  - np.log(sigmoid(-theta * xtmp)))

Plotting this loss for the above toy example we get:

In [20]:
thetas = np.linspace(-5,5,100)
losses = np.array([cross_entropy(t) for t in thetas])
plt.plot(thetas,losses)
plt.xlabel(r"$\theta$")
plt.ylabel(r"$L(\theta)$");
plt.savefig("crossent.pdf", bbox_inches='tight')

Minimizing the Loss

There is no closed-form solution that minimizes this loss. We therefore use gradient descent.





Gradient Descent

Recall the algorithm consists of iteratively computing:

$$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \rho(t) \left(\nabla_\theta \mathbf{L}(\theta) \biggr\rvert_{\theta = \theta^{(t)}}\right) $$

where $\rho(t)$ (sometimes called the learning rate) is typically:

$$ \large \rho(t) = \frac{1}{t} $$

Gradient Descent for Logistic Regression

The gradient of the loss is given by: $$\large \nabla_\theta \textbf{L}(\theta) = \frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i) $$

The logistic regression update equation is then: $$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \frac{1}{t} \left(\frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i)\right) $$

In [21]:
def gradient_descent(X, Y, theta0, gradient_function, max_iter = 1000000,  
                     epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1./t
        grad = gradient_function(theta, X, Y)
        theta = theta - rho * grad
        # Track Convergence
        if np.linalg.norm(grad) < epsilon:
            return theta
    return theta

Convergence: When to stop iterating

The gradient descent algorithm is typically run until one of the following conditions is met:

  1. $\theta$ stops changing (by more than epsilon); a sketch of this check appears after the list
  2. a maximum number of iterations is reached (i.e., the user gets impatient)
  3. the gradient of the objective is zero (this is always a stopping condition, why?)
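As a small illustration of condition 1, here is a minimal variant of the solver above that stops when $\theta$ changes by less than epsilon between iterations. This is a sketch under the same setup as gradient_descent above, not the implementation used in the rest of this notebook.

def gradient_descent_theta_stop(X, Y, theta0, gradient_function,
                                max_iter=1000000, epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1. / t
        new_theta = theta - rho * gradient_function(theta, X, Y)
        # Condition 1: theta stopped changing (by more than epsilon)
        if np.linalg.norm(new_theta - theta) < epsilon:
            return new_theta
        theta = new_theta
    return theta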

Defining $\nabla_\theta \textbf{L}(\theta) $

In the following we implement the gradient function for a dataset X and Y.

In [22]:
def gradL(theta, X, Y):
    # Gradient of the logistic loss.  Note: this returns the *sum* over data
    # points; the formula above includes a 1/n factor, which only rescales the
    # gradient and is absorbed by the step size.
    return (X.T @ (sigmoid(X @ theta) - Y))

In the following we standardize the features for numerical stability. Note that because of the exponential, extreme feature values can cause numerical issues.

In [23]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
standardizer.fit(X)
Xst = standardizer.transform(X)

We will also add a constant (bias or offset) term.

In [24]:
Phi = np.hstack([Xst,np.ones([len(Xst),1])])

Run the batch gradient descent solver

In [25]:
theta_batch = gradient_descent(Phi, Y, np.ones(2), gradL)
theta_batch
Out[25]:
array([ 3.30903971, -0.63546791])

Comparing with SK-Learn

In practice you will again use libraries with optimized implementations of these algorithms. However it is worth knowing how they work.

In [26]:
from sklearn.linear_model import LogisticRegression
lr_basic = LogisticRegression(fit_intercept=False, C=100.00)
# 1) We already added an intercept to phi
# 2) C = 1/lambda the inverse regularization parameter. 
#    by default SK learn adds regularization (we made it small)
lr_basic.fit(Phi, Y)
lr_basic.coef_
Out[26]:
array([[ 3.30482582, -0.63534454]])

The solution is slightly different because the SKLearn logistic regression model automatically adds regularization. This ensures a more stable solution and enables more advanced optimization algorithms to perform reliably.
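We can sanity-check this explanation with a small hedged aside (reusing Phi, Y, and theta_batch from the cells above): as C = 1/λ grows, the regularization penalty vanishes and the scikit-learn solution should approach the gradient descent solution.

from sklearn.linear_model import LogisticRegression
for C in [0.1, 1.0, 100.0, 1.0e6]:
    m = LogisticRegression(fit_intercept=False, C=C, solver="lbfgs")
    m.fit(Phi, Y)
    # Distance from the unregularized gradient descent solution shrinks as C grows
    print(C, m.coef_[0], np.linalg.norm(m.coef_[0] - theta_batch))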

Examining the solution

In the following we plot the logistic regression probability model and the corresponding decision boundary. To compute the decision boundary, recall from the model definition that $\textbf{P}(Y = 1 | X) = \sigma(x^T \theta)$, so $\textbf{P}(Y = 1 | X) > 0.5$ exactly when $\sigma(x^T \theta) > 0.5$, i.e., when $x^T \theta > \sigma^{-1}(0.5) = 0$. Note, however, that we standardized the features in $X$ to construct $\Phi$, so after solving for $X$ at the decision boundary we need to apply the inverse standardization (multiply by the standard deviation and add back the mean).

In [27]:
Phi_plt = np.hstack([
    standardizer.transform(np.array([X_plt]).T), 
    np.ones((len(X_plt),1))])

# The logistic regression decision boundary is when the underlying linear
# model passes through zero
lr_dec_boundary = (0.0 - theta_batch[1])/theta_batch[0]
lr_dec_boundary = standardizer.inverse_transform([lr_dec_boundary])[0]
lr_dec_line = go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="red", dash="dot"))

lr_line = go.Scatter(name="Gradeint Descent Logistic Regression",
    x=X_plt, y=sigmoid(Phi_plt @ theta_batch), 
    mode="lines", line=dict(color="orange", width=4))
py.iplot(go.Figure(data=[points, lr_line, dec_line, lr_dec_line], layout=tumor_layout))

Adding the decision boundaries to our earlier histograms:

In [28]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
py.iplot(plt)

Comparison in Accuracy

In [29]:
from sklearn.metrics import zero_one_loss
print("Least Squares Training Prediction Error:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Logistic Regression Prediction Error:", 
      zero_one_loss(Y, sigmoid(Phi @ theta_batch) > 0.5))
Least Squares Training Prediction Error: 0.1338028169014085
Logistic Regression Prediction Error: 0.12910798122065725

What does the loss surface look like?

Let's examine the loss function surface and our solution. Recall that the loss function is the negative of the log-likelihood (minimize loss = maximize likelihood) and has the following form:

$$\Large L(\theta) = -\frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i)^T \theta + \log \left(\sigma\left(-\phi(x_i)^T \theta\right) \right) \right) $$

We can implement this in the following python expression:

In [30]:
def lr_loss(theta, Phi, Y):
    t = Phi @ theta
    # Per-point loss: -(y*t + log(sigma(-t))) = log(1 + exp(t)) - y*t,
    # using the identity log(sigma(-t)) = -log(1 + exp(t)).
    return -np.mean(Y * t - np.log(1 + np.exp(t)), axis=0)

By evaluating the loss at a grid of points we can construct the loss surface:

In [31]:
uvalues = np.linspace(1,8,70)
vvalues = np.linspace(-5,5,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = np.array([lr_loss(t, Phi, Y) for t in thetas.T])
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
        x=u, y=v, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
    )
optimal_batch_lr_point = go.Scatter3d(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]], 
        z = [lr_loss(theta_batch, Phi, Y)],
        marker=dict(size=10, color="red")
    )
py.iplot(go.Figure(data=[lr_loss_surface, optimal_batch_lr_point]))

Visualizing the gradient descent path

In the following we plot the solution path taken by gradient descent.

However, to make the plot easier to visualize (so the solver converges in fewer, more consistent iterations), I introduced gradient clipping: the gradient norm is constrained to never be larger than one. This prevents the solution from exploding in the first few iterations. For those interested in optimization techniques, this is closely related to normalized (clipped) gradient methods.

In [32]:
def batch_gd_thetas(X, Y, theta0, grad, max_iter = 1000000,  
                    epsilon=0.01, clipping=True, rho0 = 1.):
    theta = theta0
    thetas = [theta0]
    n = len(X)
    for t in range(1, max_iter):
        rho = rho0 / t
        g = grad(theta, X, Y)
        # To make the solution (and plot) more stable 
        # I have constrained the gradient step size to
        # the unit ball.  This is sometimes called 
        # gradient clipping.
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
        # Track Convergence
        if np.linalg.norm(g) < epsilon:
            return np.array(thetas)
    return np.array(thetas)
In [33]:
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL)

thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
                          mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True
    )

optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]],
        marker=dict(size=10, color="red")
    )

py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]))

Taking a Bigger Initial Step

In [34]:
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL, rho0=5.0)

thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
                          mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True
    )

optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]],
        marker=dict(size=10, color="red")
    )

py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point], 
                   layout=dict()))

Stochastic Gradient Descent

In the following we implement stochastic gradient descent (SGD) to solve the logistic regression problem. Afterwards we will use much more efficient and robust libraries (and you should do the same); however, we implement SGD here for its pedagogical value.

In [35]:
def sgd(X, Y, theta0, grad, max_iter=5000, batch_size=50, epsilon=0.0001):
    theta = theta0
    n = len(X)
    for t in range(1, max_iter):
        rho = 1./t
        # Cycle through the data and rescale the mini-batch gradient so that it
        # estimates the full-data gradient (batches near the end may be shorter).
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        # Clip the gradient to the unit ball for stability.
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        # Periodically check convergence on the full gradient.  (With the default
        # max_iter=5000 this check never fires, so the loop runs all iterations.)
        if (t % 10000) == 0:
            if np.linalg.norm(grad(theta, X, Y)) < epsilon:
                return theta
    return theta

Computing the theta values:

In [36]:
theta_sgd = sgd(Phi, Y, np.array([2.5, 2.5]), gradL)
theta_sgd
Out[36]:
array([ 3.29127697, -0.71010776])

Visualizing the SGD Solution Path

Plotting the solution path

In [37]:
def sgd_thetas(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001,
              clipping=True, rho0 = 1.):
    theta = theta0
    n = len(X)
    thetas = [theta]
    for t in range(1, max_iter):
        rho = rho0/t
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
    return np.array(thetas)
In [38]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL)
In [39]:
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")

Taking a larger initial step

In [40]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5.)
In [41]:
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
In [42]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5., batch_size=100)
In [43]:
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")

Comparing the Solutions

In [44]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign','malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))

lr_sgd_dec_boundary = (0.0 - theta_sgd[1])/theta_sgd[0]
lr_sgd_dec_boundary = standardizer.inverse_transform([lr_sgd_dec_boundary])[0]
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
py.iplot(plt, filename="lr-14")

Regularization and Separability

The built-in scikit-learn logistic regression models use $L^2$ regularization by default (a good idea).

  1. Why might regularization be necessary?
  2. What happens if the data are separable: there is a plane that separates one class from another?






In [45]:
Xtoy = np.hstack([-np.random.rand(20) - 0.5, np.random.rand(20) + 0.5])
Ytoy = np.hstack([np.zeros(20), np.ones(20)])
Phi_toy = np.hstack([np.array([Xtoy]).T, np.ones([len(Xtoy),1])])
In [46]:
toy_theta = gradient_descent(Phi_toy, Ytoy, 
                             np.array([1,-1]), gradL,
                             epsilon=0.0001)
toy_theta
Out[46]:
array([13.84341586,  5.12937026])

What happens if the data are separable?

In [47]:
X_toy_plt = np.hstack([
    np.array([np.linspace(-2,2,100)]).T, 
    np.ones((100,1))])
py.iplot([
    go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="blue")),
    go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="red")),
    go.Scatter(name="Logistic Regression",
    x=np.linspace(-2,2,100), y=sigmoid(X_toy_plt @ toy_theta), 
    mode="lines", line=dict(color="green", width=4))
], filename="lr-15")

What if we make theta bigger?

In [48]:
bigger_toy_theta = toy_theta*10
In [49]:
X_toy_plt = np.hstack([
    np.array([np.linspace(-2,2,200)]).T, 
    np.ones((200,1))])
py.iplot([
    go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="blue")),
    go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="red")),
    go.Scatter(name="Logistic Regression",
                x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ toy_theta), 
                mode="lines", line=dict(color="green", width=4)),
    go.Scatter(name="Logistic Regression Bigger Theta",
        x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ bigger_toy_theta), 
    mode="lines", line=dict(color="red", width=4))
], filename="lr-15")
In [50]:
lr_loss(bigger_toy_theta, Phi_toy, Ytoy)
Out[50]:
3.3306690738754676e-17
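This illustrates the separability issue: because the toy data are separable and every training point ends up on the correct side of the boundary, scaling $\theta$ up only drives the loss closer to zero, so the unregularized optimum pushes $\|\theta\| \to \infty$. The sketch below (a hedged aside reusing toy_theta, Phi_toy, Ytoy, and lr_loss from the cells above) makes this concrete and also shows that the $L^2$ regularization used by scikit-learn keeps the coefficients bounded.

# On separable data the loss keeps shrinking as we scale theta up.
for scale in [1, 2, 5, 10, 20]:
    print(scale, lr_loss(scale * toy_theta, Phi_toy, Ytoy))

# With L2 regularization (the scikit-learn default) the coefficients stay bounded.
from sklearn.linear_model import LogisticRegression
for C in [0.1, 1.0, 100.0]:
    m = LogisticRegression(fit_intercept=False, C=C, solver="lbfgs").fit(Phi_toy, Ytoy)
    print(C, m.coef_[0])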

Scikit-Learn Logistic Regression

We will use the built-in cross-validation support for logistic regression in scikit-learn.

In [51]:
lr = linear_model.LogisticRegressionCV(100)
lr.fit(X,Y)
Out[51]:
LogisticRegressionCV(Cs=100, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

Plotting the predicted probability

In [52]:
lr_line = go.Scatter(name="Scikit-LR",
    x=X_plt, y=lr.predict_proba(np.array([X_plt]).T).T[1], 
    mode="lines", line=dict(color="green", width=4))
py.iplot([points, lr_line], filename="lr-16")
In [53]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
lr_sk_dec_boundary = (0.0 - lr.intercept_[0])/lr.coef_[0,0]

plt.data.append(go.Scatter(name="SK Logistic Reg. Dec. Boundary", 
                      x = [lr_sk_dec_boundary,lr_sk_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="pink", dash="dot")))

py.iplot(plt)

LR Prediction accuracy

Comparing the prediction accuracy with least squares we see a bit of an improvement.

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

lr_errors = cross_val_score(linear_model.LogisticRegression(), X, Y, 
                scoring=make_scorer(zero_one_loss), cv=5)

print("LR Error:", np.mean(lr_errors))
LR Error: 0.12940199335548172

Compared to other models:

In [55]:
from sklearn.metrics import zero_one_loss
print("Least Squares Error:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
Least Squares Error: 0.1338028169014085
In [56]:
print("Always predict 0:", zero_one_loss(Y, np.zeros(Y.shape)))
Always predict 0: 0.37089201877934275

Multiclass Prediction & Feature Engineering

We temporarily return to a simple synthetic dataset.

In [57]:
np.random.seed(42)
nk = 50
Xmc = np.vstack([
    np.random.randn(nk,2) + np.array([3,1]),
    np.random.randn(nk,2) + np.array([1,3]),
    0.5 * np.random.randn(nk,2) ,
    0.5 * np.random.randn(nk,2) + np.array([3,3])
])
Ymc = np.hstack([np.zeros(nk), np.ones(nk), 2*np.ones(nk),2*np.ones(nk)])

blue_points = go.Scatter(name = "A", x=Xmc[Ymc==0,0], y=Xmc[Ymc==0,1], 
               mode="markers", marker=dict(opacity=0.5, color="blue"))
red_points  = go.Scatter(name = "B", x=Xmc[Ymc==1,0], y=Xmc[Ymc==1,1], 
               mode="markers", marker=dict(opacity=0.5, color="red"))
orange_points = go.Scatter(name = "C", x=Xmc[Ymc==2,0], y=Xmc[Ymc==2,1], 
               mode="markers", marker=dict(opacity=0.5, color="orange"))



py.iplot([blue_points, red_points, orange_points], filename="lr-18")

Train a Soft-Max Classifier

Here we train a soft-max classifier. Currently scikit-learn requires a solver such as L-BFGS for the soft-max (multinomial) objective. L-BFGS is a quasi-Newton method that uses both the gradient and a low-rank approximation of the Hessian (the matrix of second derivatives) to solve for the optimal parameters.
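As an aside (a sketch, not part of the lecture), the same L-BFGS family of solvers is available directly through scipy. For instance, we could minimize the binary logistic loss lr_loss(theta, Phi, Y) defined earlier (reusing Phi, Y, and theta_batch from the cells above) and compare against the gradient descent solution:

from scipy.optimize import minimize
res = minimize(lr_loss, np.zeros(2), args=(Phi, Y), method="L-BFGS-B")
print(res.x)        # L-BFGS solution
print(theta_batch)  # batch gradient descent solution from earlier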

In [58]:
lr = linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs")
lr.fit(Xmc,Ymc)
Out[58]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Plotting the Decision Surface

In the following we plot the decision surface generated by the basic multiclass logistic regression model. Notice the limitations of the linear decision surface.

In [59]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.vstack((u.flatten(),v.flatten())).T
label = lr.predict(coords)
label_contour = go.Contour(
    name="Label ContourSurace",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]))

Feature Engineering with Gaussian RBFs

To capture the non-linear structure we randomly sample 20 data points and create an RBF feature centered at each of them.

In [60]:
def gaussian_rbf(us, lam=1):
    return lambda x: np.array([np.exp(-np.linalg.norm(x - u)**2 / lam**2) for u in us])

num_basis = 20
np.random.seed(42)
rbf_features = gaussian_rbf(Xmc[np.random.choice(range(len(Xmc)), num_basis, replace=False),:])
Phimc = np.array([rbf_features(x) for x in Xmc])

lr_rbf = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_rbf.fit(Phimc,Ymc)
Out[60]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Fitting the logistic regression model on these features, we get a decision surface that is non-linear in the original space but linear in the higher-dimensional transformed space.

In [61]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
label = lr_rbf.predict(coords)
label_contour = go.Contour(
    name="Label ContourSurace",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]), 
         filename="lr-20")
In [62]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
prb = lr_rbf.predict_proba(coords)
In [63]:
from plotly import tools
fig = tools.make_subplots(rows=1, cols=3,
                          specs=[[{'is_3d': True}, {'is_3d': True}, {'is_3d': True}]])

for i in range(3):
    surf = go.Surface(name="Class " + str(i+1),
                      x=u, y=v, z=np.reshape(prb[:,i],(len(uvalues), len(vvalues))), 
                      showscale = False)
    fig.append_trace(surf, 1, i+1)
# fig['layout'].update(height=600, width=600, title='i <3 subplots')
py.iplot(fig)
This is the format of your plot grid:
[ (1,1) scene1 ]  [ (1,2) scene2 ]  [ (1,3) scene3 ]