In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk")
%matplotlib inline
In [2]:
import plotly.offline as py
py.init_notebook_mode(connected=False)
In [3]:
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf

cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

Logistic Regression

This notebook accompanies the lecture on Logistic Regression.

Notebook created by Joseph E. Gonzalez for DS100.

Real Data

For this lecture we will use the well-known Wisconsin Breast Cancer Dataset, which we can obtain from scikit-learn.

In [4]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# In data_dict['target'], 0 is malignant and 1 is benign
data['malignant'] = (data_dict['target'] == 0)

Plotting the binary malignant label directly against the mean radius results in severe over-plotting. We can improve the plot by jittering the labels:

In [5]:
jitter_y = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) -0.05
points = go.Scatter(x=data['mean radius'], y = jitter_y, mode="markers", marker=dict(opacity=0.5))
tumor_layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
py.iplot(go.Figure(data=[points], layout=tumor_layout))

Perhaps a better way to visualize the data is using stacked histograms.

In [6]:
py.iplot(ff.create_distplot([data.loc[~data['malignant'], 'mean radius'],
                            data.loc[data['malignant'], 'mean radius']], 
                            group_labels=["Benign","Malignant"],
                           bin_size=0.5))

Always split your data into training and test groups.

Preparing the Data: Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.25, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Training Data Size:  426
Test Data Size:  143

We will define $X$ and $Y$ as variables containing the training features and labels.

In [8]:
X = data_tr[['mean radius']].values
Y = data_tr['malignant'].values.astype('float')








Empirical Probability

What does the empirical probability of being malignant for different sizes look like?

In [9]:
X_slices = np.linspace(8, 24, 10)
P_emp = np.array([
    np.mean(Y[np.squeeze(np.abs(X - c)) < 2.0]) for c in X_slices
])

Least Squares Model

In [10]:
import sklearn.linear_model as linear_model
least_squares_model = linear_model.LinearRegression()
least_squares_model.fit(X,Y)
Out[10]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Visualizing Least Squares and Empirical Averages

In [11]:
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
ind_mal = least_squares_model.predict(X) > 0.5
X_plt = np.linspace(np.min(X), np.max(X), 1000)
model_line = go.Scatter(name="Least Squares",
    x=X_plt, y=least_squares_model.predict(np.array([X_plt]).T), 
    mode="lines", line=dict(color="orange"))

mal_points = go.Scatter(name="Classified as Malignant", 
                    x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign", 
                    x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="blue"))
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
dec_line = go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="black", dash="dot"))

p_emp_line = go.Scatter(name="Empirical Probabilities",
    x=X_slices, y=P_emp, 
    mode="lines", line=dict(color="green", width=8))
py.iplot(go.Figure(data=[mal_points, ben_points, model_line, p_emp_line, dec_line], layout=tumor_layout))

Logistic Activation Function

The logistic function (sometimes called the sigmoid function) is commonly used to transform a real number into a value that can be used to encode a probability. The logistic function has some interesting mathematical properties, but let's start with its form:

$$\large \sigma(t) = \frac{1}{1 + e^{-t}} $$
In [12]:
def sigmoid(t):
    return 1. / (1. + np.exp(-t))
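
A property worth noting (we will use it when simplifying the loss below) is that $\sigma(-t) = 1 - \sigma(t)$. A quick numerical sanity check, as a sketch:

In [ ]:
# Check that sigma(-t) == 1 - sigma(t) up to floating point error
t_check = np.linspace(-5, 5, 11)
np.allclose(sigmoid(-t_check), 1 - sigmoid(t_check))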
In [13]:
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
    x=t, y=sigmoid(t), mode="lines")
py.iplot([sigmoid_line])

Interactive Visualization

The following plots $\sigma(w t)$ for different $w$ values:

In [14]:
t = np.linspace(-5,5,100)

lines = []
for w in np.linspace(0.5,10,20):
    line = go.Scatter(name=str(w), x=t, y=sigmoid(w*t), mode="lines", visible=False)
    lines.append(line)

lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    
sliders = [dict(active = 0, currentvalue = {"prefix": "scale term: "}, steps = steps)]

# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]), 
                   yaxis=dict(range=[-0.5 , 1.5]),
                   showlegend=False,
                   sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))

The following plots $\sigma(5t + b)$ for different $b$ values:

In [15]:
t = np.linspace(-5,5,100)

lines = []
for b in np.linspace(-10,10,20):
    line = go.Scatter(name=str(b), x=t, y=sigmoid(5*t + b), mode="lines", visible=False)
    lines.append(line)

lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True # Toggle i'th trace to "visible"
    steps.append(step)
    
sliders = [dict(active = 0, currentvalue = {"prefix": "offset term: "}, steps = steps)]

# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]), 
                   yaxis=dict(range=[-0.5 , 1.5]),
                   showlegend=False,
                   sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))

Gradient Descent

Recall the algorithm consists of iteratively computing:

$$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \rho(t) \left(\nabla_\theta \mathbf{L}(\theta) \biggr\rvert_{\theta = \theta^{(t)}}\right) $$

where $\rho(t)$ (sometimes called the learning rate) is typically:

$$ \large \rho(t) = \frac{1}{t} $$

Gradient Descent for Logistic Regression

The loss function for logistic regression is:

\begin{align} \large\textbf{L}(\theta) &= \large-\frac{1}{n}\sum_{i=1}^n \left[ y_i \log\left(\sigma(\phi(x_i)^T \theta) \right) + (1-y_i) \log\left(1 - \sigma(\phi(x_i)^T\theta)\right)\right]\\ & \large = -\frac{1}{n}\sum_{i=1}^n \left[ y_i \phi(x_i)^T \theta + \log\left(\sigma(-\phi(x_i)^T\theta) \right)\right] \end{align}

where the second equality uses $1 - \sigma(t) = \sigma(-t)$ and $\log\frac{\sigma(t)}{\sigma(-t)} = t$.

The gradient of the loss is given by: $$\large \nabla_\theta \textbf{L}(\theta) = \frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i) $$

The logistic regression update equation is then: $$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \frac{1}{t} \left(\frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i)\right) $$

In [16]:
def gradient_descent(X, Y, theta0, gradient_function, max_iter = 1000000,  
                     epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1./t
        grad = gradient_function(theta, X, Y)
        theta = theta - rho * grad
        # Track Convergence
        if np.linalg.norm(grad) < epsilon:
            return theta
    return theta

Convergence: When to stop iterating

The gradient descent algorithm is typically run until one of the following conditions is met:

  1. $\theta$ stops changing by more than epsilon (see the sketch after this list)
  2. a maximum number of iterations is reached (i.e., the user gets impatient)
  3. the gradient of the objective is zero (this is always a stopping condition, why?)
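
The implementation above stops when the gradient norm drops below epsilon (criterion 3) or the iteration budget runs out (criterion 2). A minimal sketch of criterion 1, stopping once $\theta$ itself stops moving, might look like the following (an illustrative variant under the same setup, not the version used in the rest of this notebook):

In [ ]:
def gd_stop_on_theta_change(X, Y, theta0, gradient_function,
                            max_iter=1000000, epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1. / t
        new_theta = theta - rho * gradient_function(theta, X, Y)
        # Stop once the parameters move by less than epsilon.  Note that with
        # rho = 1/t the steps shrink toward zero, so this criterion can fire
        # even if we have not reached a minimum.
        if np.linalg.norm(new_theta - theta) < epsilon:
            return new_theta
        theta = new_theta
    return theta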

Defining $\nabla_\theta \textbf{L}(\theta) $

In the following we implement the gradient function for a dataset X and Y.

In [17]:
def gradL(theta, X, Y):
    # Note: this returns the sum of the per-example gradients rather than the
    # average (the 1/n above); with the 1/t learning rate this only rescales
    # the effective step size and does not change the minimizer.
    return (X.T @ (sigmoid(X @ theta) - Y)) 

In the following we standardize the features for numerical stability. Note that, because of the exponential, extreme values can cause numerical problems.

In [18]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
standardizer.fit(X)
Xst = standardizer.transform(X)

We will also add a constant (bias or offset) term.

In [19]:
Phi = np.hstack([Xst,np.ones([len(Xst),1])])

Run the batch gradient descent solver

In [20]:
theta_batch = gradient_descent(Phi, Y, np.ones(2), gradL)
theta_batch
Out[20]:
array([ 3.30903971, -0.63546791])

Comparing with SK-Learn

In practice you will again use libraries with optimized implementations of these algorithms. However, it is worth knowing how they work.

In [21]:
from sklearn.linear_model import LogisticRegression
lr_basic = LogisticRegression(fit_intercept=False, C=100.00)
# 1) We already added an intercept column to Phi
# 2) C = 1/lambda is the inverse regularization strength; scikit-learn adds
#    L2 regularization by default, so we make it weak by choosing a large C.
lr_basic.fit(Phi, Y)
lr_basic.coef_
Out[21]:
array([[ 3.30482582, -0.63534454]])

The solution is slightly different because the scikit-learn logistic regression model adds regularization by default. This yields a more stable solution and allows the more advanced optimization algorithms to perform reliably.
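
As a quick check (a sketch; the exact numbers will vary slightly), weakening the regularization further should move the scikit-learn estimate even closer to the unregularized gradient descent solution:

In [ ]:
# With a very large C (i.e., very weak L2 regularization) the fitted
# coefficients should approach the gradient descent estimate theta_batch.
lr_weak = LogisticRegression(fit_intercept=False, C=1.0e6)
lr_weak.fit(Phi, Y)
print("Weakly regularized coef:", lr_weak.coef_)
print("Gradient descent theta: ", theta_batch)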

Examining the solution

In the following we plot the logistic regression probability model and the corresponding decision boundary. To compute the decision boundary, recall from the model definition that $\textbf{P}(Y = 1 | X) = \sigma(x^T \theta)$, so $\textbf{P}(Y = 1 | X) > 0.5$ exactly when $\sigma(x^T \theta) > 0.5$, i.e., when $x^T \theta > \sigma^{-1}(0.5) = 0$. Note, however, that we standardized the features in $X$ to construct $\Phi$, so after solving for $X$ at the decision boundary we need to apply the inverse standardization (multiply by the standard deviation and add back the mean).

In [22]:
Phi_plt = np.hstack([
    standardizer.transform(np.array([X_plt]).T), 
    np.ones((len(X_plt),1))])

# The logistic regression decision boundary is when the underlying linear
# model passes through zero
lr_dec_boundary = (0.0 - theta_batch[1])/theta_batch[0]
lr_dec_boundary = standardizer.inverse_transform([lr_dec_boundary])[0]
lr_dec_line = go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="red", dash="dot"))

lr_line = go.Scatter(name="Gradeint Descent Logistic Regression",
    x=X_plt, y=sigmoid(Phi_plt @ theta_batch), 
    mode="lines", line=dict(color="orange", width=4))
py.iplot(go.Figure(data=[points, lr_line, dec_line, lr_dec_line], layout=tumor_layout))

Adding the decision boundaries to our earlier histograms:

In [23]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
py.iplot(plt)

Comparison in Accuracy

In [24]:
from sklearn.metrics import zero_one_loss
print("Least Squares Training Prediction Error:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Logistic Regression Prediction Error:", 
      zero_one_loss(Y, sigmoid(Phi @ theta_batch) > 0.5))
Least Squares Training Prediction Error: 0.133802816901
Logistic Regression Prediction Error: 0.129107981221

What does the loss surface look like?

Let's examine the loss function surface and our solution. Recall that the loss function is the negative of the log-likelihood (minimize loss = maximize likelihood) and has the following form:

\begin{align} \large\textbf{L}(\theta) &= \large-\frac{1}{n}\sum_{i=1}^n \left[ y_i \log\left(\sigma(\phi(x_i)^T \theta) \right) + (1-y_i) \log\left(1 - \sigma(\phi(x_i)^T\theta)\right)\right]\\ & \large = -\frac{1}{n}\sum_{i=1}^n \left[ y_i \phi(x_i)^T \theta + \log\left(\sigma(-\phi(x_i)^T\theta) \right)\right] \end{align}

We can implement this with the following Python function:

In [25]:
def lr_loss(theta, Phi, Y):
    t = Phi @ theta
    return -np.mean(Y * t - np.log(1 + np.exp(t)), axis=0)
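
Before evaluating this on a grid, a quick sanity check (a sketch) that the simplified expression agrees with the original two-term cross-entropy form at the fitted parameters:

In [ ]:
def lr_loss_two_term(theta, Phi, Y):
    # The two-term form of the averaged negative log-likelihood
    p = sigmoid(Phi @ theta)
    return -np.mean(Y * np.log(p) + (1 - Y) * np.log(1 - p))

np.allclose(lr_loss(theta_batch, Phi, Y), lr_loss_two_term(theta_batch, Phi, Y))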

By evaluating the loss at a grid of points we can construct the loss surface:

In [26]:
uvalues = np.linspace(1,8,70)
vvalues = np.linspace(-5,5,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = np.array([lr_loss(t, Phi, Y) for t in thetas.T])
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
        x=u, y=v, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
    )
optimal_batch_lr_point = go.Scatter3d(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]], 
        z = [lr_loss(theta_batch, Phi, Y)],
        marker=dict(size=10, color="red")
    )
py.iplot(go.Figure(data=[lr_loss_surface, optimal_batch_lr_point]))

Visualizing the gradient descent path

In the following we plot the solution path taken by gradient descent.

However, to make the plot easier to visualize (the solver converges in fewer, more consistent iterations), I introduced gradient clipping: the gradient is rescaled so that its norm never exceeds one. This prevents the iterates from exploding in the first few iterations. For those interested in optimization techniques, this is closely related to projected and proximal gradient methods.

In [27]:
def batch_gd_thetas(X, Y, theta0, grad, max_iter = 1000000,  
                    epsilon=0.01, clipping=True, rho0 = 1.):
    theta = theta0
    thetas = [theta0]
    n = len(X)
    for t in range(1, max_iter):
        rho = rho0 / t
        g = grad(theta, X, Y)
        # To make the solution (and plot) more stable 
        # I have constrained the gradient step size to
        # the unit ball.  This is sometimes called 
        # gradient clipping.
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
        # Track Convergence
        if np.linalg.norm(g) < epsilon:
            return np.array(thetas)
    return np.array(thetas)
In [28]:
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL)

theta_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
                          mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True
    )

optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]],
        marker=dict(size=10, color="red")
    )

py.iplot(go.Figure(data=[lr_loss_contours, theta_points, optimal_batch_lr_point]))

Taking a Bigger Initial Step

In [29]:
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL, rho0=5.0)

theta_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
                          mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True
    )

optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]],
        marker=dict(size=10, color="red")
    )

py.iplot(go.Figure(data=[lr_loss_contours, theta_points, optimal_batch_lr_point], 
                   layout=dict()))

Stochastic Gradient Descent

In the following we implement stochastic gradient descent (SGD) to solve the logistic regression problem. Afterwards we use the much more efficient and robust library implementations (and you should do the same); we implement SGD here for its pedagogical value.

In [30]:
def sgd(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001):
    theta = theta0
    n = len(X)
    for t in range(1, max_iter):
        rho = 1./t
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        # Track Convergence: only check the full gradient every 10000 steps
        # (with the default max_iter=5000 this check never actually fires)
        if (t % 10000) == 0:
            if np.linalg.norm(grad(theta, X, Y)) < epsilon:
                return theta
    return theta

Computing the theta values:

In [31]:
theta_sgd = sgd(Phi, Y, np.array([2.5, 2.5]), gradL)
theta_sgd
Out[31]:
array([ 3.29127697, -0.71010776])

Visualizing the SGD Solution Path

Plotting the solution path

In [32]:
def sgd_thetas(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001,
              clipping=True, rho0 = 1.):
    theta = theta0
    n = len(X)
    thetas = [theta]
    for t in range(1, max_iter):
        rho = rho0/t
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
    return np.array(thetas)
In [33]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL)
In [34]:
theta_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, theta_points, optimal_batch_lr_point]), filename="lr-13")

Taking a larger initial step

In [35]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5.)
In [36]:
theta_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, theta_points, optimal_batch_lr_point]), filename="lr-13")
In [37]:
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5., batch_size=100)
In [38]:
theta_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, theta_points, optimal_batch_lr_point]), filename="lr-13")

Comparing the Solutions

In [39]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign','malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))

lr_sgd_dec_boundary = (0.0 - theta_sgd[1])/theta_sgd[0]
lr_sgd_dec_boundary = standardizer.inverse_transform([lr_sgd_dec_boundary])[0]
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
py.iplot(plt, filename="lr-14")

Regularization and Separability

The built-in scikit-learn logistic regression models use $L^2$ regularization by default (a good idea).

  1. Why might regularization be necessary?
  2. What happens if the data are separable, i.e., there is a plane that separates one class from the other?






In [40]:
Xtoy = np.hstack([-np.random.rand(20) - 0.5, np.random.rand(20) + 0.5])
Ytoy = np.hstack([np.zeros(20), np.ones(20)])
Phi_toy = np.hstack([np.array([Xtoy]).T, np.ones([len(Xtoy),1])])
In [41]:
toy_theta = gradient_descent(Phi_toy, Ytoy, 
                             np.array([1,-1]), gradL,
                             epsilon=0.0001)
toy_theta
Out[41]:
array([ 14.35557319,   3.85433065])

What happens if the data are separable?

In [42]:
X_toy_plt = np.hstack([
    np.array([np.linspace(-2,2,100)]).T, 
    np.ones((100,1))])
py.iplot([
    go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="blue")),
    go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="red")),
    go.Scatter(name="Logistic Regression",
    x=np.linspace(-2,2,100), y=sigmoid(X_toy_plt @ toy_theta), 
    mode="lines", line=dict(color="green", width=4))
], filename="lr-15")

What if we make theta bigger?

In [43]:
bigger_toy_theta = toy_theta*10
In [44]:
X_toy_plt = np.hstack([
    np.array([np.linspace(-2,2,200)]).T, 
    np.ones((200,1))])
py.iplot([
    go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="blue")),
    go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="red")),
    go.Scatter(name="Logistic Regression",
                x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ toy_theta), 
                mode="lines", line=dict(color="green", width=4)),
    go.Scatter(name="Logistic Regression Bigger Theta",
        x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ bigger_toy_theta), 
    mode="lines", line=dict(color="red", width=4))
], filename="lr-15")
In [45]:
lr_loss(bigger_toy_theta, Phi_toy, Ytoy)
Out[45]:
5.5511151231257822e-18
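
Because the toy data are separable, nothing stops $\theta$ from growing without bound: scaling it up only drives the loss closer to zero. As a sketch (assuming the default $L^2$ penalty), fitting a regularized model on the same data keeps the coefficients at a moderate scale:

In [ ]:
# With L2 regularization the penalized objective has a finite minimizer,
# so the fitted coefficients stay bounded even though the data are separable.
lr_toy = LogisticRegression(fit_intercept=False, C=1.0)
lr_toy.fit(Phi_toy, Ytoy)
lr_toy.coef_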

Scikit-Learn Logistic Regression

We will use the built-in cross-validation support for logistic regression in scikit-learn.

In [46]:
lr = linear_model.LogisticRegressionCV(100)
lr.fit(X,Y)
Out[46]:
LogisticRegressionCV(Cs=100, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)
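
LogisticRegressionCV selects the regularization strength by cross-validation; the chosen value is stored in the C_ attribute (a quick look, as a sketch):

In [ ]:
# Regularization strength selected by cross-validation (one value per class)
lr.C_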

Plotting the predicted probability

In [47]:
lr_line = go.Scatter(name="Scikit-LR",
    x=X_plt, y=lr.predict_proba(np.array([X_plt]).T).T[1], 
    mode="lines", line=dict(color="green", width=4))
py.iplot([points, lr_line], filename="lr-16")
In [48]:
plt = ff.create_distplot([
    data[~data['malignant']]['mean radius'],
    data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
lr_sk_dec_boundary = (0.0 - lr.intercept_[0])/lr.coef_[0,0]

plt.data.append(go.Scatter(name="SK Logistic Reg. Dec. Boundary", 
                      x = [lr_sk_dec_boundary,lr_sk_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="pink", dash="dot")))

py.iplot(plt)

LR Prediction accuracy

Comparing the prediction accuracy with least squares, we see a modest improvement.

In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

lr_errors = cross_val_score(linear_model.LogisticRegression(), X, Y, 
                scoring=make_scorer(zero_one_loss), cv=5)

print("LR Error:", np.mean(lr_errors))
LR Error: 0.129401993355

Compared to other models:

In [50]:
from sklearn.metrics import zero_one_loss
print("Least Squares Error:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
Least Squares Error: 0.133802816901
In [51]:
print("Always predict 0:", zero_one_loss(Y, np.zeros(Y.shape)))
Always predict 0: 0.370892018779

Multiclass Prediction & Feature Engineering

We temporarily return to a simple synthetic dataset.

In [52]:
np.random.seed(42)
nk = 50
Xmc = np.vstack([
    np.random.randn(nk,2) + np.array([3,1]),
    np.random.randn(nk,2) + np.array([1,3]),
    0.5 * np.random.randn(nk,2) ,
    0.5 * np.random.randn(nk,2) + np.array([3,3])
])
Ymc = np.hstack([np.zeros(nk), np.ones(nk), 2*np.ones(nk),2*np.ones(nk)])

blue_points = go.Scatter(name = "A", x=Xmc[Ymc==0,0], y=Xmc[Ymc==0,1], 
               mode="markers", marker=dict(opacity=0.5, color="blue"))
red_points  = go.Scatter(name = "B", x=Xmc[Ymc==1,0], y=Xmc[Ymc==1,1], 
               mode="markers", marker=dict(opacity=0.5, color="red"))
orange_points = go.Scatter(name = "C", x=Xmc[Ymc==2,0], y=Xmc[Ymc==2,1], 
               mode="markers", marker=dict(opacity=0.5, color="orange"))



py.iplot([blue_points, red_points, orange_points], filename="lr-18")

Train a Soft-Max Classifier

Here we train a soft-max classifier. Currently scikit-learn requires a solver such as L-BFGS for the soft-max (multinomial) objective. L-BFGS is a quasi-Newton method that uses both the gradient and a low-rank approximation of the Hessian (built from recent gradient differences) to solve for the optimal parameters.

In [53]:
lr = linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs")
lr.fit(Xmc,Ymc)
Out[53]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)
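
Under the soft-max (multinomial) model the class probabilities are $$\large \textbf{P}(Y = k \mid x) = \frac{\exp(x^T \theta_k)}{\sum_j \exp(x^T \theta_j)} $$ A quick sanity check (a sketch) that the fitted model's predicted probabilities are the soft-max of its linear scores:

In [ ]:
# For the multinomial model, predict_proba is the soft-max of the linear scores.
scores = Xmc @ lr.coef_.T + lr.intercept_
manual_probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
np.allclose(manual_probs, lr.predict_proba(Xmc))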

Plotting the Decision Surface

In the following we plot the corresponding decision surface generated by the basic multiclass logistic regression model. Notice the limitations of the linear decision boundaries.

In [54]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.vstack((u.flatten(),v.flatten())).T
label = lr.predict(coords)
label_contour = go.Contour(
    name="Label ContourSurace",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]))

Feature Engineering with Gaussian RBFs

To capture the non-linear structure we randomly sample 20 data points and create a Gaussian RBF feature centered at each of them; each feature has the form $\exp\left(-\|x - u\|^2 / \lambda^2\right)$ for a center $u$.

In [ ]:
def gaussian_rbf(us, lam=1):
    return lambda x: np.array([np.exp(-np.linalg.norm(x - u)**2 / lam**2) for u in us])

num_basis = 20
np.random.seed(42)
rbf_features = gaussian_rbf(Xmc[np.random.choice(range(len(Xmc)), num_basis, replace=False),:])
Phimc = np.array([rbf_features(x) for x in Xmc])

lr_rbf = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_rbf.fit(Phimc,Ymc)
Out[ ]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Fitting the logistic regression model on these features, we get a decision surface that is non-linear in the original space but linear in the higher-dimensional transformed feature space.

In [ ]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
label = lr_rbf.predict(coords)
label_contour = go.Contour(
    name="Label ContourSurace",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]), 
         filename="lr-20")
In [ ]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
prb = lr_rbf.predict_proba(coords)
In [ ]:
from plotly import tools
fig = tools.make_subplots(rows=1, cols=3,
                          specs=[[{'is_3d': True}, {'is_3d': True}, {'is_3d': True}]])

for i in range(3):
    surf = go.Surface(name="Class " + str(i+1),
                      x=u, y=v, z=np.reshape(prb[:,i],(len(uvalues), len(vvalues))), 
                      showscale = False)
    fig.append_trace(surf, 1, i+1)
# fig['layout'].update(height=600, width=600, title='i <3 subplots')
py.iplot(fig)