In [1]:
import numpy as np
import pandas as pd

# Plotly plotting support
import plotly.plotly as py
# import plotly.offline as py
# py.init_notebook_mode()

# import cufflinks as cf
# cf.go_offline() # required to use plotly offline (no account required).

import plotly.graph_objs as go
import plotly.figure_factory as ff

# Make the notebook deterministic 
np.random.seed(42)

Logistic Regression

This notebook accompanies the lecture on Logistic Regression.

Notebook created by Joseph E. Gonzalez for DS100.

Real Data

For this lecture we will use the famous Wisconsin Breast Cancer Dataset, which we can obtain from scikit-learn.

In [2]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# Target data_dict['target'] = 0 is malignant 1 is benign
data['malignant'] = (data_dict['target'] == 0)
In [3]:
data.columns
Out[3]:
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'malignant'],
      dtype='object')
In [4]:
points = go.Scatter(x=data['mean radius'], y = 1.*data['malignant'], mode="markers")
py.iplot([points], filename="lr-01")
Out[4]:

This is a clear example of over-plotting. We can improve the above plot by jittering the data:

In [5]:
jitter_y = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) -0.05
points = go.Scatter(x=data['mean radius'], y = jitter_y, mode="markers", marker=dict(opacity=0.5))
py.iplot([points], filename="lr-02")
Out[5]:

We can also get a picture of the data distribution:

In [6]:
py.iplot(ff.create_distplot([
    data[data['malignant']]['mean radius'],
    data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'],  bin_size=0.5), filename="lr-03")
Out[6]:

Question: Looking at the above histograms, could you describe a rule to predict whether or not a cell is malignant?









Least Squares Regression

"I suppose it is tempting, if the only tool you have is a hammer, to treat everything as if it were a nail." - Abraham Maslow The Psychology of Science

Goal: We would like to predict whether the tumor is malignant from the size of the tumor.

Always split your data into training and test groups.

Preparing the Data: Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split

data_tr, data_te = train_test_split(data, test_size=0.25, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Training Data Size:  426
Test Data Size:  143

We will define $X$ and $Y$ as variables containing the training features and labels.

In [8]:
X = np.array([data_tr['mean radius']]).T
Y = data_tr['malignant'].values.astype('float')

Fit a least squares regression model.

In [9]:
import sklearn.linear_model as linear_model

least_squares_model = linear_model.LinearRegression()
least_squares_model.fit(X,Y)
Out[9]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

How is our fit?

In [10]:
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
points = go.Scatter(name="Jittered Data", 
                    x=np.squeeze(X), y = jitter_y, 
                    mode="markers", marker=dict(opacity=0.5))
X_plt = np.linspace(np.min(X), np.max(X), 10)
model_line = go.Scatter(name="Least Squares",
    x=X_plt, y=least_squares_model.predict(np.array([X_plt]).T), 
    mode="lines", line=dict(color="orange"))
py.iplot([points, model_line], filename="lr-04")
Out[10]:

Questions:

  1. Are we happy with the fit?
  2. What is the meaning of predictions that are neither 0 nor 1?
  3. Could we use this to make a decision?






What is the Root Mean Squared Error?
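
Recall that, for predictions $f_\theta(x_i)$ and labels $y_i$:

$$\large \textbf{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^n \left(y_i - f_\theta(x_i)\right)^2 } $$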

In [11]:
from sklearn.metrics import mean_squared_error as mse
print("Training RMSE:", np.sqrt(mse(Y, least_squares_model.predict(X))))
Training RMSE: 0.338350570944

Cross Validation

Maybe we should also look at the cross validated error.

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

rmse_scores = np.sqrt(
    cross_val_score(linear_model.LinearRegression(), X, Y, 
                    scoring=make_scorer(mse), cv=5))

print("Min Validation Error:   ", np.min(rmse_scores))
print("Median Validation Error:", np.median(rmse_scores))
print("Max Validation Error:   ", np.max(rmse_scores))
Min Validation Error:    0.296672238481
Median Validation Error: 0.337995623696
Max Validation Error:    0.395474238206

Questions?

  1. Are we satisfied with this error value?
  2. Is this the right measure of error?







Classification Error

This is a classification problem, so we probably want to measure how often we predict the wrong value. This fraction of incorrect predictions is sometimes called the zero-one loss (or error):

$$ \large \textbf{ZeroOneLoss} = \frac{1}{n} \sum_{i=1}^n \textbf{I}\left[ y_i \neq f_\theta(x_i) \right] $$

However, to use the classification error we need to define a decision rule that maps $f_\theta(x)$ to the $\{0,1\}$ classification values.
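
Once we have such a rule producing hard $\{0,1\}$ predictions, the loss is simply the fraction of mismatches. As a quick sanity check, here is a minimal NumPy sketch on made-up labels and predictions (the arrays are illustrative only):

import numpy as np
from sklearn.metrics import zero_one_loss

# Hypothetical hard labels and predictions, for illustration only.
y_true = np.array([1., 0., 1., 1., 0.])
y_pred = np.array([1., 1., 1., 0., 0.])

# Direct implementation of the formula above: the fraction of mismatches.
print(np.mean(y_true != y_pred))      # 0.4
print(zero_one_loss(y_true, y_pred))  # 0.4, the same quantity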







Simple Decision Rule

Suppose we instituted the following simple decision rule:

`If` $f_\theta(x) > 0.5$ `predict 1 else predict 0`

This simple decision rule classifies a tumor as malignant if our model predicts a value above 0.5 (closer to 1 than to 0).

In the following we plot the implications of this decision rule on our training data.

In [13]:
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
ind_mal = least_squares_model.predict(X) > 0.5

mal_points = go.Scatter(name="Classified as Malignant", 
                    x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign", 
                    x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal], 
                    mode="markers", marker=dict(opacity=0.5, color="blue"))
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
dec_line = go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="black", dash="dot"))
py.iplot([mal_points, ben_points, model_line,dec_line], filename="lr-05")
Out[13]:

Compute ZeroOneLoss

In [14]:
from sklearn.metrics import zero_one_loss
print("Training Fraction incorrect:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
Training Fraction incorrect: 0.133802816901

Questions

  1. Are we happy with this error level?
  2. What error would we get if we just guessed the label?







Guessing the Majority Class

This is the simplest baseline we could imagine and one you should always compare against. Let's start by asking: what is the majority class?

In [15]:
print("Fraction of Malignant Samples:", np.mean(Y))
Fraction of Malignant Samples: 0.370892018779

Therefore, if we always guessed the majority class (benign), what accuracy would we get?

In [16]:
# You can figure this out from the above number
# print("Guess Majority:",  zero_one_loss(Y, np.zeros(len(Y))))

This is a standard example of a common problem in classification (and perhaps modern society): class imbalance.

Class Imbalance

Class imbalance occurs when a disproportionate fraction of the samples are in one class (in this case benign). In extreme cases (e.g., fraud detection) only a tiny fraction of the training data may belong to a particular class. In these settings we can achieve very high accuracy by always predicting the frequent class, without ever learning a good classifier for the rare classes.
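
As a small illustration with synthetic labels (not the tumor data), a 99%/1% split means the trivial majority-class predictor already reaches 99% accuracy:

labels = np.hstack([np.zeros(990), np.ones(10)])         # synthetic: 99% class 0, 1% class 1
majority_guess = np.zeros(len(labels))                   # always predict the frequent class
print("Accuracy:", np.mean(majority_guess == labels))    # 0.99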





Addressing Class Imbalance

There are many techniques for managing class imbalance; here are a few:

  1. Re-sample the data to reduce or eliminate the class imbalance (see the sketch below).
  2. Try learning algorithms that are a little more robust to class imbalance (e.g., decision trees).

In this example the class imbalance is not that extreme so we will continue without re-sampling.
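
For completeness, here is a minimal sketch of option 1, undersampling the majority (benign) class in the training set. The balanced copy below is for illustration only and is not used in the rest of the notebook:

# Undersample the benign rows so both classes are equally represented.
mal = data_tr[data_tr['malignant']]
ben = data_tr[~data_tr['malignant']]
ben_down = ben.sample(n=len(mal), random_state=42)   # drop the extra benign rows
data_balanced = pd.concat([mal, ben_down])
print("Balanced malignant fraction:", data_balanced['malignant'].mean())   # 0.5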

Cross Validation of Zero-One Error

In [17]:
from sklearn.model_selection import KFold
kfold = KFold(3,shuffle=True, random_state=42)
linreg_errors = []
models = []
for tr_ind, te_ind in kfold.split(X):
    model = linear_model.LinearRegression()
    model.fit(X[tr_ind,], Y[tr_ind])
    models.append(model)
    linreg_errors.append(zero_one_loss(Y[te_ind], model.predict(X[te_ind,]) > 0.5))
    
print("Min Validation Error:   ", np.min(linreg_errors))
print("Median Validation Error:", np.median(linreg_errors))
print("Max Validation Error:   ", np.max(linreg_errors))
Min Validation Error:    0.0845070422535
Median Validation Error: 0.161971830986
Max Validation Error:    0.176056338028

Pretty broad range?

Not very robust ...

In [18]:
# dec_lines = [
#     go.Scatter(name="Decision Boundary", 
#                x = [(0.5 - m.intercept_)/m.coef_[0]]*2, 
#                y=[-0.5,1.5], mode="lines",
#                line=dict(dash="dot"))
#     for m in models]

# X_plt = np.linspace(np.min(X), np.max(X), 10)
# model_lines = [
#     go.Scatter(name="Least Squares " + str(zero_one_loss(Y, m.predict(X) > 0.5)),
#                x=X_plt, y=m.predict(np.array([X_plt]).T), 
#                mode="lines")
#     for m in models]
# py.iplot([points] + model_lines + dec_lines)

Can we think of the line as a "probability"?








Not really. Probabilities are constrained between 0 and 1. How could we learn a model that captures this probabilistic interpretation?







Could we just truncate the line?

Maybe we can define the probability as:

$$ \large p_i = \min\left(\max \left( x^T \theta , 0 \right), 1\right) $$

this would look like:

In [19]:
def bound01(z):
    u = np.where(z > 1, 1, z)
    return np.where(u < 0, 0, u)
In [20]:
X_plt = np.linspace(np.min(X), np.max(X), 100)
p_line = go.Scatter(name="Truncated Least Squares",
    x=X_plt, y=bound01(least_squares_model.predict(np.array([X_plt]).T)), 
    mode="lines", line=dict(color="green", width=8))
py.iplot([mal_points, ben_points, model_line, p_line, dec_line], filename="lr-06")
Out[20]:

Not Satisfied Yet?

So far least squares regression seems pretty reasonable and we can "force" the model values to be bounded between 0 and 1 (a must for probabilities).

Can we interpret the truncated values as probabilities? Perhaps, but it would depend on how the model is estimated (more on this soon).








Issue with Extreme Points

It seems like large tumor sizes are indicative of malignant tumors. Suppose we measured a very large tumor at 100mm that is also malignant. What would this do to our model?

Let's add an extra data point and see what happens:

In [21]:
X_ex = np.vstack([X, [100]])
Y_ex = np.hstack([Y, 1.])
least_squares_model_ex = linear_model.LinearRegression()
least_squares_model_ex.fit(X_ex, Y_ex)
Out[21]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
In [22]:
X_plt = np.linspace(np.min(X), np.max(X), 100)

extreme_point = go.Scatter(
    name="Extreme Point", x=[100], y=[1], mode="markers", 
    marker=dict(color="black", size=10))
model_line.line.color = "gray"
model_line_ex = go.Scatter(name="New Least Squares",
    x=X_plt, y=least_squares_model_ex.predict(np.array([X_plt]).T), 
    mode="lines", line=dict(color="orange"))

dec_line.line.color = "gray"

dec_boundary_ex = (0.5 - least_squares_model_ex.intercept_)/least_squares_model_ex.coef_[0]
dec_line_ex = go.Scatter(
    name="Decision Boundary", 
    x = [dec_boundary_ex, dec_boundary_ex], y=[-0.5,1.5], mode="lines",
    line=dict(color="black", dash="dash"))



py.iplot([mal_points, ben_points,model_line, model_line_ex, dec_line, dec_line_ex], filename="lr-07")
Out[22]:
In [23]:
print("Before:", 
      zero_one_loss(Y_ex, least_squares_model.predict(X_ex) > 0.5))
print("After:", 
      zero_one_loss(Y_ex, least_squares_model_ex.predict(X_ex) > 0.5))
Before: 0.133489461358
After: 0.182669789227

The error got worse upon observing an extreme point that agrees with our trend!


Return to Slides









Empirical Probability

What does the empirical probability of being malignant for different sizes look like?

In [24]:
X_slices = np.linspace(8, 24, 10)
P_emp = np.array([
    np.mean(Y[np.squeeze(np.abs(X - c)) < 2.0]) for c in X_slices
])
In [25]:
p_emp_line = go.Scatter(name="Empirical Probabilities",
    x=X_slices, y=P_emp, 
    mode="lines", line=dict(color="green", width=8))
py.iplot([mal_points, ben_points, model_line, p_emp_line, dec_line], filename="lr-07")
Out[25]:

Logistic Activation Function

The logistic function (sometimes called the sigmoid function) has the above shape and is commonly used to transform a real number into a value that can be used to encode a probability. The logistic function has some interesting mathematical properties, but let's start with its form:

$$\large \sigma(t) = \frac{1}{1 + e^{-t}} $$
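
Two identities that are easy to verify from this definition, and that show up in the gradient derivation referenced later, are:

$$\large 1 - \sigma(t) = \sigma(-t) \qquad \text{and} \qquad \frac{d}{dt}\sigma(t) = \sigma(t)\left(1 - \sigma(t)\right) $$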
In [26]:
def sigmoid(t):
    return 1. / (1. + np.exp(-t))
In [27]:
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
    x=t, y=sigmoid(t), mode="lines")
py.iplot([sigmoid_line], filename="lr-08")
Out[27]:

Return to Slides








Gradient Descent

Recall the algorithm consists of iteratively computing:

$$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \rho(t) \nabla_\theta \left(- \log\mathcal{L}(\theta) \right) $$

where $\rho(t)$ (sometimes called the learning rate) is typically:

$$ \large \rho(t) = \frac{1}{t} $$

and for logistic regression the gradient of the negative log-likelihood is given by:

$$\large \nabla_\theta \left(- \log\mathcal{L}(\theta) \right) = \large \sum_{i=1}^n \left(\sigma\left(x_i^T \theta\right) - y_i \right) x_i $$

See the lecture slides for a derivation.

In [28]:
def gradient_descent(X, Y, theta0, gradient_function, max_iter = 1000000,  
                     epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1./t
        grad = gradient_function(theta, X, Y)
        theta = theta - rho * grad
        # Track Convergence
        if np.linalg.norm(grad) < epsilon:
            return theta
    return theta

Convergence: When to stop iterating

The gradient descent algorithm is typically run until one of the following conditions is met:

  1. $\theta$ stops changing (by more than epsilon); see the sketch below
  2. a maximum number of iterations is reached (i.e., the user gets impatient)
  3. the gradient of the objective is zero (this is always a stopping condition, why?)
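
The gradient_descent function above effectively uses conditions 2 and 3 (via the norm of the gradient). A minimal variant implementing condition 1, which monitors how much $\theta$ moves between iterations, might look like this (a sketch, not the version used below):

def gradient_descent_theta_change(X, Y, theta0, gradient_function,
                                  max_iter=1000000, epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1./t
        new_theta = theta - rho * gradient_function(theta, X, Y)
        # Condition 1: stop once theta barely changes between iterations.
        if np.linalg.norm(new_theta - theta) < epsilon:
            return new_theta
        theta = new_theta
    return theta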

Defining $\nabla_\theta \left(- \log\mathcal{L}(\theta) \right)$

In the following we implement the gradient function for a dataset X and Y.

In [29]:
def neg_gradL(theta, X, Y):
    return -(X.T @ (Y - sigmoid(X @ theta)))

In the following we standardize the features for numerical stability. Note that, because of the exponentials in the sigmoid, extreme values can be an issue.

In [30]:
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
standardizer.fit(X)
Xst = standardizer.transform(X)

We will also add a constant (bias or offset) term.

In [31]:
Phi = np.hstack([Xst,np.ones([len(Xst),1])])

Run the batch gradient descent solver

In [32]:
theta_batch = gradient_descent(Phi, Y, np.zeros(2), neg_gradL)
theta_batch
Out[32]:
array([ 3.30903971, -0.63546791])

Examining the solution

In the following we plot the logistic regression probability model and the corresponding decision boundary. To compute the decision boundary, recall from the model definition that $\textbf{P}(Y = 1 | X) = \sigma(x^T \theta)$, so $\textbf{P}(Y = 1 | X) > 0.5$ exactly when $\sigma(x^T \theta) > 0.5$, i.e., when $x^T \theta > \sigma^{-1}(0.5) = 0$. Note, however, that we standardized the features in $X$ to construct $\Phi$, so after solving for $X$ at the decision boundary we need to apply the inverse standardization (multiply by the standard deviation and add back the mean).

In [33]:
Phi_plt = np.hstack([
    standardizer.transform(np.array([X_plt]).T), 
    np.ones((len(X_plt),1))])

# The logistic regression decision boundary is when the underlying linear
# model passes through zero
lr_dec_boundary = (0.0 - theta_batch[1])/theta_batch[0]
lr_dec_boundary = standardizer.inverse_transform([lr_dec_boundary])[0]
lr_dec_line = go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[-0.5,1.5], mode="lines",
                     line=dict(color="red", dash="dot"))

lr_line = go.Scatter(name="SGD Logistic Regression",
    x=X_plt, y=sigmoid(Phi_plt @ theta_batch), 
    mode="lines", line=dict(color="orange", width=4))
py.iplot([points, lr_line, dec_line, lr_dec_line], filename="lr-09")
Out[33]:

Adding the decision boundaries to our earlier histograms:

In [34]:
plt = ff.create_distplot([
    data[data['malignant']]['mean radius'],
    data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
py.iplot(plt, filename="lr-10")
Out[34]:

Comparison of Training Error

In [35]:
print("Least Squares Training Prediction Error:", 
      zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Logistic Regression Prediction Error:", 
      zero_one_loss(Y, sigmoid(Phi @ theta_batch) > 0.5))
Least Squares Training Prediction Error: 0.133802816901
Logistic Regression Prediction Error: 0.129107981221

What does the loss surface look like?

Let's examine the loss function surface and our solution. Recall that the loss function is the negative of the log-likelihood (minimizing the loss maximizes the likelihood) and has the following form: $$\large -\log \mathcal{L}(\theta) = -\sum_{i=1}^n \left[ y_i \log\left( \sigma\left(x_i^T \theta \right) \right)+ \left(1-y_i \right)\log \left(1 - \sigma\left(x_i^T \theta\right) \right) \right] $$

We can implement this as the following Python function:

In [36]:
def lr_loss(theta, Phi, Y):
    p = sigmoid(Phi @ theta)
    return -np.sum((Y[:, np.newaxis] * np.log(p) + 
                    (1-Y[:, np.newaxis]) * np.log(1-p)), 
                   axis=0)

By evaluating the loss at a grid of points we can construct the loss surface:

In [37]:
uvalues = np.linspace(2,8,70)
vvalues = np.linspace(-4,3,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = lr_loss(thetas, Phi, Y)
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
        x=u, y=v, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
    )
optimal_batch_lr_point = go.Scatter3d(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]], 
        z = lr_loss(np.array([theta_batch]).T, Phi, Y),
        marker=dict(size=10, color="red")
    )
py.iplot(go.Figure(data=[lr_loss_surface, optimal_batch_lr_point]), filename="lr-11")
Out[37]:

Visualizing the gradient descent path

In the following we plot the solution path taken by gradient descent.

However, to make the plot easier to visualize (converging in fewer, more consistent iterations), I constrained the gradient descent procedure to take steps of length at most one. This prevents the solution from exploding in the first few iterations. For those interested in optimization techniques, this is a form of proximal gradient descent.

In [38]:
def batch_gd_thetas(X, Y, theta0, grad, max_iter = 1000000,  
             epsilon=0.01):
    theta = theta0
    thetas = [theta0]
    n = len(X)
    for t in range(1, max_iter):
        rho = 1./t
        g = grad(theta, X, Y)
        # To make the solution (and plot) more stable 
        # I have constrained the gradient step size to
        # the unit ball.
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
        # Track Convergence
        if np.linalg.norm(g) < epsilon:
            return np.array(thetas)
    return np.array(thetas)

all_thetas = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), neg_gradL)

thata_points = go.Scatter(name="Theta Values", x=all_thetas[:,0], y=all_thetas[:,1],
                          mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values, (len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True
    )

optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_batch[0]], y = [theta_batch[1]],
        marker=dict(size=10, color="red")
    )

py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-12")
Out[38]:

Stochastic Gradient Descent

In the following we implement stochastic gradient descent (SGD) to solve the logistic regression problem. Later we will use much more efficient and robust library implementations (and you should do the same); we implement SGD here for its pedagogical value.

In [39]:
def sgd(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001):
    theta = theta0
    n = len(X)
    for t in range(1, max_iter):
        rho = 1./t
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        # Track Convergence
        if (t % 10000) == 0:
            if np.linalg.norm(grad(theta, X, Y)) < epsilon:
                return theta
    return theta

Computing the theta values:

In [40]:
theta_sgd = sgd(Phi, Y, np.array([2.5, 2.5]), neg_gradL)
theta_sgd
Out[40]:
array([ 3.29127697, -0.71010776])

Visualizing the SGD Solution Path

Plotting the solution path

In [41]:
def sgd_thetas(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001):
    theta = theta0
    n = len(X)
    thetas = [theta]
    for t in range(1, max_iter):
        rho = 1/t
        i = t % n
        g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
    return np.array(thetas)


all_thetas = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), neg_gradL)

thata_points = go.Scatter(name="Theta Values", x=all_thetas[:,0], y=all_thetas[:,1],
        mode="lines+markers", showlegend=False
    )
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
        x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
        colorscale='Viridis', reversescale=True, 
    )
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
        x = [theta_sgd[0]], y = [theta_sgd[1]],
        marker=dict(size=10, color="red"), showlegend=False
    )
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
Out[41]:

What is going on?








Comparing the Solutions

In [42]:
plt = ff.create_distplot([
    data[data['malignant']]['mean radius'],
    data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))

lr_sgd_dec_boundary = (0.0 - theta_sgd[1])/theta_sgd[0]
lr_sgd_dec_boundary = standardizer.inverse_transform([lr_sgd_dec_boundary])[0]
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
py.iplot(plt, filename="lr-14")
Out[42]:

Return to Slides







Regularization and Separability

The built-in scikit-learn logistic regression models use $L^2$ regularization by default (a good idea).

  1. Why might regularization be necessary?
  2. What happens if the data are separable, i.e., there is a hyperplane that perfectly separates one class from the other?
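
For reference, the $L^2$-regularized estimator minimizes the penalized negative log-likelihood below; scikit-learn exposes the penalty strength through its inverse, the parameter C (larger C means weaker regularization):

$$\large \hat{\theta} = \arg\min_\theta \; -\log \mathcal{L}(\theta) + \lambda \lVert \theta \rVert_2^2 $$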






In [43]:
Xtoy = np.hstack([-np.random.rand(20) - 0.5, np.random.rand(20) + 0.5])
Ytoy = np.hstack([np.zeros(20), np.ones(20)])
In [44]:
toy_theta = gradient_descent(np.hstack([np.array([Xtoy]).T, np.ones([len(Xtoy),1])]), Ytoy, 
                             np.zeros(2), neg_gradL,
                             epsilon=0.00001)
toy_theta
Out[44]:
array([  2.09158781e+01,  -4.86189917e-04])

What happens if the data are separable?

In [45]:
X_toy_plt = np.hstack([
    np.array([np.linspace(-2,2,100)]).T, 
    np.ones((100,1))])
py.iplot([
    go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="blue")),
    go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20), 
               mode="markers", marker=dict(opacity=0.5, color="red")),
    go.Scatter(name="SGD Logistic Regression",
    x=np.linspace(-2,2,100), y=sigmoid(X_toy_plt @ toy_theta), 
    mode="lines", line=dict(color="green", width=4))
], filename="lr-15")
Out[45]:

Scikit-Learn Logistic Regression

We will use the built-in cross-validation support for logistic regression in scikit-learn.

In [46]:
lr = linear_model.LogisticRegressionCV(100)
lr.fit(X,Y)
Out[46]:
LogisticRegressionCV(Cs=100, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

Plotting the predicted probability

In [47]:
lr_line = go.Scatter(name="Scikit-LR",
    x=X_plt, y=lr.predict_proba(np.array([X_plt]).T).T[1], 
    mode="lines", line=dict(color="green", width=4))
py.iplot([points, lr_line], filename="lr-16")
Out[47]:
In [48]:
plt = ff.create_distplot([
    data[data['malignant']]['mean radius'],
    data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'],  bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary", 
                      x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary", 
                      x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
                     line=dict(color="black", dash="dot")))
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary", 
                      x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="blue", dash="dot")))
lr_sk_dec_boundary = (0.0 - lr.intercept_)/lr.coef_[0]

plt.data.append(go.Scatter(name="SK Logistic Reg. Dec. Boundary", 
                      x = [lr_sk_dec_boundary,lr_sk_dec_boundary], 
                           y=[0,0.25], mode="lines",
                     line=dict(color="pink", dash="dot")))

py.iplot(plt, filename="lr-17")
Out[48]:

LR Prediction accuracy

Comparing the cross-validated prediction error with least squares, we see a bit of an improvement.

In [49]:
lr_errors = cross_val_score(linear_model.LogisticRegression(), X, Y, 
                scoring=make_scorer(zero_one_loss), cv=5)
print("LeastSquares Error:", np.mean(linreg_errors))
print("LR Error:", np.mean(lr_errors))
LeastSquares Error: 0.140845070423
LR Error: 0.129401993355

We can also examine the extreme point example from before:

In [50]:
lr_extreme = linear_model.LogisticRegression()
lr_extreme.fit(X_ex, Y_ex)

print("Least Squares Extreme Point:", 
      zero_one_loss(Y_ex, least_squares_model_ex.predict(X_ex) > 0.5))
print("LR Extreme Point:", 
      zero_one_loss(Y_ex, lr_extreme.predict(X_ex) > 0.5))
Least Squares Extreme Point: 0.182669789227
LR Extreme Point: 0.126463700234

Return to Slides







Multiclass Prediction & Feature Engineering

We temporarily return to a simple synthetic dataset.

In [51]:
np.random.seed(42)
nk = 50
Xmc = np.vstack([
    np.random.randn(nk,2) + np.array([3,1]),
    np.random.randn(nk,2) + np.array([1,3]),
    0.5 * np.random.randn(nk,2) ,
    0.5 * np.random.randn(nk,2) + np.array([3,3])
])
Ymc = np.hstack([np.zeros(nk), np.ones(nk), 2*np.ones(nk),2*np.ones(nk)])

blue_points = go.Scatter(name = "A", x=Xmc[Ymc==0,0], y=Xmc[Ymc==0,1], 
               mode="markers", marker=dict(opacity=0.5, color="blue"))
red_points  = go.Scatter(name = "B", x=Xmc[Ymc==1,0], y=Xmc[Ymc==1,1], 
               mode="markers", marker=dict(opacity=0.5, color="red"))
orange_points = go.Scatter(name = "C", x=Xmc[Ymc==2,0], y=Xmc[Ymc==2,1], 
               mode="markers", marker=dict(opacity=0.5, color="orange"))



py.iplot([blue_points, red_points, orange_points], filename="lr-18")
Out[51]:

Train a Soft-Max Classifier

Here we train a soft-max classifier. Currently scikit-learn requires that we use the L-BFGS solver for soft-max (multinomial) logistic regression. L-BFGS is a quasi-Newton method that uses both the gradient and a limited-memory approximation of the second derivative (the Hessian) to solve for the optimal parameters.
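
For reference, the soft-max (multinomial logistic) model keeps one parameter vector $\theta_k$ per class and assigns class probabilities as:

$$\large \textbf{P}(Y = k \mid x) = \frac{\exp\left(x^T \theta_k\right)}{\sum_{j} \exp\left(x^T \theta_j\right)} $$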

In [52]:
lr = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr.fit(Xmc,Ymc)
Out[52]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Plotting the Decision Surface

In the following we plot the corresponding decision surface generated by the basic multiclass logistic regression model. Notice the limitations of the linear decision surface.

In [53]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.vstack((u.flatten(),v.flatten())).T
label = lr.predict(coords)
label_contour = go.Contour(
    name="Label Contour Surface",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]), 
         filename="lr-19")
Out[53]:

Feature Engineering with Gaussian RBFs

To capture the non-linear structure we randomly sample 20 data points and create an RBF feature centered at each of them.
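
Each sampled center $u$ contributes one feature of the form below (this matches the gaussian_rbf helper in the next cell, with bandwidth $\lambda$):

$$\large \phi_u(x) = \exp\left(-\frac{\lVert x - u \rVert^2}{\lambda^2}\right) $$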

In [54]:
def gaussian_rbf(us, lam=1):
    return lambda x: np.array([np.exp(-np.linalg.norm(x - u)**2 / lam**2) for u in us])

num_basis = 20
np.random.seed(42)
rbf_features = gaussian_rbf(Xmc[np.random.choice(range(len(Xmc)), num_basis, replace=False),:])
Phimc = np.array([rbf_features(x) for x in Xmc])

lr_rbf = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_rbf.fit(Phimc,Ymc)
Out[54]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

Fitting the logistic regression model on these features, we get a non-linear decision surface in our original space, which corresponds to a linear decision surface in the higher-dimensional transformed space.

In [55]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
label = lr_rbf.predict(coords)
label_contour = go.Contour(
    name="Label Contour Surface",
    x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
    colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]), 
         filename="lr-20")
Out[55]:
In [56]:
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
prb = lr_rbf.predict_proba(coords)

surfaces = [
    go.Surface(name=str(i),
        x=u, y=v, z=np.reshape(prb[:,i],(len(uvalues), len(vvalues))), 
               showscale = False
    )
    for i in range(3)
]

py.iplot(go.Figure(data=surfaces), filename="lr-21")
Out[56]:
In [ ]: