import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("talk")
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')
This notebook accompanies the lecture on Logistic Regression.
Notebook created by Joseph E. Gonzalez for DS100.
For this lecture we will use the famous Wisconsin Breast Cancer Dataset, which we can obtain from scikit-learn.
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# Target data_dict['target'] = 0 is malignant 1 is benign
data['malignant'] = (data_dict['target'] == 0)
Plotting the binary malignant label directly against mean radius is a clear example of over-plotting. We can improve the plot by jittering the labels:
jitter_y = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) -0.05
points = go.Scatter(x=data['mean radius'], y = jitter_y, mode="markers", marker=dict(opacity=0.5))
tumor_layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
py.iplot(go.Figure(data=[points], layout=tumor_layout))
Perhaps a better way to visualize the data is with overlaid histograms, one per class.
py.iplot(ff.create_distplot([data.loc[~data['malignant'], 'mean radius'],
data.loc[data['malignant'], 'mean radius']],
group_labels=["Benign","Malignant"],
bin_size=0.5))
Always split your data into training and test groups.
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.25, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
We will define $X$ and $Y$ as variables containing the training features and labels.
X = data_tr[['mean radius']].values
Y = data_tr['malignant'].values.astype('float')
What does the empirical probability of being malignant for different sizes look like?
X_slices = np.linspace(8, 24, 10)
# For each slice center c, estimate P(malignant) as the fraction of malignant
# tumors among training points whose mean radius is within 2.0 of c.
P_emp = np.array([
    np.mean(Y[np.squeeze(np.abs(X - c)) < 2.0]) for c in X_slices
])
import sklearn.linear_model as linear_model
least_squares_model = linear_model.LinearRegression()
least_squares_model.fit(X,Y)
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
ind_mal = least_squares_model.predict(X) > 0.5
X_plt = np.linspace(np.min(X), np.max(X), 1000)
model_line = go.Scatter(name="Least Squares",
x=X_plt, y=least_squares_model.predict(np.array([X_plt]).T),
mode="lines", line=dict(color="orange"))
mal_points = go.Scatter(name="Classified as Malignant",
x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal],
mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign",
x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal],
mode="markers", marker=dict(opacity=0.5, color="blue"))
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
dec_line = go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
line=dict(color="black", dash="dot"))
p_emp_line = go.Scatter(name="Empirical Probabilities",
x=X_slices, y=P_emp,
mode="lines", line=dict(color="green", width=8))
py.iplot(go.Figure(data=[mal_points, ben_points, model_line, p_emp_line, dec_line], layout=tumor_layout))
The logistic function (sometimes called the sigmoid function) has a characteristic S shape and is commonly used to transform a real number into a value in $(0, 1)$ that can encode a probability. The logistic function has some interesting mathematical properties, but let's start with its form:
$$\large \sigma(t) = \frac{1}{1 + e^{-t}} $$
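Two such properties, which we will use below when deriving the cross entropy loss and its gradient, follow directly from this definition:
$$\large \sigma(-t) = 1 - \sigma(t), \qquad \frac{d}{dt}\sigma(t) = \sigma(t)\left(1 - \sigma(t)\right) $$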
def sigmoid(t):
    return 1. / (1. + np.exp(-t))
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
x=t, y=sigmoid(t), mode="lines")
py.iplot([sigmoid_line])
The following plots $\sigma(w t)$ for different $w$ values:
t = np.linspace(-5,5,100)
lines = []
for w in np.linspace(0.5, 10, 20):
    line = go.Scatter(name=str(w), x=t, y=sigmoid(w*t), mode="lines", visible=False)
    lines.append(line)
lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True  # Toggle i'th trace to "visible"
    steps.append(step)
sliders = [dict(active = 0, currentvalue = {"prefix": "scale term: "}, steps = steps)]
# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]),
yaxis=dict(range=[-0.5 , 1.5]),
showlegend=False,
sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))
The following plots $\sigma(5t + b)$ for different $b$ values:
t = np.linspace(-5,5,100)
lines = []
for b in np.linspace(-10, 10, 20):
    line = go.Scatter(name=str(b), x=t, y=sigmoid(5*t + b), mode="lines", visible=False)
    lines.append(line)
lines[0].visible = True
steps = []
for i in range(len(lines)):
    step = dict(
        label = lines[i]['name'],
        method = 'restyle',
        args = ['visible', [False] * (len(lines)+1)],
    )
    step['args'][1][i] = True  # Toggle i'th trace to "visible"
    steps.append(step)
sliders = [dict(active = 0, currentvalue = {"prefix": "offset term: "}, steps = steps)]
# render the plot
slider_layout = go.Layout(xaxis=dict(range=[t.min(), t.max()]),
yaxis=dict(range=[-0.5 , 1.5]),
showlegend=False,
sliders=sliders)
py.iplot(go.Figure(data = lines, layout=slider_layout))
We have frequently used the squared loss. Below we plot the squared loss using the logistic regression model:
$$\Large L(\theta) = \frac{1}{n}\sum_{i=1}^n \left( y_i - \sigma\left( \phi(x_i)^T \theta\right) \right)^2 $$
Unfortunately, the squared loss composed with the sigmoid is not convex in $\theta$. We can see this by plotting it on a small toy dataset:
ytmp = np.array([0, 0,1,0,1, 1])
xtmp = np.array([-4, -2,-0.5,1,3,5])
plt.plot(xtmp, ytmp, 'o')
plt.xlabel("X")
plt.ylabel("Y");
plt.savefig("toydata.pdf", bbox_inches="tight");
def l2loss(theta):
    return ((ytmp - sigmoid(theta * xtmp))**2).mean()
thetas = np.linspace(-5,5,100)
losses = np.array([l2loss(t) for t in thetas])
plt.plot(thetas,losses)
plt.xlabel(r"$\theta$")
plt.ylabel(r"$L(\theta)$");
plt.savefig("squaredloss.pdf", bbox_inches="tight")
The Kullback–Leibler (KL) divergence between two distributions is given by:
$$ \Large \textbf{D}(P || Q) = \sum_{k=1}^K P(k) \log \left( \frac{P(k)}{Q(k)} \right) $$
The following computes the KL divergence between two categorical distributions:
def kl_divergence(p, q):
    return np.sum(p * np.log(p / q))
Evaluating it on two toy distributions:
p = np.array([1,1,2,3,2,1,0,0,0,1,2,1,1,0,0]) + 0.1
p = p /p.sum()
q = np.array([0,0,1,1,3,2,2,5,2,1,1,1,0,1,1]) + 0.1
q = q /q.sum()
Visualizing the KL divergence:
plt.figure(figsize=(10,8))
plt.subplot(3,1,1)
classes = np.arange(len(p)) + 1
plt.bar(classes, p)
plt.ylabel("P")
plt.xticks(classes);
plt.subplot(3,1,2)
plt.bar(classes, q);
plt.ylabel("Q")
plt.xticks(classes);
plt.subplot(3,1,3)
plt.bar(classes, p * np.log(p/q), color='r')
plt.xticks(classes);
plt.ylabel(r"$p \, \log\left(\frac{p}{q}\right)$")
plt.savefig('pvq.pdf', bbox_inches='tight')
Notice that the contribution to the KL divergence is largest where P is large and does not match Q. Where P is small (unlikely events), Q does not need to match P closely.
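As a quick check of how direction matters, we can compute the divergence both ways (a small illustrative snippet using the kl_divergence, p, and q defined above; in general the two values differ):
# KL divergence is not symmetric: D(P || Q) != D(Q || P) in general.
print("D(P || Q):", kl_divergence(p, q))
print("D(Q || P):", kl_divergence(q, p))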
The cross entropy loss can be obtained by minimizing the KL divergence between the observed conditional distribution and the predicted conditional distribution; since the entropy of the observed distribution does not depend on $\theta$, only the cross entropy term remains:
$$\Large \arg \min_\theta -\frac{1}{n} \sum_{i=1}^n \sum_{k} \mathbf{P}\left(y_i = k \, | \, x_i\right)\log \left( \hat{\mathbf{P}}_\theta \left(y_i = k \, | \, x_i\right)\right) $$
For the Logistic binary classification model this reduces to:
$$\Large \arg \min_\theta -\frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i)^T \theta + \log \left(\sigma\left(-\phi(x_i)^T \theta\right) \right) \right) $$
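To see this reduction, write $t_i = \phi(x_i)^T \theta$ so that $\hat{\mathbf{P}}_\theta\left(y_i = 1 \, | \, x_i\right) = \sigma(t_i)$. For a binary label the cross entropy of example $i$ is
$$\Large -\left( y_i \log \sigma(t_i) + (1 - y_i) \log\left(1 - \sigma(t_i)\right) \right) $$
and using the identities $1 - \sigma(t) = \sigma(-t)$ and $\log \sigma(t) = t + \log \sigma(-t)$, this simplifies to $-\left(y_i t_i + \log \sigma(-t_i)\right)$, which averaged over the data gives the objective above.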
We implement this loss below:
def cross_entropy(theta):
    return np.mean(-ytmp * theta * xtmp - np.log(sigmoid(-theta * xtmp)))
Plotting this loss for the above toy example we get:
thetas = np.linspace(-5,5,100)
losses = np.array([cross_entropy(t) for t in thetas])
plt.plot(thetas,losses)
plt.xlabel(r"$\theta$")
plt.ylabel(r"$L(\theta)$");
plt.savefig("crossent.pdf", bbox_inches='tight')
There is no closed-form solution that minimizes this loss, so we use gradient descent.
Recall the algorithm consists of iteratively computing:
$$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \rho(t) \left(\nabla_\theta \mathbf{L}(\theta) \biggr\rvert_{\theta = \theta^{(t)}}\right) $$
where $\rho(t)$ (sometimes called the learning rate) is typically:
$$ \large \rho(t) = \frac{1}{t} $$
The gradient of the loss is given by: $$\large \nabla_\theta \textbf{L}(\theta) = \frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i) $$
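This follows from the loss above: with $t_i = \phi(x_i)^T \theta$ we have $\textbf{L}(\theta) = -\frac{1}{n}\sum_{i=1}^n \left(y_i t_i + \log\sigma(-t_i)\right)$, and since $\nabla_\theta t_i = \phi(x_i)$ and $\frac{d}{dt} \log \sigma(-t) = -\sigma(t)$, each term contributes $-\left(y_i - \sigma(t_i)\right)\phi(x_i) = \left(\sigma(t_i) - y_i\right)\phi(x_i)$ to the gradient.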
The logistic regression update equation is then: $$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \frac{1}{t} \left(\frac{1}{n}\sum_{i=1}^n \left(\sigma\left(\phi(x_i)^T \theta\right) - y_i \right) \phi(x_i)\right) $$
def gradient_descent(X, Y, theta0, gradient_function, max_iter=1000000,
                     epsilon=0.0001):
    theta = theta0
    for t in range(1, max_iter):
        rho = 1. / t  # decreasing learning rate rho(t) = 1/t
        grad = gradient_function(theta, X, Y)
        theta = theta - rho * grad
        # Track convergence: stop once the gradient is nearly zero
        if np.linalg.norm(grad) < epsilon:
            return theta
    return theta
The gradient descent algorithm is typically run until either the norm of the gradient falls below a small threshold $\epsilon$ (approximate convergence) or a maximum number of iterations is reached.
In the following we implement the gradient function for a dataset X and Y.
def gradL(theta, X, Y):
    # Gradient of the (summed, not averaged) negative log-likelihood:
    # the sum over examples of (sigma(x_i^T theta) - y_i) x_i.
    return (X.T @ (sigmoid(X @ theta) - Y))
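As a quick sanity check (not part of the original lecture flow), we can compare gradL against a finite-difference approximation of the corresponding summed negative log-likelihood on a tiny synthetic problem; the names nll_sum, X_check, and Y_check below are just illustrative helpers:
# Finite-difference check of gradL on a small synthetic problem.
# Note that gradL returns the *sum* of per-example gradients, so we compare
# against the summed (not averaged) negative log-likelihood.
def nll_sum(theta, X, Y):
    t = X @ theta
    return -np.sum(Y * t - np.logaddexp(0, t))  # logaddexp(0, t) = log(1 + e^t)

np.random.seed(0)
X_check = np.random.randn(5, 2)
Y_check = np.array([0., 1., 1., 0., 1.])
theta_check = np.array([0.3, -0.2])
eps = 1e-6
fd_grad = np.array([
    (nll_sum(theta_check + eps * np.eye(2)[j], X_check, Y_check) -
     nll_sum(theta_check - eps * np.eye(2)[j], X_check, Y_check)) / (2 * eps)
    for j in range(2)
])
print("finite difference:", fd_grad)
print("gradL:            ", gradL(theta_check, X_check, Y_check))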
In the following we standardize the features for numerical stability; because of the exponential inside the sigmoid, extreme feature values can cause overflow.
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
standardizer.fit(X)
Xst = standardizer.transform(X)
We will also add a constant (bias or offset) term.
Phi = np.hstack([Xst,np.ones([len(Xst),1])])
Run the batch gradient descent solver
theta_batch = gradient_descent(Phi, Y, np.ones(2), gradL)
theta_batch
In practice you will again use libraries with optimized implementations of these algorithms. However it is worth knowing how they work.
from sklearn.linear_model import LogisticRegression
lr_basic = LogisticRegression(fit_intercept=False, C=100.00)
# 1) We already added an intercept column to Phi, so fit_intercept=False.
# 2) C = 1/lambda is the inverse regularization strength; scikit-learn
#    regularizes by default, so we make the penalty weak by setting C large.
lr_basic.fit(Phi, Y)
lr_basic.coef_
The solution is slightly different because the scikit-learn logistic regression model adds regularization by default. This stabilizes the solution and helps its more advanced optimization routines converge reliably.
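To check this explanation we can weaken the regularization further: as C grows, the scikit-learn coefficients should move toward the unregularized gradient descent solution theta_batch (a quick illustrative check; lr_weak_reg is not part of the original notebook):
# With very weak regularization (large C), the scikit-learn solution
# should be close to the gradient descent solution computed above.
lr_weak_reg = LogisticRegression(fit_intercept=False, C=1e8)
lr_weak_reg.fit(Phi, Y)
print("sklearn (C = 1e8):", lr_weak_reg.coef_)
print("gradient descent: ", theta_batch)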
In the following we plot the logistic regression probability model and the corresponding decision boundary. To compute the decision boundary, recall from the model definition that $\textbf{P}(Y = 1 \, | \, X) = \sigma(x^T \theta)$, so $\textbf{P}(Y = 1 \, | \, X) > 0.5$ exactly when $\sigma(x^T \theta) > 0.5$, i.e., when $x^T \theta > \sigma^{-1}(0.5) = 0$. Note, however, that we standardized the features in $X$ to construct $\Phi$, so after solving for the boundary in standardized coordinates we need to apply the inverse standardization (multiply by the standard deviation and add back the mean).
Phi_plt = np.hstack([
standardizer.transform(np.array([X_plt]).T),
np.ones((len(X_plt),1))])
# The logistic regression decision boundary is when the underlying linear
# model passes through zero
lr_dec_boundary = (0.0 - theta_batch[1])/theta_batch[0]
lr_dec_boundary = standardizer.inverse_transform([[lr_dec_boundary]])[0, 0]
lr_dec_line = go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[-0.5,1.5], mode="lines",
line=dict(color="red", dash="dot"))
lr_line = go.Scatter(name="Gradient Descent Logistic Regression",
x=X_plt, y=sigmoid(Phi_plt @ theta_batch),
mode="lines", line=dict(color="orange", width=4))
py.iplot(go.Figure(data=[points, lr_line, dec_line, lr_dec_line], layout=tumor_layout))
Adding the decision boundaries to our earlier histograms:
plt = ff.create_distplot([
data[~data['malignant']]['mean radius'],
data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
py.iplot(plt)
from sklearn.metrics import zero_one_loss
print("Least Squares Training Prediction Error:",
zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Logistic Regression Prediction Error:",
zero_one_loss(Y, sigmoid(Phi @ theta_batch) > 0.5))
Let's examine the loss function surface and our solution. Recall that the loss function is the negative of the log-likelihood (minimize loss = maximize likelihood) and has the following form:
$$\Large L(\theta) = -\frac{1}{n} \sum_{i=1}^n \left( y_i \phi(x_i)^T \theta + \log \left(\sigma\left(-\phi(x_i)^T \theta\right) \right) \right) $$
We can implement this as the following Python function:
def lr_loss(theta, Phi, Y):
    t = Phi @ theta
    # np.logaddexp(0, t) = log(1 + exp(t)), computed without overflow
    return -np.mean(Y * t - np.logaddexp(0, t), axis=0)
By evaluating the loss at a grid of points we can construct the loss surface:
uvalues = np.linspace(1,8,70)
vvalues = np.linspace(-5,5,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = np.array([lr_loss(t, Phi, Y) for t in thetas.T])
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
x=u, y=v, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
)
optimal_batch_lr_point = go.Scatter3d(name = "Optimal Point",
x = [theta_batch[0]], y = [theta_batch[1]],
z = [lr_loss(theta_batch, Phi, Y)],
marker=dict(size=10, color="red")
)
py.iplot(go.Figure(data=[lr_loss_surface, optimal_batch_lr_point]))
In the following we plot the solution path taken by gradient descent.
However, to make the plot easier to visualize (so the solver converges in fewer, more consistent iterations), I introduced gradient clipping: whenever the gradient norm exceeds one, the gradient is rescaled to unit norm. This prevents the solution from jumping wildly in the first few iterations. For those interested in optimization techniques, this simple form of gradient clipping is closely related to normalized gradient descent.
def batch_gd_thetas(X, Y, theta0, grad, max_iter=1000000,
                    epsilon=0.01, clipping=True, rho0=1.):
    theta = theta0
    thetas = [theta0]
    n = len(X)
    for t in range(1, max_iter):
        rho = rho0 / t
        g = grad(theta, X, Y)
        # To make the solution (and plot) more stable, constrain the
        # gradient step to the unit ball ("gradient clipping").
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
        # Track convergence
        if np.linalg.norm(g) < epsilon:
            return np.array(thetas)
    return np.array(thetas)
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL)
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_batch[0]], y = [theta_batch[1]],
marker=dict(size=10, color="red")
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]))
theta_path = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), gradL, rho0=5.0)
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(np.log(lr_loss_values), (len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_batch[0]], y = [theta_batch[1]],
marker=dict(size=10, color="red")
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point],
layout=dict()))
In the following we implement stochastic gradient descent (SGD) to solve the logistic regression problem. Afterwards we use much more efficient and robust libraries (and you should do the same). However we implement SGD here for pedagogical value.
def sgd(X, Y, theta0, grad, max_iter=5000, batch_size=50, epsilon=0.0001):
    theta = theta0
    n = len(X)
    for t in range(1, max_iter):
        rho = 1. / t
        # Take a mini-batch of batch_size consecutive rows
        i = t % n
        # grad returns a sum over the batch; rescale to estimate the full-data sum
        g = n / batch_size * grad(theta, X[i:(i + batch_size), :], Y[i:(i + batch_size)])
        # Clip the gradient to the unit ball for stability
        if np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        # Occasionally check convergence on the full dataset (expensive);
        # note that with max_iter=5000 this check never actually fires.
        if (t % 10000) == 0:
            if np.linalg.norm(grad(theta, X, Y)) < epsilon:
                return theta
    return theta
Computing the theta values:
theta_sgd = sgd(Phi, Y, np.array([2.5, 2.5]), gradL)
theta_sgd
Plotting the solution path
def sgd_thetas(X, Y, theta0, grad, max_iter=5000, batch_size=50, epsilon=0.0001,
               clipping=True, rho0=1.):
    theta = theta0
    n = len(X)
    thetas = [theta]
    for t in range(1, max_iter):
        rho = rho0 / t
        i = t % n
        # Rescale the mini-batch gradient (a sum) to estimate the full-data sum
        g = n / batch_size * grad(theta, X[i:(i + batch_size), :], Y[i:(i + batch_size)])
        if clipping and np.linalg.norm(g) > 1:
            g = g / np.linalg.norm(g)
        theta = theta - rho * g
        thetas.append(theta)
    return np.array(thetas)
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL)
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
mode="lines+markers", showlegend=False
)
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True,
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_sgd[0]], y = [theta_sgd[1]],
marker=dict(size=10, color="red"), showlegend=False
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5.)
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
mode="lines+markers", showlegend=False
)
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True,
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_sgd[0]], y = [theta_sgd[1]],
marker=dict(size=10, color="red"), showlegend=False
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
theta_path = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), gradL, rho0=5., batch_size=100)
thata_points = go.Scatter(name="Theta Values", x=theta_path[:,0], y=theta_path[:,1],
mode="lines+markers", showlegend=False
)
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True,
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_sgd[0]], y = [theta_sgd[1]],
marker=dict(size=10, color="red"), showlegend=False
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
plt = ff.create_distplot([
data[~data['malignant']]['mean radius'],
data[data['malignant']]['mean radius']
], group_labels=['benign','malignant'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
lr_sgd_dec_boundary = (0.0 - theta_sgd[1])/theta_sgd[0]
lr_sgd_dec_boundary = standardizer.inverse_transform([[lr_sgd_dec_boundary]])[0, 0]
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary",
x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="blue", dash="dot")))
py.iplot(plt, filename="lr-14")
The built-in scikit-learn logistic regression models use $L^2$ regularization by default (a good idea).
Xtoy = np.hstack([-np.random.rand(20) - 0.5, np.random.rand(20) + 0.5])
Ytoy = np.hstack([np.zeros(20), np.ones(20)])
Phi_toy = np.hstack([np.array([Xtoy]).T, np.ones([len(Xtoy),1])])
toy_theta = gradient_descent(Phi_toy, Ytoy,
np.array([1,-1]), gradL,
epsilon=0.0001)
toy_theta
What happens if the data are separable?
X_toy_plt = np.hstack([
np.array([np.linspace(-2,2,100)]).T,
np.ones((100,1))])
py.iplot([
go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="blue")),
go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="red")),
go.Scatter(name="Logistic Regression",
x=np.linspace(-2,2,100), y=sigmoid(X_toy_plt @ toy_theta),
mode="lines", line=dict(color="green", width=4))
], filename="lr-15")
bigger_toy_theta = toy_theta*10
X_toy_plt = np.hstack([
np.array([np.linspace(-2,2,200)]).T,
np.ones((200,1))])
py.iplot([
go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="blue")),
go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="red")),
go.Scatter(name="Logistic Regression",
x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ toy_theta),
mode="lines", line=dict(color="green", width=4)),
go.Scatter(name="Logistic Regression Bigger Theta",
x=np.linspace(-2,2,200), y=sigmoid(X_toy_plt @ bigger_toy_theta),
mode="lines", line=dict(color="red", width=4))
], filename="lr-15")
lr_loss(bigger_toy_theta, Phi_toy, Ytoy)
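On separable data, once $\theta$ classifies every point correctly the loss can always be reduced by scaling $\theta$ up, so without regularization the optimum is at infinity. A quick illustrative check using the lr_loss function defined above:
# On separable data, scaling theta up keeps decreasing the loss, so an
# unregularized solver would drive the weights toward infinity.
for scale in [1, 2, 5, 10]:
    print(scale, lr_loss(scale * toy_theta, Phi_toy, Ytoy))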
We will use the built-in cross-validation support for logistic regression in scikit-learn.
# Cs=100: cross-validate over a log-spaced grid of 100 regularization strengths
lr = linear_model.LogisticRegressionCV(100)
lr.fit(X,Y)
lr_line = go.Scatter(name="Scikit-LR",
x=X_plt, y=lr.predict_proba(np.array([X_plt]).T).T[1],
mode="lines", line=dict(color="green", width=4))
py.iplot([points, lr_line], filename="lr-16")
plt = ff.create_distplot([
data[~data['malignant']]['mean radius'],
data[data['malignant']]['mean radius']
], group_labels=['benign', 'malignant'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary",
x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="blue", dash="dot")))
lr_sk_dec_boundary = (0.0 - lr.intercept_[0])/lr.coef_[0,0]
plt.data.append(go.Scatter(name="SK Logistic Reg. Dec. Boundary",
x = [lr_sk_dec_boundary,lr_sk_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="pink", dash="dot")))
py.iplot(plt)
Comparing the prediction accuracy with least squares we see a bit of an improvement.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
lr_errors = cross_val_score(linear_model.LogisticRegression(), X, Y,
scoring=make_scorer(zero_one_loss), cv=5)
print("LR Error:", np.mean(lr_errors))
Compared to other models:
from sklearn.metrics import zero_one_loss
print("Least Squares Error:",
zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Always predict 0:", zero_one_loss(Y, np.zeros(Y.shape)))
We temporarily return to simple synthetic data, this time with three classes.
np.random.seed(42)
nk = 50
Xmc = np.vstack([
np.random.randn(nk,2) + np.array([3,1]),
np.random.randn(nk,2) + np.array([1,3]),
0.5 * np.random.randn(nk,2) ,
0.5 * np.random.randn(nk,2) + np.array([3,3])
])
Ymc = np.hstack([np.zeros(nk), np.ones(nk), 2*np.ones(nk),2*np.ones(nk)])
blue_points = go.Scatter(name = "A", x=Xmc[Ymc==0,0], y=Xmc[Ymc==0,1],
mode="markers", marker=dict(opacity=0.5, color="blue"))
red_points = go.Scatter(name = "B", x=Xmc[Ymc==1,0], y=Xmc[Ymc==1,1],
mode="markers", marker=dict(opacity=0.5, color="red"))
orange_points = go.Scatter(name = "C", x=Xmc[Ymc==2,0], y=Xmc[Ymc==2,1],
mode="markers", marker=dict(opacity=0.5, color="orange"))
py.iplot([blue_points, red_points, orange_points], filename="lr-18")
Here we train a soft-max (multinomial) classifier. scikit-learn requires a solver such as L-BFGS for the multinomial objective. L-BFGS is a quasi-Newton method that uses both the gradient and a low-rank approximation of the inverse Hessian (the matrix of second derivatives) to solve for the optimal parameters.
lr = linear_model.LogisticRegression(multi_class="multinomial", solver="lbfgs")
lr.fit(Xmc,Ymc)
In the following we plot the corresponding decision surface generated by the basic multiclass logistic regression model. Notice the limitations of the linear decision boundaries.
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.vstack((u.flatten(),v.flatten())).T
label = lr.predict(coords)
label_contour = go.Contour(
name="Label ContourSurace",
x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]))
To capture the non-linear structure we randomly sample 20 data points and create an RBF (radial basis function) feature centered at each of them.
def gaussian_rbf(us, lam=1):
    # Feature map: x -> [exp(-||x - u||^2 / lam^2) for each center u in us]
    return lambda x: np.array([np.exp(-np.linalg.norm(x - u)**2 / lam**2) for u in us])
num_basis = 20
np.random.seed(42)
rbf_features = gaussian_rbf(Xmc[np.random.choice(range(len(Xmc)), num_basis, replace=False),:])
Phimc = np.array([rbf_features(x) for x in Xmc])
lr_rbf = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_rbf.fit(Phimc,Ymc)
Fitting the logistic regression model again, we get a decision surface that is non-linear in the original space but linear in the higher-dimensional transformed feature space.
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
label = lr_rbf.predict(coords)
label_contour = go.Contour(
name="Label ContourSurace",
x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]),
filename="lr-20")
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
prb = lr_rbf.predict_proba(coords)
from plotly import tools
fig = tools.make_subplots(rows=1, cols=3,
specs=[[{'is_3d': True}, {'is_3d': True}, {'is_3d': True}]])
for i in range(3):
    surf = go.Surface(name="Class " + str(i+1),
                      x=u, y=v, z=np.reshape(prb[:,i], (len(uvalues), len(vvalues))),
                      showscale=False)
    fig.append_trace(surf, 1, i+1)
py.iplot(fig)