import numpy as np
import pandas as pd
# Plotly plotting support
import plotly.plotly as py
# import plotly.offline as py
# py.init_notebook_mode()
# import cufflinks as cf
# cf.go_offline() # required to use plotly offline (no account required).
import plotly.graph_objs as go
import plotly.figure_factory as ff
# Make the notebook deterministic
np.random.seed(42)
This is the notebook accompanies the lecture on Logistic Regression.
Notebook created by Joseph E. Gonzalez for DS100.
For this lecture we will use the famous Wisconsin Breast Cancer Dataset which we can obtain from scikit learn.
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# Target data_dict['target'] = 0 is malignant 1 is benign
data['malignant'] = (data_dict['target'] == 0)
data.columns
points = go.Scatter(x=data['mean radius'], y = 1.*data['malignant'], mode="markers")
py.iplot([points], filename="lr-01")
This is a clear example of over-plotting. We can improve the above plot by jittering the data:
jitter_y = data['malignant'] + 0.1*np.random.rand(len(data['malignant'])) -0.05
points = go.Scatter(x=data['mean radius'], y = jitter_y, mode="markers", marker=dict(opacity=0.5))
py.iplot([points], filename="lr-02")
We can also get a picture of the data distribution:
py.iplot(ff.create_distplot([
data[data['malignant']]['mean radius'],
data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'], bin_size=0.5), filename="lr-03")
Question: Looking at the above histograms could you describe a rule to predict whether or a cell is malignant?
"I suppose it is tempting, if the only tool you have is a hammer, to treat everything as if it were a nail." - Abraham Maslow The Psychology of Science
Goal: We would like to predict whether the tumor is malignant from the size of the tumor.
Always split your data into training and test groups.
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.25, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
We will define $X$ and $Y$ as variables containing the training features and labels.
X = np.array([data_tr['mean radius']]).T
Y = data_tr['malignant'].values.astype('float')
Fit a least squares regression model.
import sklearn.linear_model as linear_model
least_squares_model = linear_model.LinearRegression()
least_squares_model.fit(X,Y)
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
points = go.Scatter(name="Jittered Data",
x=np.squeeze(X), y = jitter_y,
mode="markers", marker=dict(opacity=0.5))
X_plt = np.linspace(np.min(X), np.max(X), 10)
model_line = go.Scatter(name="Least Squares",
x=X_plt, y=least_squares_model.predict(np.array([X_plt]).T),
mode="lines", line=dict(color="orange"))
py.iplot([points, model_line], filename="lr-04")
from sklearn.metrics import mean_squared_error as mse
print("Training RMSE:", np.sqrt(mse(Y, least_squares_model.predict(X))))
Maybe we should also look at the cross validated error.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
rmse_scores = np.sqrt(
cross_val_score(linear_model.LinearRegression(), X, Y,
scoring=make_scorer(mse), cv=5))
print("Min Validation Error: ", np.min(rmse_scores))
print("Median Validation Error:", np.median(rmse_scores))
print("Max Validation Error: ", np.max(rmse_scores))
This is a classification problem so we probably want to measure how often we predict the correct value. This is sometimes called the zero-one loss (or error):
$$ \large \textbf{ZeroOneLoss} = \frac{1}{n} \sum_{i=1}^n \textbf{I}\left[ y_i == f_\theta(x) \right] $$However to use the classification error we need to define a decision rule that maps $f_\theta(x)$ to the $\{0,1\}$ classification values.
Suppose we instituted the following simple decision rule:
This simple decision rule is deciding that a tumor is malignant if our model predicts a values above 0.5 (closer to 1 than zero).
In the following we plot the implication of these decisions on our training data.
jitter_y = Y + 0.1*np.random.rand(len(Y)) - 0.05
ind_mal = least_squares_model.predict(X) > 0.5
mal_points = go.Scatter(name="Classified as Malignant",
x=np.squeeze(X[ind_mal]), y = jitter_y[ind_mal],
mode="markers", marker=dict(opacity=0.5, color="red"))
ben_points = go.Scatter(name="Classified as Benign",
x=np.squeeze(X[~ind_mal]), y = jitter_y[~ind_mal],
mode="markers", marker=dict(opacity=0.5, color="blue"))
dec_boundary = (0.5 - least_squares_model.intercept_)/least_squares_model.coef_[0]
dec_line = go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[-0.5,1.5], mode="lines",
line=dict(color="black", dash="dot"))
py.iplot([mal_points, ben_points, model_line,dec_line], filename="lr-05")
ZeroOneLoss
¶from sklearn.metrics import zero_one_loss
print("Training Fraction incorrect:",
zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
Questions
This is the simplest baseline we could imagine and one you should always compare against. Let's start by asking what is the majority class
print("Fraction of Malignant Samples:", np.mean(Y))
Therefore if we guess the majority class benign we would get what accuracy?
# You can figure this out from the above number
# print("Guess Majority:", zero_one_loss(Y, np.zeros(len(Y))))
This is standard example of a common problem in classification (and perhaps modern society): class imbalance.
Class imbalance is when a disproportionate fraction of the samples are in one class (in this case benign). In extreme cases (e.g., fraud detection) only tiny fraction of the training data may contain examples in particular class. In these settings we can achieve very high-accuracies by always predicting the frequent class without learning a good classifier for the rare classes.
There are many techniques for managing class imbalance here are a few:
In this example the class imbalance is not that extreme so we will continue without re-sampling.
from sklearn.model_selection import KFold
kfold = KFold(3,shuffle=True, random_state=42)
linreg_errors = []
models = []
for tr_ind, te_ind in kfold.split(X):
model = linear_model.LinearRegression()
model.fit(X[tr_ind,], Y[tr_ind])
models.append(model)
linreg_errors.append(zero_one_loss(Y[te_ind], model.predict(X[te_ind,]) > 0.5))
print("Min Validation Error: ", np.min(linreg_errors))
print("Median Validation Error:", np.median(linreg_errors))
print("Max Validation Error: ", np.max(linreg_errors))
Not very robust ...
# dec_lines = [
# go.Scatter(name="Decision Boundary",
# x = [(0.5 - m.intercept_)/m.coef_[0]]*2,
# y=[-0.5,1.5], mode="lines",
# line=dict(dash="dot"))
# for m in models]
# X_plt = np.linspace(np.min(X), np.max(X), 10)
# model_lines = [
# go.Scatter(name="Least Squares " + str(zero_one_loss(Y, m.predict(X) > 0.5)),
# x=X_plt, y=m.predict(np.array([X_plt]).T),
# mode="lines")
# for m in models]
# py.iplot([points] + model_lines + dec_lines)
Not really. Probabilities are constrained between 0 and 1. How could we learn a model that captures this probabilistic interpretation?
Maybe we can define the probability as:
$$ \large p_i = \min\left(\max \left( x^T \theta , 0 \right), 1\right) $$this would look like:
def bound01(z):
u = np.where(z > 1, 1, z)
return np.where(u < 0, 0, u)
X_plt = np.linspace(np.min(X), np.max(X), 100)
p_line = go.Scatter(name="Truncated Least Squares",
x=X_plt, y=bound01(least_squares_model.predict(np.array([X_plt]).T)),
mode="lines", line=dict(color="green", width=8))
py.iplot([mal_points, ben_points, model_line, p_line, dec_line], filename="lr-06")
So far least squares regression seems pretty reasonable and we can "force" the model values to be bounded between 0 and 1 (a must for probabilities).
Can we interpret the truncated values as probabilities perhaps but it would depend on how the model is estimated (more on this soon).
It seems like large tumor sizes are indicative of malignant tumors. Suppose we measured a very large tumor at 100mm that is also malignant. What would this do to our model?
Let's add an extra data point and see what happens:
X_ex = np.vstack([X, [100]])
Y_ex = np.hstack([Y, 1.])
least_squares_model_ex = linear_model.LinearRegression()
least_squares_model_ex.fit(X_ex, Y_ex)
X_plt = np.linspace(np.min(X), np.max(X), 100)
extreme_point = go.Scatter(
name="Extreme Point", x=[100], y=[1], mode="markers",
marker=dict(color="black", size=10))
model_line.line.color = "gray"
model_line_ex = go.Scatter(name="New Least Squares",
x=X_plt, y=least_squares_model_ex.predict(np.array([X_plt]).T),
mode="lines", line=dict(color="orange"))
dec_line.line.color = "gray"
dec_boundary_ex = (0.5 - least_squares_model_ex.intercept_)/least_squares_model_ex.coef_[0]
dec_line_ex = go.Scatter(
name="Decision Boundary",
x = [dec_boundary_ex, dec_boundary_ex], y=[-0.5,1.5], mode="lines",
line=dict(color="black", dash="dash"))
py.iplot([mal_points, ben_points,model_line, model_line_ex, dec_line, dec_line_ex], filename="lr-07")
print("Before:",
zero_one_loss(Y_ex, least_squares_model.predict(X_ex) > 0.5))
print("After:",
zero_one_loss(Y_ex, least_squares_model_ex.predict(X_ex) > 0.5))
What does the empirical probability of being malignant for different sizes look like?
X_slices = np.linspace(8, 24, 10)
P_emp = np.array([
np.mean(Y[np.squeeze(np.abs(X - c)) < 2.0]) for c in X_slices
])
p_emp_line = go.Scatter(name="Empirical Probabilities",
x=X_slices, y=P_emp,
mode="lines", line=dict(color="green", width=8))
py.iplot([mal_points, ben_points, model_line, p_emp_line, dec_line], filename="lr-07")
The logistic function (sometimes called the sigmoid function) has the above shape and is commonly used to transform a real number into a value that can be used to encode a probability. The logistic function has some interesting mathematical properties but let's start with it's form:
$$\large \sigma(t) = \frac{1}{1 + e^{-t}} $$def sigmoid(t):
return 1. / (1. + np.exp(-t))
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
x=t, y=sigmoid(t), mode="lines")
py.iplot([sigmoid_line], filename="lr-08")
Recall the algorithm consists of iteratively computing:
$$\large \theta^{(t+1)} \leftarrow \theta^{(t)} - \rho(t) \nabla_\theta \left(- \log\mathcal{L}(\theta) \right) $$where $\rho(t)$ (sometimes called the learning rate) is typically:
$$ \large \rho(t) = \frac{1}{t} $$and for logistic regression the gradient of the negative log-likelihood is given by:
$$\large \nabla_\theta \left(- \log\mathcal{L}(\theta) \right) = \large \sum_{i=1}^n \left(\sigma\left(x_i^T \theta\right) - y_i \right) x_i $$See the lecture slides for a derivation.
def gradient_descent(X, Y, theta0, gradient_function, max_iter = 1000000,
epsilon=0.0001):
theta = theta0
for t in range(1, max_iter):
rho = 1./t
grad = gradient_function(theta, X, Y)
theta = theta - rho * grad
# Track Convergence
if np.linalg.norm(grad) < epsilon:
return theta
return theta
The gradient descent algorithm is typically run until one of the following conditions is met:
In the following we implement the gradient function for a dataset X and Y.
def neg_gradL(theta, X, Y):
return -(X.T @ (Y - sigmoid(X @ theta)))
In the following we standardize this dataset for numerical stability. Note that due to the exponents extreme values can be an issue.
from sklearn.preprocessing import StandardScaler
standardizer = StandardScaler()
standardizer.fit(X)
Xst = standardizer.transform(X)
We will also add a constant (bias or offset) term.
Phi = np.hstack([Xst,np.ones([len(Xst),1])])
Run the batch gradient descent solver
theta_batch = gradient_descent(Phi, Y, np.zeros(2), neg_gradL)
theta_batch
In the following we plot the logistic regression probability model and the corresponding decision boundary. To compute the decision boundary recall that from the model definition that $\textbf{P}(Y = 1 | X) = \sigma(x^T \theta)$ and therefore the probability that $\textbf{P}(Y = 1 | X) > 0.5$ occurs when $\sigma(x^T \theta) > 0.5$ and therefore $x^T \theta > \sigma^{-1}(0.5) = 0$. Note however that we standardized the features in $X$ to construct $\Phi$ and therefore after solving for $X$ at the decision boundary we will need to apply the inverse standardization (multiply by the standard deviation and add back the mean).
Phi_plt = np.hstack([
standardizer.transform(np.array([X_plt]).T),
np.ones((len(X_plt),1))])
# The logistic regression decision boundary is when the underlying linear
# model passes through zero
lr_dec_boundary = (0.0 - theta_batch[1])/theta_batch[0]
lr_dec_boundary = standardizer.inverse_transform([lr_dec_boundary])[0]
lr_dec_line = go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[-0.5,1.5], mode="lines",
line=dict(color="red", dash="dot"))
lr_line = go.Scatter(name="SGD Logistic Regression",
x=X_plt, y=sigmoid(Phi_plt @ theta_batch),
mode="lines", line=dict(color="orange", width=4))
py.iplot([points, lr_line, dec_line, lr_dec_line], filename="lr-09")
Adding the decision boundaries to our earlier histograms:
plt = ff.create_distplot([
data[data['malignant']]['mean radius'],
data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
py.iplot(plt, filename="lr-10")
print("Least Squares Training Prediction Error:",
zero_one_loss(Y, least_squares_model.predict(X) > 0.5))
print("Logistic Regression Prediction Error:",
zero_one_loss(Y, sigmoid(Phi @ theta_batch) > 0.5))
Let's examine the loss function surface and our solution. Recall that the loss function is the negative of the log-likelihood (minimize loss = maximize likelihood) and has the following form: $$\large \log \mathcal{L}(\theta) = -\sum_{i=1}^n \left[ y_i \log\left( \sigma\left(x_i^T \theta \right) \right)+ \left(1-y_i \right)\log \left(1 - \sigma\left(x_i^T \theta\right) \right) \right] $$
We can implement this in the following python expression:
def lr_loss(theta, Phi, Y):
p = sigmoid(Phi @ theta)
return -np.sum((Y[:, np.newaxis] * np.log(p) +
(1-Y[:, np.newaxis]) * np.log(1-p)),
axis=0)
By evaluating the loss at a grid of points we can can construct the loss surface:
uvalues = np.linspace(2,8,70)
vvalues = np.linspace(-4,3,70)
(u,v) = np.meshgrid(uvalues, vvalues)
thetas = np.vstack((u.flatten(),v.flatten()))
lr_loss_values = lr_loss(thetas, Phi, Y)
lr_loss_surface = go.Surface(name="Logistic Regression Loss",
x=u, y=v, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
contours=dict(z=dict(show=True, color="gray", project=dict(z=True)))
)
optimal_batch_lr_point = go.Scatter3d(name = "Optimal Point",
x = [theta_batch[0]], y = [theta_batch[1]],
z = lr_loss(np.array([theta_batch]).T, Phi, Y),
marker=dict(size=10, color="red")
)
py.iplot(go.Figure(data=[lr_loss_surface, optimal_batch_lr_point]), filename="lr-11")
In the following we plot the solution path taken by gradient descent.
However, to make the plot easy to visualize (solve in fewer more consistent iterations) I constrained the gradient descent procedure to take step's of at most length one. This prevents the solution from exploding in the first few iterations. For those who are interested in optimization techniques this is a formal of proximal gradient descent.
def batch_gd_thetas(X, Y, theta0, grad, max_iter = 1000000,
epsilon=0.01):
theta = theta0
thetas = [theta0]
n = len(X)
for t in range(1, max_iter):
rho = 1./t
g = grad(theta, X, Y)
# To make the solution (and plot) more stable
# I have constrained the gradient step size to
# the unit ball.
if np.linalg.norm(g) > 1:
g = g / np.linalg.norm(g)
theta = theta - rho * g
thetas.append(theta)
# Track Convergence
if np.linalg.norm(g) < epsilon:
return np.array(thetas)
return np.array(thetas)
all_thetas = batch_gd_thetas(Phi, Y, np.array([3, 2.5]), neg_gradL)
thata_points = go.Scatter(name="Theta Values", x=all_thetas[:,0], y=all_thetas[:,1],
mode="lines+markers")
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(lr_loss_values, (len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_batch[0]], y = [theta_batch[1]],
marker=dict(size=10, color="red")
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-12")
In the following we implement stochastic gradient descent (SGD) to solve the logistic regression problem. Afterwards we use much more efficient and robust libraries (and you should do the same). However we implement SGD here for pedagogical value.
def sgd(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001):
theta = theta0
n = len(X)
for t in range(1, max_iter):
rho = 1./t
i = t % n
g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
if np.linalg.norm(g) > 1:
g = g / np.linalg.norm(g)
theta = theta - rho * g
# Track Convergence
if (t % 10000) == 0:
if np.linalg.norm(grad(theta, X, Y)) < epsilon:
return theta
return theta
Computing the theta values:
theta_sgd = sgd(Phi, Y, np.array([2.5, 2.5]), neg_gradL)
theta_sgd
Plotting the solution path
def sgd_thetas(X,Y, theta0, grad, max_iter = 5000, batch_size=50, epsilon=0.0001):
theta = theta0
n = len(X)
thetas = [theta]
for t in range(1, max_iter):
rho = 1/t
i = t % n
g = n/batch_size * grad(theta, X[i:(i+batch_size),], Y[i:(i+batch_size)])
if np.linalg.norm(g) > 1:
g = g / np.linalg.norm(g)
theta = theta - rho * g
thetas.append(theta)
return np.array(thetas)
all_thetas = sgd_thetas(Phi, Y, np.array([2.5, 2.5]), neg_gradL)
thata_points = go.Scatter(name="Theta Values", x=all_thetas[:,0], y=all_thetas[:,1],
mode="lines+markers", showlegend=False
)
lr_loss_contours = go.Contour(name="Logistic Regression Loss",
x=uvalues, y=vvalues, z=np.reshape(lr_loss_values,(len(uvalues), len(vvalues))),
colorscale='Viridis', reversescale=True,
)
optimal_batch_lr_point = go.Scatter(name = "Optimal Point",
x = [theta_sgd[0]], y = [theta_sgd[1]],
marker=dict(size=10, color="red"), showlegend=False
)
py.iplot(go.Figure(data=[lr_loss_contours, thata_points, optimal_batch_lr_point]), filename="lr-13")
plt = ff.create_distplot([
data[data['malignant']]['mean radius'],
data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
lr_sgd_dec_boundary = (0.0 - theta_sgd[1])/theta_sgd[0]
lr_sgd_dec_boundary = standardizer.inverse_transform([lr_sgd_dec_boundary])[0]
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary",
x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="blue", dash="dot")))
py.iplot(plt, filename="lr-14")
The built in Scikit learn logistic regression models use $L^2$ regularization by default (a good idea).
Xtoy = np.hstack([-np.random.rand(20) - 0.5, np.random.rand(20) + 0.5])
Ytoy = np.hstack([np.zeros(20), np.ones(20)])
toy_theta = gradient_descent(np.hstack([np.array([Xtoy]).T, np.ones([len(Xtoy),1])]), Ytoy,
np.zeros(2), neg_gradL,
epsilon=0.00001)
toy_theta
What happens if the data are separable?
X_toy_plt = np.hstack([
np.array([np.linspace(-2,2,100)]).T,
np.ones((100,1))])
py.iplot([
go.Scatter(name = "Zeros", x=Xtoy[0:20], y=Ytoy[0:20] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="blue")),
go.Scatter(name ="Ones", x=Xtoy[20:], y=Ytoy[20:] + 0.01 * np.random.randn(20),
mode="markers", marker=dict(opacity=0.5, color="red")),
go.Scatter(name="SGD Logistic Regression",
x=np.linspace(-2,2,100), y=sigmoid(X_toy_plt @ toy_theta),
mode="lines", line=dict(color="green", width=4))
], filename="lr-15")
We will use the built in cross-validation support for logistic regression in scikit-learn.
lr = linear_model.LogisticRegressionCV(100)
lr.fit(X,Y)
lr_line = go.Scatter(name="Scikit-LR",
x=X_plt, y=lr.predict_proba(np.array([X_plt]).T).T[1],
mode="lines", line=dict(color="green", width=4))
py.iplot([points, lr_line], filename="lr-16")
plt = ff.create_distplot([
data[data['malignant']]['mean radius'],
data[~data['malignant']]['mean radius']
], group_labels=['malignant', 'benign'], bin_size=0.5)
plt.data.append(go.Scatter(name="Logistic Reg. Decision Boundary",
x = [lr_dec_boundary,lr_dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="red", dash="dot")))
plt.data.append(go.Scatter(name="Least Squares Decision Boundary",
x = [dec_boundary,dec_boundary], y=[0,0.25], mode="lines",
line=dict(color="black", dash="dot")))
plt.data.append(go.Scatter(name="SGD Logistic Reg. Dec. Boundary",
x = [lr_sgd_dec_boundary,lr_sgd_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="blue", dash="dot")))
lr_sk_dec_boundary = (0.0 - lr.intercept_)/lr.coef_[0]
plt.data.append(go.Scatter(name="SK Logistic Reg. Dec. Boundary",
x = [lr_sk_dec_boundary,lr_sk_dec_boundary],
y=[0,0.25], mode="lines",
line=dict(color="pink", dash="dot")))
py.iplot(plt, filename="lr-17")
Comparing the prediction accuracy with least squares we see a bit of an improvement.
lr_errors = cross_val_score(linear_model.LogisticRegression(), X, Y,
scoring=make_scorer(zero_one_loss), cv=5)
print("LeastSquares Error:", np.mean(linreg_errors))
print("LR Error:", np.mean(lr_errors))
We can also examine the extreme point example from before:
lr_extreme = linear_model.LogisticRegression()
lr_extreme.fit(X_ex, Y_ex)
print("Least Squares Extreme Point:",
zero_one_loss(Y_ex, least_squares_model_ex.predict(X_ex) > 0.5))
print("LR Extreme Point:",
zero_one_loss(Y_ex, lr_extreme.predict(X_ex) > 0.5))
We temporarily return to a simple synthetic dataset
np.random.seed(42)
nk = 50
Xmc = np.vstack([
np.random.randn(nk,2) + np.array([3,1]),
np.random.randn(nk,2) + np.array([1,3]),
0.5 * np.random.randn(nk,2) ,
0.5 * np.random.randn(nk,2) + np.array([3,3])
])
Ymc = np.hstack([np.zeros(nk), np.ones(nk), 2*np.ones(nk),2*np.ones(nk)])
blue_points = go.Scatter(name = "A", x=Xmc[Ymc==0,0], y=Xmc[Ymc==0,1],
mode="markers", marker=dict(opacity=0.5, color="blue"))
red_points = go.Scatter(name = "B", x=Xmc[Ymc==1,0], y=Xmc[Ymc==1,1],
mode="markers", marker=dict(opacity=0.5, color="red"))
orange_points = go.Scatter(name = "C", x=Xmc[Ymc==2,0], y=Xmc[Ymc==2,1],
mode="markers", marker=dict(opacity=0.5, color="orange"))
py.iplot([blue_points, red_points, orange_points], filename="lr-18")
Here we train a soft-max classifier. Currently scikit learn requires that we use an LBFGS solver for soft-max. L-BFGS is a quasi second order method that uses the both the gradient and a diagonal approximation of the second derivative (the Hessian) to solve for the optimal parameters.
lr = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr.fit(Xmc,Ymc)
In the following we plot the corresponding decision surface generated by the basic multiclass logistic regression model. Notice that the limitations of the linear decision surface.
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.vstack((u.flatten(),v.flatten())).T
label = lr.predict(coords)
label_contour = go.Contour(
name="Label ContourSurace",
x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]),
filename="lr-19")
To capture the non-linear structure we randomly sample 20 data points and create an RBF feature centered at that point.
def gaussian_rbf(us, lam=1):
return lambda x: np.array([np.exp(-np.linalg.norm(x - u)**2 / lam**2) for u in us])
num_basis = 20
np.random.seed(42)
rbf_features = gaussian_rbf(Xmc[np.random.choice(range(len(Xmc)), num_basis, replace=False),:])
Phimc = np.array([rbf_features(x) for x in Xmc])
lr_rbf = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_rbf.fit(Phimc,Ymc)
Again fitting the logistic regression model we get a non-linear decision surface in our original space but a linear decision surface in a higher dimensional transformed space.
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
label = lr_rbf.predict(coords)
label_contour = go.Contour(
name="Label ContourSurace",
x = uvalues, y=vvalues, z=np.reshape(label,(len(uvalues), len(vvalues))),
colorscale=[[0.0, 'rgb(100,100,100)'], [0.5, 'rgb(150,150,150)'], [1.0, 'rgb(200,200,200)']]
)
py.iplot(go.Figure(data=[label_contour, blue_points, red_points, orange_points]),
filename="lr-20")
uvalues = np.linspace(-4,7,70)
vvalues = np.linspace(-3,7,70)
(u,v) = np.meshgrid(uvalues, vvalues)
coords = np.array([rbf_features(x) for x in np.vstack((u.flatten(),v.flatten())).T])
prb = lr_rbf.predict_proba(coords)
surfaces = [
go.Surface(name=str(i),
x=u, y=v, z=np.reshape(prb[:,i],(len(uvalues), len(vvalues))),
showscale = False
)
for i in range(3)
]
py.iplot(go.Figure(data=surfaces), filename="lr-21")