In the previous notebook we discussed why least-squares linear regression is not well suited to modeling classification tasks. In this notebook we introduce logistic regression for modeling classification tasks.
from IPython.display import YouTubeVideo
YouTubeVideo("TIXc813Ql-4")
As with other notebooks we will use the same set of standard imports.
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
We will continue to use the Wisconsin Breast Cancer Dataset.
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# In data_dict['target'], 0 is malignant and 1 is benign
data['malignant'] = (data_dict['target'] == 0)
data
Always split your data into training and test groups.
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.10, random_state=42)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Creating the X and Y matrices for the training data:
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].astype(float).to_numpy()
Visualizing the data
points = go.Scatter(x=X.flatten(), y = Y,
mode="markers",
marker=dict(opacity=0.5))
layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
go.Figure(data=[points], layout=layout)
In the previous lecture we implemented a jitter function to jitter the data and make it easier to visualize.
def jitter(data, amt=0.1):
return data + amt * np.random.rand(len(data)) - amt/2.0
points = go.Scatter(x=X.flatten(), y = jitter(Y),
mode="markers",
marker=dict(opacity=0.5))
layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
go.Figure(data=[points], layout=layout)
This can be a little misleading, so let's try a different visualization for this lecture:
points = go.Scatter(name="Training Data", x=X.flatten(), y = Y,
mode="markers",
marker=dict(symbol="line-ns", size=5, line=dict(width=1, color="darkblue"))
)
layout = dict(xaxis=dict(title="Mean Radius"),yaxis=dict(title="Malignant"))
go.Figure(data=[points], layout=layout)
lin_reg = LinearRegression()
lin_reg.fit(X,Y)
Plotting the predictions:
X_plt = np.expand_dims(np.linspace(X.min(), X.max(), 100),1)
model_line = go.Scatter(name="Least Squares",
x=X_plt.flatten(), y=lin_reg.predict(X_plt),
mode="lines", line=dict(color="orange"))
go.Figure([points, model_line], layout=layout)
Recall that in the previous notebook we noted several problems with using least-squares linear regression for classification.
In this notebook we address those issues.
A natural starting place for modeling a categorical variable (e.g., whether a tumor is benign or malignant) is to model the probability that it is malignant. We could start with the simplest such model: predict a constant probability.
YouTubeVideo("WJnz-ELJ5e4")
pr_malignant = np.mean(Y)
print("Proability of being malignant:", pr_malignant)
def constant_pr_model(X):
return pr_malignant * np.ones(len(X))
Thus for any radius we would just return the same probability.
constant_prob_line = go.Scatter(name="Constant Probability",
x=X_plt.flatten(), y=constant_pr_model(X_plt),
mode="lines", line=dict(color="orange"))
go.Figure([points, constant_prob_line], layout=layout)
The above constant model doesn't depend on the mean radius. We could improve upon this model by computing a separate constant for different bins of the mean radius. The following block of code divides the x-axis (mean radius) into 14 regions and computes the proportion of malignant tumors in each region.
X_splits = np.linspace(6.5, 28.5, 15)
pr_mal_split = np.zeros(len(X_splits) - 1)
for i in range(len(X_splits) - 1):
    pr_mal_split[i] = np.mean(Y[((X > X_splits[i]) & (X <= X_splits[i+1])).flatten()])
fig = go.Figure([points], layout=layout)
for i in range(len(X_splits)):
fig.add_shape(type="line", x0=X_splits[i], x1=X_splits[i], y0=-.2, y1=1.2, line=dict(color="LightSeaGreen",
width=1,
dash="dashdot",
))
splits_plot = go.Scatter(name = "Prop. Malignant Split",
x=np.vstack([X_splits[:-1], X_splits[1:]]).T.flatten(),
y=np.vstack([pr_mal_split, pr_mal_split]).T.flatten(),
line=dict(color="orange", width=3))
fig.add_trace(splits_plot)
fig
This is actually a pretty reasonable model, but if we had higher-dimensional features, dividing the space into bins would get exponentially more expensive (see the quick calculation below). In addition, many (if not most) of the bins would contain no points, making it difficult to estimate the proportion of malignant tumors in those bins.
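A rough back-of-the-envelope calculation (our addition; 10 bins per feature is an arbitrary choice) illustrates how quickly the number of bins grows relative to the amount of training data:
# Hypothetical illustration: with 10 bins per feature, the number of bins grows as 10**d.
for d in [1, 2, 5, 30]:
    print(f"{d} feature(s): {10**d:.2e} bins for {len(data_tr)} training points")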
Rather than dividing the space into bins we could instead consider the proportion of tumors that are malignant and have "similar" features (e.g., mean radius) to the tumor for which we would like to make a prediction:
from numpy.linalg import norm
import heapq
def knearest_neighbors(x, X_tr, Y_tr, k=5):
# Compute the distance
dist = norm(x - X_tr, axis=1)
# Predict the average Y value of the k closest data points to x
return np.mean(Y_tr[heapq.nsmallest(k, range(len(dist)), dist.__getitem__)])
fig.add_trace(
go.Scatter(name = "K Nearest Neighbors",
x=X_plt.flatten(),
y=[knearest_neighbors(x, X, Y, 91) for x in X_plt.flatten()],
line=dict(color="red", width=3))
)
fig
Logistic regression is probably one of the most widely used basic models for classification and is a simple extension of linear models to the classification problem. In the remainder of this notebook we walk through the logistic function and how to fit logistic regression models using scikit-learn.
YouTubeVideo("U4TeibU_Q60")
The logistic regression model predicts the probability that the binary $Y$ variable (e.g., whether the tumor is malignant) takes the value 1 given the features $x$ (e.g., the mean radius of the tumor):
$$ \hat{P}\left(Y=1 \,|\, x\right) = \frac{1}{1 + \exp\left(-\sum_{k=0}^d \theta_k x_k\right)} $$
Just as with our linear model, the above function is parameterized by the vector $\theta$ and is a function of our basic linear model $\sum_{k=0}^d \theta_k x_k$. However, this is no longer a linear model but instead often called a generalized linear model.
The logistic regression model derives its name from the logistic function:
$$ \sigma(t) = \frac{1}{1 + \exp(-t)} $$
The logistic function is also often called the sigmoid function and denoted $\sigma(t)$. Sadly, the Greek letter $\sigma$ is widely used to mean many different things (e.g., standard deviation, the logistic function, permutations), so you will have to infer its meaning from context.
We can therefore rewrite the logistic regression model in the form:
$$ \hat{P}\left(Y=1 \,|\, x\right) = \sigma\left(\sum_{k=0}^d \theta_k x_k\right) $$
where the logistic function maps the output of the linear model (which could be any real number) to the interval between 0 and 1 and is commonly used when modeling probabilities.
We can plot the logistic function to see its characteristic s-shape (sigmoid shape):
def logistic(t):
return 1. / (1. + np.exp(-t))
t = np.linspace(-5,5,50)
sigmoid_line = go.Scatter(name="Logistic Function",
x=t, y=logistic(t), mode="lines")
go.Figure([sigmoid_line])
To get an intuition for the behavior of the parameters in the logistic regression model, let's consider a simple one-dimensional logistic regression model of the form:
$$ \hat{P}\left(Y=1 \,|\, x\right) = \sigma\left(\theta_0 + \theta_1 x\right) $$
The following plot shows the effect of varying $\theta_0$ and $\theta_1$:
fig = go.Figure()
for theta1 in [-1,1, 5]:
for theta0 in [-2, 0, 2]:
fig.add_trace(go.Scatter(name=f"{theta0} + {theta1} x", x=t, y=logistic(theta0 + theta1*t)))
fig
Because the logistic regression model predicts a probability, we typically use the negative log-likelihood as the loss function. This is also called the cross entropy loss and is written as:
$$ L(\theta) = -\sum_{i=1}^n \left[ y_i \log \hat{P}\left(Y=1 \,|\, x_i\right) + \left(1 - y_i\right) \log\left(1 - \hat{P}\left(Y=1 \,|\, x_i\right)\right) \right] $$
Unlike the squared loss, there is no closed-form solution for the parameters that minimize this loss, so iterative methods like gradient descent are typically used to fit the model to the data.
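To make the iterative idea concrete, here is a minimal batch gradient descent sketch for the one-feature model (our addition; the function name, step size, and number of steps are arbitrary choices, and the PyTorch and scikit-learn fits below are what we actually use):
# Sketch: batch gradient descent on the average cross entropy loss.
def gradient_descent_logistic(X, Y, lr=0.05, nsteps=20000):
    Xb = np.hstack([np.ones((len(X), 1)), X])   # add an intercept column
    theta = np.zeros(Xb.shape[1])
    for _ in range(nsteps):
        p_hat = logistic(Xb @ theta)            # predicted probabilities
        grad = Xb.T @ (p_hat - Y) / len(Y)      # gradient of the average cross entropy loss
        theta -= lr * grad
    return theta
# Note: without feature standardization this converges slowly, which is one reason
# specialized solvers (LBFGS, Adam) are preferred in practice.
# gradient_descent_logistic(X, Y)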
In general, you should use scikit-learn or other frameworks that have specialized implementations for logistic regression model fitting. This is because you can often use more advanced iterative algorithms to fit the logistic regression model. However, to demonstrate how these iterative methods work, we can implement logistic regression using PyTorch and solve for the optimal parameters using stochastic gradient descent:
YouTubeVideo("thEZGXIfJqc")
Defining the Logistic Model
import torch
from torch import nn
class LogisticModel(nn.Module):
def __init__(self, w=None):
super().__init__()
# Creating a nn.Parameter object allows torch to track parameters for us
if w is not None:
self.w = nn.Parameter(torch.from_numpy(w))
else:
self.w = nn.Parameter(torch.zeros(2,1))
def forward(self, x):
w = self.w
return 1/(1 + torch.exp(-(w[0] + w[1] * x)))
torch_lr_model = LogisticModel(np.array([0.,1.]))
torch_lr_model.forward(3)
Defining the Cross Entropy Loss
def torch_cross_entropy_loss(P_hat, Y):
return -torch.sum(Y * torch.log(P_hat) + (1-Y) * torch.log(1 - P_hat))
Converting Data to PyTorch Tensors
from torch.utils.data import TensorDataset, DataLoader
tensor_data = TensorDataset(torch.from_numpy(X.flatten()),
torch.from_numpy(Y))
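As a quick optional sanity check (our addition), we can evaluate the loss of the model with its initial weights on the full training data before running the optimizer:
with torch.no_grad():
    # Loss of the untrained model (w = [0, 1]) on the full training set
    p_hat_init = torch_lr_model(torch.from_numpy(X.flatten()))
    print("Initial loss:", torch_cross_entropy_loss(p_hat_init, torch.from_numpy(Y)).item())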
Implementing Gradient Descent
from torch.optim import Adam, SGD
def adam_sgd(model, loss_fn, dataset, lr=.1, nepochs=100, batch_size=10):
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
opt = Adam(model.parameters(), lr=lr)
for i in range(nepochs):
for (x, y) in loader:
loss = loss_fn(model(x), y)
loss.backward()
opt.step()
model.zero_grad()
Run the Optimizer on the Model
torch_lr_model = LogisticModel(np.array([0.,1.]))
adam_sgd(torch_lr_model, torch_cross_entropy_loss, tensor_data, lr=0.1)
torch_lr_model.w
Plot Predictions
with torch.no_grad():
torch_p_hats = torch_lr_model.forward(torch.from_numpy(X_plt.flatten())).numpy()
fig = go.Figure([points], layout=layout)
pytorch_lr_line = go.Scatter(name = "Pytorch Logistic Regression",
x=X_plt.flatten(), y=torch_p_hats,
line=dict(color="cyan", width=3))
fig.add_trace(pytorch_lr_line)
fig
Just as with linear regression, L1 and L2 regularization can be, and often are, applied to the logistic regression loss. Later we will see why L2 regularization is almost always used with logistic regression.
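As a sketch of how this looks in scikit-learn (the estimator names lr_l2 and lr_l1 are ours): the penalty argument selects the regularization type and C is the inverse regularization strength, so smaller C means stronger regularization.
# Sketch: L2- and L1-regularized logistic regression in scikit-learn.
lr_l2 = LogisticRegression(penalty="l2", C=0.1, solver="lbfgs").fit(X, Y)
lr_l1 = LogisticRegression(penalty="l1", C=0.1, solver="liblinear").fit(X, Y)
print("L2 coef:", lr_l2.coef_, "  L1 coef:", lr_l1.coef_)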
Logistic regression models in scikit-learn follow the same API as all other models and therefore you already know how to use them. First we import the logistic regression model. Notice that we are importing it from the linear models module. Logistic regression isn't technically a linear model but instead a generalized linear model (a linear model with a non-linear transformation).
YouTubeVideo("tQL4rCD6PFU")
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(solver="lbfgs")
lr_model.fit(X, Y)
lr_model.coef_
lr_model.intercept_
When making a prediction, the predict function returns the most likely class, while predict_proba returns the predicted probability of each class.
lr_model.predict(np.array([[20]]))
lr_model.predict_proba(np.array([[12]]))
Plotting the Logistic Regression Predictions
fig = go.Figure([points, pytorch_lr_line], layout=layout)
p_hats = lr_model.predict_proba(X_plt)
lr_line = go.Scatter(name = "Sklearn Logistic Regression",
x=X_plt.flatten(), y=p_hats[:,1],
line=dict(color="orange", width=3))
fig.add_trace(lr_line)
fig
Notice the curves are slightly different. There are a few reasons. First, the PyTorch optimization above is not as well tuned as the LBFGS solver that sklearn is using. Second, and perhaps more importantly, sklearn applies a small amount of L2 regularization by default.
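To see that regularization accounts for part of the gap, we can (as a rough check, our addition) weaken sklearn's default L2 penalty by making C very large and refit; lr_weak_reg is a name we introduce here:
# Larger C means weaker L2 regularization (C is the inverse regularization strength).
lr_weak_reg = LogisticRegression(solver="lbfgs", C=1e8, max_iter=10000)
lr_weak_reg.fit(X, Y)
print("Default C=1.0:", lr_model.intercept_, lr_model.coef_)
print("C=1e8:        ", lr_weak_reg.intercept_, lr_weak_reg.coef_)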
#help(lr_model)
Suppose we had the following toy data:
toy_X = np.array([[-1.0, -0.2, 0.2, 1.0]])
toy_Y = np.array([0.0, 0.0, 1.0, 1.0])
toy_points = go.Scatter(name="Toy Data", x=toy_X.flatten(), y=toy_Y, mode='markers',
marker=dict(size=10))
go.Figure([toy_points])
Let's consider a simplified logistic regression model of the form:
$$ \hat{P}\left(Y=1 \,|\, x\right) =\sigma\left(\theta_1 x\right) = \frac{1}{1+\exp\left(- \theta_1 x\right)} $$
def toy_model(theta1, x):
    return 1/(1 + np.exp(-theta1 * x))
def cross_entropy_loss(theta1, x, y):
# Here we use 1 - sigma(t) = sigma(-t) to improve numerical stability
return - np.sum(y * np.log(toy_model(theta1, x)) + (1-y) * np.log(toy_model(theta1, -x)))
If we try a range of values for $\theta_1$, we notice that we can keep making $\theta_1$ larger and keep reducing the loss:
theta1s = np.array([1, 2, 5, 10, 15, 20, 25, 50, 100])
loss_curve = go.Scatter(x=theta1s, y=[cross_entropy_loss(t, toy_X, toy_Y) for t in theta1s])
go.Figure([loss_curve], layout=dict(yaxis=dict(title="Cross Entropy Loss", type="log"),
xaxis=dict(title=r"$$\theta_1$$")))
This corresponds to the probabilities moving closer to 0 and 1:
toy_xplt = np.linspace(-1.2,1.2,100)
fig = go.Figure([toy_points])
for t in theta1s:
fig.add_trace(go.Scatter(name=f"theta1={t}", x=toy_xplt, y=toy_model(t, toy_xplt)))
fig
In the limit the sigmoid curve will become a step function. If we introduce a negligible amount of L2 regularization we can avoid this behavior.
def cross_entropy_loss_with_reg(theta1, x, y):
# Here we use 1 - sigma(t) = sigma(-t) to improve numerical stability
return - np.sum(y * np.log(toy_model(theta1, x)) + (1-y) * np.log(toy_model(theta1, -x))) + 1.0e-5 * theta1**2
theta1s = np.array([1, 2, 5, 10, 15, 20, 25, 50, 100])
loss_curve = go.Scatter(x=theta1s, y=[cross_entropy_loss_with_reg(t, toy_X, toy_Y) for t in theta1s])
go.Figure([loss_curve], layout=dict(yaxis=dict(title="Cross Entropy Loss", type="log"),
xaxis=dict(title=r"$$\theta_1$$")))
In the last part of this notebook we walk through how we use the predicted probabilities to make a decision.
YouTubeVideo("f6hLrFChkXU")
How do we use the predicted probability to decide whether to classify a tumor as benign or malignant? The sklearn logistic regression model's predict function implements a basic decision rule:
Predict 1 iff $\hat{P}\left(Y=1 \,|\, x\right) > 0.5$ otherwise predict 0.
np.all(lr_model.predict(X) == np.where(lr_model.predict_proba(X)[:,1] > 0.5, 1.0, 0.0))
We could generalize this decision rule:
Predict 1 iff $\hat{P}\left(Y=1 \,|\, x\right) > \tau$ otherwise predict 0.
for any choice of $\tau$. Is $\tau = 0.5$ the best threshold? It depends on our goals. Suppose we wanted to maximize accuracy. The accuracy of our classifier is defined as the fraction of correct predictions. We can compute the accuracy for different thresholds:
def threshold_predict(model, X, threshold):
    return np.where(model.predict_proba(X)[:, 1] > threshold, 1.0, 0.0)
def accuracy(threshold, X, Y):
return np.mean(threshold_predict(lr_model, X, threshold) == Y)
thresholds = np.linspace(0, 1, 100)
accs = [accuracy(t, X, Y) for t in thresholds]
fig = px.line(x=thresholds, y=accs)
fig.update_xaxes(title="threshold")
fig.update_yaxes(title="Accuracy")
In practice we should use cross validation:
def make_scorer(threshold, metric):
return lambda model, x, y: metric(y, threshold_predict(model, x, threshold))
from sklearn.model_selection import cross_val_score
from sklearn import metrics
cv_accs = [
np.mean(cross_val_score(lr_model, X, Y,
scoring=make_scorer(t, metrics.accuracy_score),
cv=5))
for t in thresholds
]
fig = px.line(x=thresholds, y=cv_accs)
fig.update_xaxes(title="threshold")
fig.update_yaxes(title="Accuracy")
Notice that the threshold with the highest accuracy is not necessarily at 0.5. This can occur for many reasons, but here it is likely due to class imbalance. There are fewer malignant tumors, so we want to be more confident before classifying a tumor as malignant.
The threshold should typically be tuned using cross validation.
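For example, a minimal way (our addition) to pick a threshold from the cross-validated accuracies computed above:
# Select the threshold with the highest cross-validated accuracy
best_threshold = thresholds[np.argmax(cv_accs)]
print("Threshold with highest CV accuracy:", best_threshold)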
A convenient way to visualize the accuracy of a classification model is to look at the confusion matrix. The confusion matrix compares what the model predicts with the actual counts in each class. The two types of error are:
False Positive: the model predicts malignant (1) when the tumor is actually benign (0).
False Negative: the model predicts benign (0) when the tumor is actually malignant (1).
Ideally, we would like to minimize both sources of error, but we may also want to manage the balance between them.
Scikit-learn has a function to compute the confusion matrix:
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(Y, lr_model.predict(X))
mat
fig = ff.create_annotated_heatmap(z=mat,
x=["False", "True"], y=["False", "True"],
showscale=True)
fig.update_layout(font=dict(size=18))
# Add Labels
fig.add_annotation(x=0,y=0, text="True Negative",
yshift=40, showarrow=False, font=dict(color="black",size=24))
fig.add_annotation(x=1,y=0, text="False Positive",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.add_annotation(x=0,y=1, text="False Negative",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.add_annotation(x=1,y=1, text="True Positive",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.update_xaxes(title="Predicted")
fig.update_yaxes(title="Actual", autorange="reversed")
In many settings, there may be a much higher cost to missing positive cases. For example, if we were building a tumor classifier, we would want to make sure that we don't miss any malignant tumors. We might prefer to occasionally classify benign tumors as malignant, since a pathologist could then conduct further study to verify all of the predicted malignant tumors.
YouTubeVideo("gZmOmgQns3c")
In these settings, we may want to trade off the precision and recall of the model.
The following Wikipedia illustration depicts the precision-recall tradeoff.
The precision of a model is defined as:
$$ \textbf{Precision} = \frac{\textbf{True Positives}}{\textbf{True Positives} + \textbf{False Positives}} = \frac{\textbf{True Positives}}{\textbf{Predicted True}} $$
and captures how accurate the model is when it predicts positive. Higher-precision models are more conservative about predicting positive and therefore tend to miss more true positives (a higher false negative rate).
The recall of a model is defined as:
$$ \textbf{Recall} = \frac{\textbf{True Positives}}{\textbf{True Positives} + \textbf{False Negatives}} = \frac{\textbf{True Positives}}{\textbf{Actually True}} $$
and captures the model's ability to predict positive on all the truly positive examples. Higher recall typically comes at the cost of also predicting positive on negative examples (more false positives).
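As a quick check (our addition), both quantities can be computed from the confusion matrix mat above; these correspond to the default 0.5 threshold, and scikit-learn's metrics.precision_score and metrics.recall_score give the same values.
# Precision and recall at the default 0.5 threshold, from the confusion matrix
tp, fp, fn = mat[1, 1], mat[0, 1], mat[1, 0]
print("Precision:", tp / (tp + fp))
print("Recall:   ", tp / (tp + fn))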
A common analysis is to compare the precision and recall at different thresholds. We can implement functions to compute the precision and recall at different thresholds and then consider all the thresholds given by our predictions and plot the precision versus the recall at each threshold.
def predict_at_threshold(prob, threshold):
return np.where(prob >= threshold, 1., 0.)
def precision_at_threshold(Y, prob, threshold):
Y_hat = predict_at_threshold(prob, threshold)
return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y_hat)
def recall_at_threshold(Y, prob, threshold):
Y_hat = predict_at_threshold(prob, threshold)
return np.sum((Y_hat == 1) & (Y == 1)) / np.sum(Y)
def precision_recall_curve(Y, prob):
unique_thresh = np.unique(prob)
precision = [precision_at_threshold(Y, prob, t) for t in unique_thresh]
recall = [recall_at_threshold(Y, prob, t) for t in unique_thresh]
return precision, recall, unique_thresh
precision, recall, threshold = precision_recall_curve(Y, lr_model.predict_proba(X)[:, 1])
fig = px.line(x=recall, y = precision, hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
Naturally, there is a scikit-learn function to compute this tradeoff:
precision, recall, threshold = metrics.precision_recall_curve(Y, lr_model.predict_proba(X)[:, 1])
The precision and recall are computed for all possible probability thresholds:
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
fig = go.Figure()
fig.add_trace(go.Scatter(name="Precision", x=threshold, y=precision[:-1]))
fig.add_trace(go.Scatter(name="Recall", x=threshold, y=recall[:-1]))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")
fig
If we wanted to ensure that 95% of the malignant tumors are classified as malignant we would want to select the following threshold:
mal95_ind = np.argmin(recall >= 0.95)-1
mal95_thresh = threshold[mal95_ind]
mal95_precision = precision[mal95_ind]
mal95_recall = recall[mal95_ind]
print("Threshold:", mal95_thresh)
print("Precision:", mal95_precision)
print("Recall:", mal95_recall)
Notice that we need a fairly low threshold to ensure that we don't miss the malignant tumors. We would then find that roughly 41% (1 - precision) of the tumors we classify as malignant are actually benign. With this threshold, what fraction of tumors in our dataset would need to be processed by a pathologist?
print("Proporition of samples below threshold:",
np.mean(lr_model.predict_proba(X)[:,1] < mal95_thresh))
That means the pathologist would have to verify about 61% of the samples. Using this model, we could reduce the pathology workload by about 39%. However, we would falsely classify about 5% of the malignant tumors as benign, which in practice would likely be unacceptable.
We could try to improve this model by using additional features:
lr_model_full = LogisticRegression(solver='lbfgs', max_iter=10000)
lr_model_full.fit(data_tr.drop(columns='malignant'), data_tr['malignant'].astype(float))
We can construct the precision-recall curve:
prb_mal = lr_model_full.predict_proba(data_tr.drop(columns='malignant'))[:,1]
precision, recall, threshold = metrics.precision_recall_curve(data_tr['malignant'], prb_mal)
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig
Notice that the precision-recall curve is much further to the right. This is a much better model.
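One common single-number summary of a precision-recall curve is the average precision. As a rough comparison (our addition), scikit-learn's metrics.average_precision_score computes it from the predicted probabilities:
# Average precision: area-under-the-PR-curve style summary (higher is better)
print("Average precision (mean radius only):",
      metrics.average_precision_score(Y, lr_model.predict_proba(X)[:, 1]))
print("Average precision (all features):",
      metrics.average_precision_score(data_tr['malignant'], prb_mal))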
What threshold would we need to achieve 99% recall?
mal99_ind = np.argmin(recall >= 0.99) - 1
mal99_thresh = threshold[mal99_ind]
mal99_precision = precision[mal99_ind]
mal99_recall = recall[mal99_ind]
print("Threshold:", mal99_thresh)
print("Precision:", mal99_precision)
print("Recall:", mal99_recall)
Now that we have finished our modeling process we can see how our model performs on the test data. This would give us a better understanding of how it might perform on new data.
prb_mal = lr_model_full.predict_proba(data_te.drop(columns='malignant'))[:,1]
Y_hat = prb_mal >= mal99_thresh
mat = confusion_matrix(data_te['malignant'], Y_hat)
mat
fig = ff.create_annotated_heatmap(z=mat,
x=["False", "True"], y=["False", "True"],
showscale=True)
fig.update_layout(font=dict(size=18))
# Add Labels
fig.add_annotation(x=0,y=0, text="True Negative",
yshift=40, showarrow=False, font=dict(color="black",size=24))
fig.add_annotation(x=1,y=0, text="False Positive",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.add_annotation(x=0,y=1, text="False Negative",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.add_annotation(x=1,y=1, text="True Positive",
yshift=40, showarrow=False, font=dict(color="white",size=24))
fig.update_xaxes(title="Predicted")
fig.update_yaxes(title="Actual", autorange="reversed")