This notebook is a revised version of the outstanding lecture notebook by Professor Hug.
As with other notebooks, we will use the same set of standard imports.
import numpy as np
import pandas as pd
np.random.seed(23)
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
For this notebook we will use the classic Iris dataset. The goal is to predict the species of iris based on measurements of the flower.
iris = datasets.load_iris()
column_names = [n.replace("(cm)", "").strip() for n in iris['feature_names']]
iris_data = pd.DataFrame(iris['data'], columns=column_names)
iris_data['species'] = iris['target_names'][iris['target']]
iris_data['target'] = iris['target']
iris_data
px.scatter(iris_data, x="petal length", y="petal width", color="species")
Notice that there are three classes of flower. This is not a binary classification problem but a multiclass classification problem. There are several simple extensions of the logistic regression model to support multiple classes.
Perhaps the simplest is the one-versus-rest approach, in which the multiclass prediction problem is divided into separate binary prediction problems, one per class.
from sklearn.linear_model import LogisticRegression
lr_setosa = LogisticRegression(solver='lbfgs')
lr_setosa.fit(iris_data[['petal length', 'petal width']],
              iris_data['species'] == 'setosa')
lr_versicolor = LogisticRegression(solver='lbfgs')
lr_versicolor.fit(iris_data[['petal length', 'petal width']],
                  iris_data['species'] == 'versicolor')
lr_virginica = LogisticRegression(solver='lbfgs')
lr_virginica.fit(iris_data[['petal length', 'petal width']],
                 iris_data['species'] == 'virginica');
def predict_class(X):
    most_likely_class = np.argmax(np.vstack([
        lr_setosa.predict_proba(X)[:, 1],
        lr_versicolor.predict_proba(X)[:, 1],
        lr_virginica.predict_proba(X)[:, 1]
    ]), axis=0)
    return iris['target_names'][most_likely_class]
predict_class(iris_data[['petal length', 'petal width']])
How accurate is the model?
Y = iris_data['species']
Y_hat = predict_class(iris_data[['petal length', 'petal width']])
accuracy = np.mean(Y == Y_hat)
print("Prediction Accuracy:", accuracy)
Scikit-learn has a built-in implementation of one-versus-rest.
lr_model = LogisticRegression(multi_class='ovr', solver='lbfgs')
lr_model.fit(iris_data[["petal length", "petal width"]],
             iris_data["species"])
Y_hat = lr_model.predict(iris_data[['petal length', 'petal width']])
accuracy = np.mean(Y == Y_hat)
print("Prediction Accuracy:", accuracy)
We can also visualize the predictions. The following code constructs a plot illustrating the decision we would make for each possible value of petal length and petal width. You don't need to understand the details of the code, but the basic idea is to evaluate the model on a grid (meshgrid) of feature values and color the plot according to the integer class value at each point.
def plot_decision_boundaries(model, X, n=50):
    # Evaluate the model on an n x n grid of feature values.
    u = np.linspace(X[:, 0].min() - 0.5, X[:, 0].max() + 0.5, n)
    v = np.linspace(X[:, 1].min() - 0.5, X[:, 1].max() + 0.5, n)
    us, vs = np.meshgrid(u, v)
    X_test = np.c_[us.ravel(), vs.ravel()]
    # Convert string class predictions to integer codes for coloring.
    z_str = model.predict(X_test)
    categories, z_int = np.unique(z_str, return_inverse=True)
    return go.Contour(x=X_test[:, 0], y=X_test[:, 1], z=z_int,
                      # contours=dict(start=0, end=2, size=1),
                      colorscale=px.colors.qualitative.Plotly[:3],
                      showscale=False)
In the following plot we can see the data points and the predicted class assignment for all combinations of petal width and petal length.
fig = px.scatter(iris_data, x="petal length", y="petal width", color="species")
fig.update_traces(marker=dict(size=12, line=dict(width=2, color='black')),
                  selector=dict(mode='markers'))
fig.add_trace(
    plot_decision_boundaries(lr_model,
                             iris_data[["petal length", "petal width"]].to_numpy())
)
In lecture we introduced decision trees. In this part of the notebook, we walk through the construction of a decision tree using scikit-learn. The scikit-learn documentation on decision trees gives a good overview and is worth skimming.
from sklearn import tree
dt_model = tree.DecisionTreeClassifier()
dt_model.fit(iris_data[["petal length", "petal width"]],
             iris_data["species"])
Notice that there are many hyperparameters that we can configure for decision trees. The ones you may want to pay attention to are max_depth and min_samples_split, both of which control overfitting. max_depth limits how many times the feature space can be divided (deeper trees can overfit), and min_samples_split also limits the depth of the tree by preventing further splits when a node contains too few samples. A quick sketch of setting them is shown below.
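For example (a minimal sketch; the model name and the particular values here are hypothetical and only illustrate the API), we could constrain the tree like this:
# Hypothetical settings chosen only for illustration; they are not tuned.
shallow_dt_model = tree.DecisionTreeClassifier(max_depth=2, min_samples_split=5)
shallow_dt_model.fit(iris_data[["petal length", "petal width"]],
                     iris_data["species"])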
As with logistic regression, we can make predictions using the predict function:
dt_model.predict(iris_data[["petal length", "petal width"]])
One of the big advantages of decision trees is that they can be interpretable (assuming they are not too deep) while also fitting complex data. The following code creates a visualization of the tree:
import graphviz
dot_data = tree.export_graphviz(dt_model, out_file=None,
                                feature_names=["petal length", "petal width"],
                                class_names=["setosa", "versicolor", "virginica"],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
#graph.render(format="png", filename="iris_tree")
graph
Notice at the first level that if $\textbf{petal\_width} \leq 0.8$, then the iris is always a setosa. Then, as we move further down the tree, we are able to split the data into leaf nodes of just one type.
If you look carefully, you can find a leaf (the seventh from left to right) whose value array has non-zero entries for multiple classes. Why didn't this leaf get divided further? Let's examine it more carefully. We can extract the tree and identify the data points with high impurity:
t = dt_model.tree_
leaves = t.apply(iris_data[["petal length", "petal width"]].to_numpy().astype('float32'))
impure_ind = t.impurity[leaves] > 0
iris_data.loc[impure_ind, ["petal length", "petal width", "species"]]
Or, we can use the predict_proba function to return the probabilities. Note that only the impure leaves will have probability less than 1:
impure_ind = dt_model.predict_proba(iris_data[["petal length", "petal width"]]).max(axis=1) < 1.
iris_data.loc[impure_ind, ["petal length", "petal width", "species"]]
In either case, we see that it would not be possible to divide the leaf further, since these flowers have identical features but different classes.
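We can check this directly (a quick sketch): group the data by the two features and count how many distinct species appear for each feature combination. The variable name species_per_point is just for illustration.
# Feature combinations that map to more than one species cannot be separated
# by any split on these two features alone.
species_per_point = (iris_data
                     .groupby(["petal length", "petal width"])["species"]
                     .nunique())
species_per_point[species_per_point > 1]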
We can also visualize the decision surface.
fig = px.scatter(iris_data, x="petal length", y="petal width", color="species")
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.add_trace(
    plot_decision_boundaries(dt_model,
                             iris_data[["petal length", "petal width"]].to_numpy())
)
Notice that the decision boundaries are axis-aligned. Why is this? Recall that the tree splits on a single feature at each node.
We can also compute the final accuracy using scikit-learn.
from sklearn.metrics import accuracy_score
predictions = dt_model.predict(iris_data[["petal length", "petal width"]])
accuracy_score(predictions, iris_data["species"])
Let's examine overfitting with decision trees:
from sklearn.model_selection import train_test_split
train_iris_data, test_iris_data = train_test_split(iris_data, test_size=0.25, random_state=42)
#sort so that the color labels match what we had in the earlier part of lecture
train_iris_data = train_iris_data.sort_values(by="species")
test_iris_data = test_iris_data.sort_values(by="species")
from sklearn import tree
dt_model = tree.DecisionTreeClassifier()
dt_model.fit(train_iris_data[["petal length", "petal width"]], train_iris_data["species"])
fig = px.scatter(train_iris_data, x="petal length", y="petal width", color="species")
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.add_trace(
    plot_decision_boundaries(dt_model,
                             iris_data[["petal length", "petal width"]].to_numpy())
)
fig.update_layout(title="Training Data")
fig = px.scatter(test_iris_data, x="petal length", y="petal width", color="species")
fig.update_traces(marker=dict(size=12,
                              line=dict(width=2, color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.add_trace(
    plot_decision_boundaries(dt_model,
                             iris_data[["petal length", "petal width"]].to_numpy())
)
fig.update_layout(title="Test Data")
accuracy_score(dt_model.predict(train_iris_data[["petal length", "petal width"]]), train_iris_data["species"])
predictions = dt_model.predict(test_iris_data[["petal length", "petal width"]])
accuracy_score(predictions, test_iris_data["species"])
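Now let's repeat the experiment using only the sepal measurements, where the three species overlap much more.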
from sklearn import tree
sepal_dt_model = tree.DecisionTreeClassifier()
sepal_dt_model.fit(train_iris_data[["sepal length", "sepal width"]], train_iris_data["species"])
sns.scatterplot(data = iris_data, x = "sepal length", y="sepal width", hue="species", legend=False)
fig = plt.gcf()
fig.savefig("iris_scatter_plot_with_petal_data_sepal_only.png", dpi=300, bbox_inches = "tight")
from matplotlib.colors import ListedColormap
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                     np.arange(1.9, 4.5, 0.02))
Z_string = sepal_dt_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
fig = plt.gcf()
fig.savefig("iris_sepal_decision_boundaries_no_data.png", dpi=300, bbox_inches="tight")
from matplotlib.colors import ListedColormap
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                     np.arange(1.9, 4.5, 0.02))
Z_string = sepal_dt_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
sns.scatterplot(data=train_iris_data, x="sepal length", y="sepal width", hue="species", legend=False)
fig = plt.gcf()
fig.savefig("iris_sepal_decision_boundaries_model_training_only.png", dpi=300, bbox_inches="tight")
from matplotlib.colors import ListedColormap
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                     np.arange(1.9, 4.5, 0.02))
Z_string = sepal_dt_model.predict(np.c_[xx.ravel(), yy.ravel()])
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
sns.scatterplot(data=test_iris_data, x="sepal length", y="sepal width", hue="species", legend=False)
fig = plt.gcf()
fig.savefig("iris_sepal_decision_boundaries_model_test_only.png", dpi=300, bbox_inches="tight")
#fig = plt.gcf()
#fig.savefig("iris_decision_boundaries_model_train_test_split.png", dpi=300, bbox_inches = "tight")
dot_data = tree.export_graphviz(sepal_dt_model, out_file=None,
                                feature_names=["sepal_length", "sepal_width"],
                                class_names=["setosa", "versicolor", "virginica"],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
# graph.render(format="png", filename="sepal_tree")
graph
accuracy_score(sepal_dt_model.predict(train_iris_data[["sepal length", "sepal width"]]),
               train_iris_data["species"])
accuracy_score(sepal_dt_model.predict(test_iris_data[["sepal length", "sepal width"]]),
               test_iris_data["species"])
dt_model_4d = tree.DecisionTreeClassifier()
all_features = ["petal length", "petal width", "sepal length", "sepal width"]
dt_model_4d.fit(train_iris_data[all_features], train_iris_data["species"])
predictions = dt_model_4d.predict(train_iris_data[all_features])
accuracy_score(predictions, train_iris_data["species"])
predictions = dt_model_4d.predict(test_iris_data[all_features])
accuracy_score(predictions, test_iris_data["species"])
dot_data = tree.export_graphviz(dt_model_4d, out_file=None,
                                feature_names=all_features,
                                class_names=["setosa", "versicolor", "virginica"],
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
graph.render(format="png", filename="iris_4d_tree")
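The next few cells explore the entropy of a node. For a node whose samples fall into classes with proportions $p_1, \ldots, p_k$, the entropy is $-\sum_{i} p_i \log_2 p_i$. The helper below takes raw class counts and normalizes them into proportions first.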
def entropy(x):
    normalized_x = x / np.sum(x)
    return sum(-normalized_x * np.log2(normalized_x))
-np.log2(0.33)*0.33
-np.log2(0.36)*0.36
entropy([34, 36, 40])
entropy([149, 1, 1])
entropy([50, 50])
entropy([50, 50, 50])
entropy([31, 4, 1])
#entropy([50, 46, 3])
#entropy([4, 47])
#entropy([41, 50])
#entropy([50, 50])
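One way to score a candidate split is the weighted average entropy of the two resulting children, $\frac{N_1 H(x_1) + N_2 H(x_2)}{N_1 + N_2}$, where $N_1$ and $N_2$ are the number of samples in each child; lower is better. (Note that scikit-learn's default splitting criterion is Gini impurity rather than entropy.)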
def weighted_average_entropy(x1, x2):
    N1 = sum(x1)
    N2 = sum(x2)
    return (N1 * entropy(x1) + N2 * entropy(x2)) / (N1 + N2)
weighted_average_entropy([50, 46, 3], [4, 47])
weighted_average_entropy([50, 9], [41, 50])
weighted_average_entropy([2, 50, 50], [48])
weighted_average_entropy([50, 50], [50])
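To see how sensitive individual decision trees are to the particular sample they are trained on, the following cells fit ten trees, each on a different random subset of the data, and then plot their decision boundaries.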
ten_decision_tree_models = []
ten_training_sets = []
for i in range(10):
    current_model = tree.DecisionTreeClassifier()
    temp_iris_training_data, temp_iris_test_data = np.split(iris_data.sample(frac=1), [110])
    temp_iris_training_data = temp_iris_training_data.sort_values("species")
    current_model.fit(temp_iris_training_data[["sepal length", "sepal width"]], temp_iris_training_data["species"])
    ten_decision_tree_models.append(current_model)
    ten_training_sets.append(temp_iris_training_data)
def plot_decision_tree(decision_tree_model, data=None, disable_axes=False):
    from matplotlib.colors import ListedColormap
    sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
    xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                         np.arange(1.9, 4.5, 0.02))
    Z_string = decision_tree_model.predict(np.c_[xx.ravel(), yy.ravel()])
    categories, Z_int = np.unique(Z_string, return_inverse=True)
    Z_int = Z_int.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
    if data is not None:
        sns.scatterplot(data=data, x="sepal length", y="sepal width", hue="species", legend=False)
    if disable_axes:
        plt.axis("off")
    # if disable_axes:
    #     plt.gca().xaxis.label.set_visible(False)
    #     plt.gca().yaxis.label.set_visible(False)
m_num = 0
plot_decision_tree(ten_decision_tree_models[m_num], ten_training_sets[m_num])
plt.savefig("random_forest_model_1_example.png", dpi = 300, bbox_inches = "tight")
m_num = 7
plot_decision_tree(ten_decision_tree_models[m_num], ten_training_sets[m_num])
plt.savefig("random_forest_model_2_example.png", dpi = 300, bbox_inches = "tight")
import matplotlib.gridspec as gridspec
gs1 = gridspec.GridSpec(3, 3)
gs1.update(wspace=0.025, hspace=0.025)  # set the spacing between axes.
for i in range(0, 9):
    plt.subplot(gs1[i])
    plot_decision_tree(ten_decision_tree_models[i], None, True)
plt.savefig("random_forest_model_9_examples.png", dpi=300, bbox_inches="tight")