import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# set numpy random seed so that this notebook is deterministic
np.random.seed(21)
# Load the iris measurements from iris.csv (path is relative to the working directory).
iris_data = pd.read_csv("iris.csv")
# Preview five randomly sampled rows.
iris_data.sample(5)
 | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
92 | 5.8 | 2.6 | 4.0 | 1.2 | versicolor |
44 | 5.1 | 3.8 | 1.9 | 0.4 | setosa |
7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
21 | 5.1 | 3.7 | 1.5 | 0.4 | setosa |
95 | 5.7 | 3.0 | 4.2 | 1.2 | versicolor |
# Scatter petal_width against petal_length for every row, colored by species.
sns.scatterplot(data = iris_data, x = "petal_length", y="petal_width", hue="species")
<AxesSubplot:xlabel='petal_length', ylabel='petal_width'>
from sklearn.linear_model import LogisticRegression
# Construct the one-vs-rest ('ovr') logistic regression and fit it in a single
# chained expression: the two petal measurements are the features and the
# species column is the label.
logistic_regression_model = LogisticRegression(multi_class = 'ovr').fit(
    iris_data[["petal_length", "petal_width"]],
    iris_data["species"],
)
from matplotlib.colors import ListedColormap
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (petal_length, petal_width) window.
xx, yy = np.meshgrid(np.arange(0, 7, 0.02),
                     np.arange(0, 2.8, 0.02))
# The model was fitted on a DataFrame, so predict on a DataFrame with the same
# column names; a bare ndarray makes scikit-learn warn that X lacks valid
# feature names.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["petal_length", "petal_width"])
Z_string = logistic_regression_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the observed flowers on top of the predicted regions.
sns.scatterplot(data = iris_data, x = "petal_length", y="petal_width", hue="species")
plt.xlim(0, 7);
plt.ylim(0, 2.8);
# Display the two petal columns that serve as model features.
iris_data[["petal_length", "petal_width"]]
 | petal_length | petal_width |
---|---|---|
0 | 1.4 | 0.2 |
1 | 1.4 | 0.2 |
2 | 1.3 | 0.2 |
3 | 1.5 | 0.2 |
4 | 1.4 | 0.2 |
... | ... | ... |
145 | 5.2 | 2.3 |
146 | 5.0 | 1.9 |
147 | 5.2 | 2.0 |
148 | 5.4 | 2.3 |
149 | 5.1 | 1.8 |
150 rows × 2 columns
# Classify one flower with petal_length=1.4 and petal_width=0.2. The values are
# wrapped in a DataFrame carrying the training column names so scikit-learn
# does not warn about missing feature names.
logistic_regression_model.predict(pd.DataFrame([[1.4, 0.2]], columns=["petal_length", "petal_width"]))
array(['setosa'], dtype=object)
from sklearn import tree
# Entropy-criterion decision tree, constructed and fitted in one chained
# expression on the two petal measurements with species as the label.
decision_tree_model = tree.DecisionTreeClassifier(criterion='entropy').fit(
    iris_data[["petal_length", "petal_width"]],
    iris_data["species"],
)
# Draw four random rows to try the fitted tree on, and display them.
four_random_rows = iris_data.sample(4)
four_random_rows
 | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
142 | 5.8 | 2.7 | 5.1 | 1.9 | virginica |
113 | 5.7 | 2.5 | 5.0 | 2.0 | virginica |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Predict the species of the four sampled rows from their petal measurements.
decision_tree_model.predict(four_random_rows[["petal_length", "petal_width"]])
array(['virginica', 'virginica', 'virginica', 'setosa'], dtype=object)
# Draw the fitted tree with matplotlib. Class labels are taken from the fitted
# model's classes_ (the order the tree itself uses) instead of a hard-coded
# list, so the node labels cannot drift out of sync with the data.
tree.plot_tree(decision_tree_model, feature_names = ["petal_length", "petal_width"],
               class_names = list(decision_tree_model.classes_),
               rounded = True, filled = True)
# Save the current figure at 300 dpi with a tight bounding box.
plt.gcf().savefig('tree-plot.png', dpi = 300, bbox_inches = "tight")
import graphviz
# Export the fitted tree as DOT source; with out_file=None the source is
# returned rather than written to a file. Class labels come from the fitted
# model's classes_ so they always follow the tree's own class order.
dot_data = tree.export_graphviz(decision_tree_model, out_file=None,
                                feature_names=["petal_length", "petal_width"],
                                class_names=list(decision_tree_model.classes_),
                                filled=True, rounded=True)
graph = graphviz.Source(dot_data)
# Render the graph as a PNG using the base filename "iris_tree".
graph.render(format="png", filename="iris_tree")
# Leave the graph as the last expression so the notebook displays it.
graph
from matplotlib.colors import ListedColormap
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (petal_length, petal_width) window.
xx, yy = np.meshgrid(np.arange(0, 7, 0.02),
                     np.arange(0, 2.8, 0.02))
# Predict on a DataFrame with the training column names; a bare ndarray makes
# scikit-learn warn that X lacks valid feature names.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["petal_length", "petal_width"])
Z_string = decision_tree_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the observed flowers on top of the predicted regions.
sns.scatterplot(data = iris_data, x = "petal_length", y="petal_width", hue="species")
#fig = plt.gcf()
#fig.savefig("iris_decision_boundaries.png", dpi=300, bbox_inches = "tight")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff01b204438>
from sklearn.metrics import accuracy_score
# Score the tree on the same rows it was fitted on.
predictions = decision_tree_model.predict(iris_data[["petal_length", "petal_width"]])
# accuracy_score is declared as (y_true, y_pred), so the true labels go first.
accuracy_score(iris_data["species"], predictions)
0.9933333333333333
# Select the rows that satisfy all three threshold conditions at once.
# NOTE(review): the thresholds presumably trace one root-to-leaf path of the
# fitted tree - confirm against the tree plot.
iris_data.query("petal_length > 2.45 and petal_width > 1.75 and petal_length <= 4.85")
 | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
70 | 5.9 | 3.2 | 4.8 | 1.8 | versicolor |
126 | 6.2 | 2.8 | 4.8 | 1.8 | virginica |
138 | 6.0 | 3.0 | 4.8 | 1.8 | virginica |
Qualitative look
# Scatter sepal_width against sepal_length for every row, colored by species
# and without a legend; keep a handle on the figure the axes were drawn into.
fig = sns.scatterplot(x="sepal_length", y="sepal_width", hue="species",
                      legend=False, data=iris_data).figure
# Write the figure to disk at 300 dpi with a tight bounding box.
fig.savefig("iris_scatter_plot_all_150_points_sepal_only.png", dpi=300, bbox_inches = "tight")
# Fit a separate tree on the two sepal measurements. fit() is called on the
# newly created estimator; calling it on decision_tree_model instead would
# refit the petal-based model on sepal data and make both names point to the
# same object.
sepal_decision_tree_model = tree.DecisionTreeClassifier()
sepal_decision_tree_model = sepal_decision_tree_model.fit(iris_data[["sepal_length", "sepal_width"]], iris_data["species"])
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (sepal_length, sepal_width) window.
xx, yy = np.meshgrid(np.arange(4, 8, 0.02),
                     np.arange(1.9, 4.5, 0.02))
# Predict on a DataFrame with the training column names; a bare ndarray makes
# scikit-learn emit "X does not have valid feature names".
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["sepal_length", "sepal_width"])
Z_string = sepal_decision_tree_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay the observed flowers on top of the predicted regions.
sns.scatterplot(data = iris_data, x = "sepal_length", y="sepal_width", hue="species", legend=False)
fig = plt.gcf()
fig.savefig("iris_sepal_decision_boundaries_all_150_points.png", dpi=300, bbox_inches = "tight")
/home/hug/miniconda3/envs/ds/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
# Shuffle all rows once, then use the first 110 as training data and the rest
# as test data. Positional slicing with .iloc yields the same two pieces as
# np.split(..., [110]) without routing a DataFrame through
# DataFrame.swapaxes, which pandas has deprecated.
shuffled_iris_data = iris_data.sample(frac=1)
train_iris_data = shuffled_iris_data.iloc[:110]
test_iris_data = shuffled_iris_data.iloc[110:]
#sort so that the color labels match what we had in the earlier part of lecture
train_iris_data = train_iris_data.sort_values(by="species")
test_iris_data = test_iris_data.sort_values(by="species")
# Show the training-set size.
len(train_iris_data)
110
# Show the first five rows of the training set.
train_iris_data.head(5)
 | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
46 | 5.1 | 3.8 | 1.6 | 0.2 | setosa |
15 | 5.7 | 4.4 | 1.5 | 0.4 | setosa |
34 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
24 | 4.8 | 3.4 | 1.9 | 0.2 | setosa |
37 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
from sklearn import tree
# Replace decision_tree_model with a default-parameter tree, constructed and
# fitted in one chained expression on the training rows only (petal
# measurements as features, species as label).
decision_tree_model = tree.DecisionTreeClassifier().fit(
    train_iris_data[["petal_length", "petal_width"]],
    train_iris_data["species"],
)
from matplotlib.colors import ListedColormap
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (petal_length, petal_width) window.
xx, yy = np.meshgrid(np.arange(0, 7, 0.02),
                     np.arange(0, 2.8, 0.02))
# Predict on a DataFrame with the training column names; a bare ndarray makes
# scikit-learn warn that X lacks valid feature names.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["petal_length", "petal_width"])
Z_string = decision_tree_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay only the training rows on top of the predicted regions.
sns.scatterplot(data = train_iris_data, x = "petal_length", y="petal_width", hue="species")
#fig = plt.gcf()
#fig.savefig("iris_decision_boundaries_model_train_test_split_training_only.png", dpi=300, bbox_inches = "tight")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff01b29a668>
from matplotlib.colors import ListedColormap
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (petal_length, petal_width) window.
xx, yy = np.meshgrid(np.arange(0, 7, 0.02),
                     np.arange(0, 2.8, 0.02))
# Predict on a DataFrame with the training column names; a bare ndarray makes
# scikit-learn warn that X lacks valid feature names.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["petal_length", "petal_width"])
Z_string = decision_tree_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay every row (training and test) on top of the predicted regions.
sns.scatterplot(data = iris_data, x = "petal_length", y="petal_width", hue="species")
#fig = plt.gcf()
#fig.savefig("iris_decision_boundaries_model_train_test_split.png", dpi=300, bbox_inches = "tight")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff01b0f7e80>
from matplotlib.colors import ListedColormap
# Three-color map built from the first three seaborn palette entries; intended
# to line up the filled regions with the scatterplot's species hues.
sns_cmap = ListedColormap(np.array(sns.color_palette())[0:3, :])
# Dense grid covering the plotted (petal_length, petal_width) window.
xx, yy = np.meshgrid(np.arange(0, 7, 0.02),
                     np.arange(0, 2.8, 0.02))
# Predict on a DataFrame with the training column names; a bare ndarray makes
# scikit-learn warn that X lacks valid feature names.
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()],
                           columns=["petal_length", "petal_width"])
Z_string = decision_tree_model.predict(grid_points)
# Encode each predicted label as its index among the sorted unique labels so
# contourf receives numeric data.
categories, Z_int = np.unique(Z_string, return_inverse=True)
Z_int = Z_int.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z_int, cmap=sns_cmap)
# Overlay only the held-out test rows on top of the predicted regions.
sns.scatterplot(data = test_iris_data, x = "petal_length", y="petal_width", hue="species")
#fig = plt.gcf()
#fig.savefig("iris_decision_boundaries_model_train_test_split_test_only.png", dpi=300, bbox_inches = "tight")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff01b1e1748>
# Training accuracy. accuracy_score is declared as (y_true, y_pred), so the
# true labels go first.
accuracy_score(train_iris_data["species"], decision_tree_model.predict(train_iris_data[["petal_length", "petal_width"]]))
0.990909090909091
# Accuracy on the held-out test rows.
predictions = decision_tree_model.predict(test_iris_data[["petal_length", "petal_width"]])
# accuracy_score is declared as (y_true, y_pred), so the true labels go first.
accuracy_score(test_iris_data["species"], predictions)
0.975
from sklearn import tree
# Fit a separate tree on the sepal measurements of the training rows. fit() is
# called on the newly created estimator; calling it on decision_tree_model
# instead would overwrite the petal-based model and make both names point to
# the same object.
sepal_decision_tree_model = tree.DecisionTreeClassifier()
sepal_decision_tree_model = sepal_decision_tree_model.fit(train_iris_data[["sepal_length", "sepal_width"]], train_iris_data["species"])
# Scatter sepal_width against sepal_length for every row, colored by species
# and without a legend; keep a handle on the figure the axes were drawn into.
fig = sns.scatterplot(x="sepal_length", y="sepal_width", hue="species",
                      legend=False, data=iris_data).figure
# Write the figure to disk at 300 dpi with a tight bounding box.
fig.savefig("iris_scatter_plot_with_petal_data_sepal_only.png", dpi=300, bbox_inches = "tight")