Notebook by Josh Hug
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
# Load the iris table from "iris.csv" (path is relative to the working directory).
iris = pd.read_csv("iris.csv")
# Notebook cell output: 10 randomly chosen rows.
iris.sample(10)
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
112 | 6.8 | 3.0 | 5.5 | 2.1 | virginica |
117 | 7.7 | 3.8 | 6.7 | 2.2 | virginica |
106 | 4.9 | 2.5 | 4.5 | 1.7 | virginica |
48 | 5.3 | 3.7 | 1.5 | 0.2 | setosa |
135 | 7.7 | 3.0 | 6.1 | 2.3 | virginica |
137 | 6.4 | 3.1 | 5.5 | 1.8 | virginica |
108 | 6.7 | 2.5 | 5.8 | 1.8 | virginica |
39 | 5.1 | 3.4 | 1.5 | 0.2 | setosa |
100 | 6.3 | 3.3 | 6.0 | 2.5 | virginica |
86 | 6.7 | 3.1 | 4.7 | 1.5 | versicolor |
# Remove the "species" column; the rest of the notebook works on the remaining columns only.
iris = iris.drop("species", axis = 1)
# Notebook cell output: 10 randomly chosen rows of the reduced table.
iris.sample(10)
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
92 | 5.8 | 2.6 | 4.0 | 1.2 |
95 | 5.7 | 3.0 | 4.2 | 1.2 |
130 | 7.4 | 2.8 | 6.1 | 1.9 |
58 | 6.6 | 2.9 | 4.6 | 1.3 |
139 | 6.9 | 3.1 | 5.4 | 2.1 |
114 | 5.8 | 2.8 | 5.1 | 2.4 |
32 | 5.2 | 4.1 | 1.5 | 0.1 |
40 | 5.0 | 3.5 | 1.3 | 0.3 |
122 | 7.7 | 2.8 | 6.7 | 2.0 |
102 | 7.1 | 3.0 | 5.9 | 2.1 |
# Scatter petal_width against petal_length with every point drawn in black.
sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", color="black")
# Replace the axis labels with the generic names 'x' and 'y'.
plt.xlabel('x')
plt.ylabel('y');
#plt.savefig('2d_data_needing_clustering.png', dpi = 300, bbox_inches = "tight")
class Center():
    """A cluster center: a point with one coordinate per data dimension."""

    def __init__(self, data):
        """Place the center at a random position inside the data's bounding box.

        `data` must expose `shape[1]` and support `data[:, i]` column
        indexing (e.g. a 2-D NumPy array with one column per dimension).
        Each coordinate is drawn independently with `random.uniform`
        between that column's minimum and maximum, in column order.
        """
        dimension_count = data.shape[1]
        self.coordinates = np.zeros(dimension_count)
        for axis in range(dimension_count):
            column = data[:, axis]
            lowest, highest = np.min(column), np.max(column)
            self.coordinates[axis] = random.uniform(lowest, highest)

    def __repr__(self):
        """Delegate to the repr of the coordinate container."""
        return repr(self.coordinates)

    def __str__(self):
        """Delegate to the str of the coordinate container."""
        return str(self.coordinates)

    def dist_sq(self, data_point):
        """Return the squared Euclidean distance from this center to each row.

        `data_point` is a 2-D collection of points (one per row); the
        reduction runs along axis 1, giving one value per row.
        """
        offsets = self.coordinates - data_point
        return np.sum(offsets**2, axis = 1)

    def dist(self, data_point):
        """Return the Euclidean distance from this center to each row.

        `data_point` is a 2-D collection of points (one per row); the
        reduction runs along axis 1, giving one value per row.
        """
        offsets = self.coordinates - data_point
        return np.sqrt(np.sum(offsets**2, axis = 1))
# Create two centers at random positions inside the petal_length / petal_width bounding box.
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
# force coordinates from the lecture demo
# (the random positions drawn above are discarded so the walkthrough is reproducible)
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
def plot_centers_and_black_data(iris, centers):
    """Draw the petal scatter in black and overlay one star marker per center.

    `iris` needs "petal_length" and "petal_width" columns; each element of
    `centers` needs a `coordinates` sequence whose first two entries are
    the x and y position of the marker.
    """
    sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", color="black")
    for star in centers:
        x_pos, y_pos = star.coordinates[0], star.coordinates[1]
        plt.plot(x_pos, y_pos, '*', markersize = 10)
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    # Labels are given positionally: c1..cK followed by 'data'.
    # NOTE(review): this relies on matplotlib listing the star markers
    # before the scatter points when building the legend - confirm.
    labels = ['c' + str(number) for number in range(1, len(centers) + 1)] + ['data']
    plt.legend(labels)
# Show the uncoloured data together with the two starting centers.
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_initial_placement.png', dpi = 300, bbox_inches = "tight")
def get_cluster_number(dists):
    """Return the position of the smallest distance in `dists`.

    `dists` is a 1-D sequence holding one distance per center (for example
    a DataFrame row passed in by `apply(..., axis=1)`). Ties go to the
    first (lowest) position.

    NaN entries are ignored. A center that currently owns no points has
    undefined (NaN) coordinates and therefore NaN distances; comparing
    against `np.min` in that situation used to find no match and raise
    IndexError.

    Raises:
        ValueError: if every entry of `dists` is NaN (or `dists` is empty).
    """
    return np.nanargmin(np.asarray(dists, dtype=float))
# Distance from every row to each center, measured in (petal_length, petal_width) space.
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
# Label each row with the position of its smaller distance: 0 for dist1 (c1), 1 for dist2 (c2).
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
# Notebook cell output: first 10 rows including the new columns.
iris.head(10)
sepal_length | sepal_width | petal_length | petal_width | dist1 | dist2 | cluster | |
---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 2.390890 | 5.231474 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 2.390890 | 5.231474 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 2.439484 | 5.329623 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 2.345555 | 5.133398 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 2.390890 | 5.231474 | 0 |
5 | 5.4 | 3.9 | 1.7 | 0.4 | 2.080387 | 4.900416 | 0 |
6 | 4.6 | 3.4 | 1.4 | 0.3 | 2.303101 | 5.213064 | 0 |
7 | 5.0 | 3.4 | 1.5 | 0.2 | 2.345555 | 5.133398 | 0 |
8 | 4.4 | 2.9 | 1.4 | 0.2 | 2.390890 | 5.231474 | 0 |
9 | 4.9 | 3.1 | 1.5 | 0.1 | 2.435920 | 5.154034 | 0 |
# Notebook cell output: how many rows carry each cluster label.
iris["cluster"].value_counts()
0 79 1 71 Name: cluster, dtype: int64
def plot_centers_and_colorized_data(iris, centers):
    """Draw the petal scatter coloured by the "cluster" column, plus one star per center.

    `iris` needs "petal_length", "petal_width" and "cluster" columns; each
    element of `centers` needs a `coordinates` sequence whose first two
    entries are the x and y position of the marker.
    """
    # Take as many colours from the current seaborn palette as there are centers.
    cluster_colors = sns.color_palette()[:len(centers)]
    sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", hue="cluster", palette=cluster_colors)
    for star in centers:
        x_pos, y_pos = star.coordinates[0], star.coordinates[1]
        plt.plot(x_pos, y_pos, '*', markersize = 10)
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    # Legend labels are given positionally as c1..cK.
    labels = ['c' + str(number) for number in range(1, len(centers) + 1)]
    plt.legend(labels)
# Colour the points by their assignment to the initial centers.
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_initial_placement_colored.png', dpi = 300, bbox_inches = "tight")


def _move_demo_centers_to_cluster_means():
    """Move c1 / c2 to the mean (petal_length, petal_width) of the rows labelled 0 / 1."""
    for label, center in enumerate((c1, c2)):
        members = iris.loc[iris["cluster"] == label, ["petal_length", "petal_width"]]
        # Ask for the per-column mean explicitly instead of indexing into
        # np.mean(DataFrame).
        # NOTE(review): newer pandas appears to reduce np.mean(DataFrame)
        # over both axes and return a scalar - confirm against the pandas docs.
        center.coordinates = tuple(members.mean(axis=0))


def _reassign_demo_clusters():
    """Recompute dist1 / dist2 and relabel every row with its nearest center (0 or 1)."""
    iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
    iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
    iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)


# Each round below has two halves: move the centers (black plot),
# then reassign the points to the moved centers (coloured plot).

# --- center position 2 ---
_move_demo_centers_to_cluster_means()
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_2.png', dpi = 300, bbox_inches = "tight")
_reassign_demo_clusters()
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_2_colorized.png', dpi = 300, bbox_inches = "tight")

# --- center position 3 ---
_move_demo_centers_to_cluster_means()
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_3.png', dpi = 300, bbox_inches = "tight")
_reassign_demo_clusters()
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_3_colorized.png', dpi = 300, bbox_inches = "tight")

# --- center position 4 ---
_move_demo_centers_to_cluster_means()
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_4.png', dpi = 300, bbox_inches = "tight")
_reassign_demo_clusters()
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_4_colorized.png', dpi = 300, bbox_inches = "tight")

# --- center position 5 ---
_move_demo_centers_to_cluster_means()
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_5.png', dpi = 300, bbox_inches = "tight")
_reassign_demo_clusters()
plot_centers_and_colorized_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_5_colorized.png', dpi = 300, bbox_inches = "tight")

# --- center position 6 ---
_move_demo_centers_to_cluster_means()
plot_centers_and_black_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_6.png', dpi = 300, bbox_inches = "tight")
# This step previously repeated the center update (a no-op on unchanged
# labels), so the coloured plot showed the labels from position 5.
_reassign_demo_clusters()
plot_centers_and_colorized_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_6_colorized.png', dpi = 300, bbox_inches = "tight")
import copy
def compute_centers_after_N_iterations(data, column_names, centers, N):
    """Run N rounds of the k-means update and return the moved centers.

    Args:
        data: DataFrame holding the points. It is modified in place: one
            "dist<k>" column per center (k = 0, 1, ...) and a "cluster"
            column with the position of the nearest center are written on
            every round.
        column_names: names of the columns of `data` that form the
            coordinates, in the same order as each center's coordinates.
        centers: sequence of objects exposing `dist(points)` (one distance
            per row) and a `coordinates` attribute. They are deep-copied;
            the objects passed in are never modified.
        N: number of assign-then-update rounds to run.

    Returns:
        The list of copied centers after N rounds. A center that owns no
        points in a round keeps its previous coordinates.
    """
    centers = copy.deepcopy(centers)
    if len(centers) == 0:
        return centers
    column_names = list(column_names)
    dist_names = ["dist" + str(center_num) for center_num in range(len(centers))]
    for _ in range(N):
        # recompute clusters
        for dist_name, center in zip(dist_names, centers):
            data[dist_name] = center.dist(data[column_names])
        # Row-wise position of the smallest distance; the first wins on ties
        # and NaN distances are ignored.
        data["cluster"] = np.nanargmin(data[dist_names].to_numpy(), axis=1)
        # update centers
        for center_num, center in enumerate(centers):
            members = data.loc[data["cluster"] == center_num, column_names]
            if members.empty:
                # The mean of no points is undefined; leaving the center
                # where it is avoids NaN coordinates poisoning later rounds.
                continue
            # Explicit per-column mean over the coordinate columns only.
            center.coordinates = members.mean(axis=0).to_numpy()
    return centers
# Re-create both centers, then overwrite their random positions with the demo's starting coordinates.
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
# Notebook cell output: the table with its current dist / cluster columns.
iris
sepal_length | sepal_width | petal_length | petal_width | dist1 | dist2 | cluster | |
---|---|---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 0.112040 | 3.824028 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0.112040 | 3.824028 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0.203412 | 3.916407 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0.061068 | 3.732042 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 0.112040 | 3.824028 | 0 |
... | ... | ... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | 4.229889 | 0.676487 | 1 |
146 | 6.3 | 2.5 | 5.0 | 1.9 | 3.870173 | 0.230631 | 1 |
147 | 6.5 | 3.0 | 5.2 | 2.0 | 4.093707 | 0.420388 | 1 |
148 | 6.2 | 3.4 | 5.4 | 2.3 | 4.406168 | 0.779445 | 1 |
149 | 5.9 | 3.0 | 5.1 | 1.8 | 3.920660 | 0.210959 | 1 |
150 rows × 7 columns
def inertia(data, centers, column_names=("petal_length", "petal_width")):
    """Return the inertia of a clustering: the sum of squared distances
    from every point to the center of the cluster it is assigned to.

    Args:
        data: DataFrame with a "cluster" column holding, for each row, the
            position of its center in `centers`, plus the coordinate columns.
        centers: sequence of objects exposing `dist(points)`, returning one
            distance per row.
        column_names: coordinate columns handed to `dist`; defaults to the
            petal columns used throughout this notebook.

    Returns:
        The total over all clusters; a cluster with no rows contributes 0.
    """
    feature_columns = list(column_names)
    total_inertia = 0
    for center_num, center in enumerate(centers):
        members = data.loc[data["cluster"] == center_num, feature_columns]
        # Inertia is defined on squared distances, so square before summing.
        total_inertia += np.sum(center.dist(members) ** 2)
    return total_inertia
def distortion(data, centers, column_names=("petal_length", "petal_width")):
    """Return the distortion of a clustering: for each cluster, the sum of
    squared point-to-center distances divided by the number of points in
    that cluster, added up over all clusters.

    Args:
        data: DataFrame with a "cluster" column holding, for each row, the
            position of its center in `centers`, plus the coordinate columns.
        centers: sequence of objects exposing `dist(points)`, returning one
            distance per row.
        column_names: coordinate columns handed to `dist`; defaults to the
            petal columns used throughout this notebook.

    Returns:
        The total over all non-empty clusters.
    """
    feature_columns = list(column_names)
    total_distortion = 0
    for center_num, center in enumerate(centers):
        members = data.loc[data["cluster"] == center_num, feature_columns]
        if len(members) == 0:
            # An empty cluster has no average distance; skipping it avoids 0 / 0.
            continue
        # Distortion is defined on squared distances, so square before summing.
        total_distortion += np.sum(center.dist(members) ** 2) / len(members)
    return total_distortion
# 4-means from a reproducible random start (seed 25), 12 rounds.
random.seed(25)
petal_values = iris.loc[:, ['petal_length', 'petal_width']].values
# The generator is consumed left to right, so c1..c4 take their random draws in the same order as before.
c1, c2, c3, c4 = (Center(petal_values) for _ in range(4))
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
# The message previously ended with a stray ")" inside the string.
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)}")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example1.png", bbox_inches = "tight", dpi=300)
inertia: 44.95723374524917, distortion: 1.254497095901379)
# 4-means from a reproducible random start (seed 29), 12 rounds.
random.seed(29)
petal_values = iris.loc[:, ['petal_length', 'petal_width']].values
# The generator is consumed left to right, so c1..c4 take their random draws in the same order as before.
c1, c2, c3, c4 = (Center(petal_values) for _ in range(4))
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
# The message previously ended with a stray ")" inside the string.
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)}")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example2.png", bbox_inches = "tight", dpi=300)
inertia: 45.94868633864745, distortion: 1.3083110705058751)
# 4-means from a reproducible random start (seed 40), 12 rounds.
random.seed(40)
petal_values = iris.loc[:, ['petal_length', 'petal_width']].values
# The generator is consumed left to right, so c1..c4 take their random draws in the same order as before.
c1, c2, c3, c4 = (Center(petal_values) for _ in range(4))
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
# The message previously ended with a stray ")" inside the string.
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)}")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example3.png", bbox_inches = "tight", dpi=300)
inertia: 54.34774261570242, distortion: 1.50090424867691)
# 4-means from a reproducible random start (seed 75), 12 rounds.
random.seed(75)
petal_values = iris.loc[:, ['petal_length', 'petal_width']].values
# The generator is consumed left to right, so c1..c4 take their random draws in the same order as before.
c1, c2, c3, c4 = (Center(petal_values) for _ in range(4))
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
# The message previously ended with a stray ")" inside the string.
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)}")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example4.png", bbox_inches = "tight", dpi=300)
inertia: 44.95723374524917, distortion: 1.2544970959013793)
# Seed Python's `random` module (used by Center) and NumPy's global generator.
# NOTE(review): presumably the NumPy seed is what makes iris.sample(7) repeatable - confirm.
random.seed(20)
np.random.seed(20)
# Work on a 7-row random sample of iris.
iris_small = iris.sample(7)
# Two random centers placed inside the sample's petal bounding box.
c1 = Center(iris_small.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris_small.loc[:, ['petal_length', 'petal_width']].values)
# 12 rounds of 2-means; this writes dist / cluster columns into iris_small.
new_centers2 = compute_centers_after_N_iterations(iris_small, ['petal_length', 'petal_width'], [c1, c2], 12)
plot_centers_and_colorized_data(iris_small, new_centers2)
# plt.savefig("distortion_computation.png", dpi = 300, bbox_inches = "tight")
def print_distances_squared(data, centers):
    """Print, one cluster at a time, the squared distance from each member row to its center.

    `data` needs a "cluster" column (position of the row's center in
    `centers`) and the "petal_length" / "petal_width" columns; each center
    needs a `dist(points)` method returning one distance per row.
    """
    for label, center in enumerate(centers):
        in_cluster = data["cluster"] == label
        members = data[in_cluster]
        distances = center.dist(members[["petal_length", "petal_width"]])
        print(distances**2)
# Print the per-cluster squared distances for the 7-row sample.
print_distances_squared(iris_small, new_centers2)
73 0.837778 129 0.121111 143 0.547778 dtype: float64 47 4.765 74 0.845 67 0.425 89 0.425 dtype: float64
# Notebook cell output: inertia of the 7-row sample under its two fitted centers.
inertia(iris_small, new_centers2)
6.409399633729917
# Notebook cell output: distortion of the 7-row sample under its two fitted centers.
distortion(iris_small, new_centers2)
1.7693026035868602
# Notebook cell output: the sampled rows with their dist / cluster columns.
iris_small
sepal_length | sepal_width | petal_length | petal_width | dist1 | dist2 | cluster | dist0 | dist3 | |
---|---|---|---|---|---|---|---|---|---|
47 | 4.6 | 3.2 | 1.4 | 0.2 | 2.182888 | 4.783274 | 1 | 4.334487 | 3.637045 |
73 | 6.1 | 2.8 | 4.7 | 1.2 | 1.274755 | 1.415877 | 0 | 0.915302 | 0.401564 |
74 | 6.4 | 2.9 | 4.3 | 1.3 | 0.919239 | 1.691724 | 1 | 1.233333 | 0.545981 |
129 | 7.2 | 3.0 | 5.8 | 1.6 | 2.438237 | 0.508523 | 0 | 0.348010 | 1.042109 |
67 | 5.8 | 2.7 | 4.1 | 1.0 | 0.651920 | 2.017764 | 1 | 1.535506 | 0.888636 |
89 | 5.5 | 2.5 | 4.0 | 1.3 | 0.651920 | 1.960509 | 1 | 1.520234 | 0.814145 |
143 | 6.8 | 3.2 | 5.9 | 2.3 | 2.797320 | 0.222950 | 0 | 0.740120 | 1.340931 |
# Put the two centers at hand-picked coordinates (given as plain lists).
c1.coordinates = [1.2, 0.15]
c2.coordinates = [4.906000000000001, 1.6760000000000006]
# Recompute each row's distance to both centers on the full table.
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
# Label each row with the position of its smaller distance: 0 for dist1 (c1), 1 for dist2 (c2).
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig("intuitive_clustering.png", dpi=300, bbox_inches = "tight")