Lecture 24: Clustering – Data 100, Spring 2022¶

Notebook by Josh Hug

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
In [2]:
iris = pd.read_csv("iris.csv")
iris.sample(10)
Out[2]:
sepal_length sepal_width petal_length petal_width species
112 6.8 3.0 5.5 2.1 virginica
117 7.7 3.8 6.7 2.2 virginica
106 4.9 2.5 4.5 1.7 virginica
48 5.3 3.7 1.5 0.2 setosa
135 7.7 3.0 6.1 2.3 virginica
137 6.4 3.1 5.5 1.8 virginica
108 6.7 2.5 5.8 1.8 virginica
39 5.1 3.4 1.5 0.2 setosa
100 6.3 3.3 6.0 2.5 virginica
86 6.7 3.1 4.7 1.5 versicolor
In [3]:
iris = iris.drop("species", axis = 1)
iris.sample(10)
Out[3]:
sepal_length sepal_width petal_length petal_width
92 5.8 2.6 4.0 1.2
95 5.7 3.0 4.2 1.2
130 7.4 2.8 6.1 1.9
58 6.6 2.9 4.6 1.3
139 6.9 3.1 5.4 2.1
114 5.8 2.8 5.1 2.4
32 5.2 4.1 1.5 0.1
40 5.0 3.5 1.3 0.3
122 7.7 2.8 6.7 2.0
102 7.1 3.0 5.9 2.1
In [4]:
sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", color="black")
plt.xlabel('x')
plt.ylabel('y');
#plt.savefig('2d_data_needing_clustering.png', dpi = 300, bbox_inches = "tight")
In [5]:
class Center():
    def __init__(self, data):
        """generates a random center inside the region bounded by the data"""        
        num_dimensions = data.shape[1]
        self.coordinates = np.array([0.0] * num_dimensions)
        for i in range(num_dimensions):
            min_value = np.min(data[:, i])
            max_value = np.max(data[:, i])            
            random_value = random.uniform(min_value, max_value)            
            self.coordinates[i] = random_value
    
    def __str__(self):
        return str(self.coordinates)

    def __repr__(self):
        return repr(self.coordinates)

    def dist(self, data_point):
        return np.sqrt(np.sum((self.coordinates - data_point)**2, axis = 1))
    
    def dist_sq(self, data_point):
        return np.sum((self.coordinates - data_point)**2, axis = 1)
In [6]:
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
In [7]:
# force coordinates from the lecture demo
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
In [8]:
def plot_centers_and_black_data(iris, centers):
    sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", color="black")
    for center in centers:
        plt.plot(center.coordinates[0], center.coordinates[1], '*', markersize = 10)    
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    legend_text = ['c' + str(i) for i in range(1, len(centers) + 1)]
    legend_text.append('data')
    plt.legend(legend_text)
In [9]:
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_initial_placement.png', dpi = 300, bbox_inches = "tight")
In [10]:
def get_cluster_number(dists):
    return np.where(dists == np.min(dists))[0][0]
In [11]:
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
iris.head(10)
Out[11]:
sepal_length sepal_width petal_length petal_width dist1 dist2 cluster
0 5.1 3.5 1.4 0.2 2.390890 5.231474 0
1 4.9 3.0 1.4 0.2 2.390890 5.231474 0
2 4.7 3.2 1.3 0.2 2.439484 5.329623 0
3 4.6 3.1 1.5 0.2 2.345555 5.133398 0
4 5.0 3.6 1.4 0.2 2.390890 5.231474 0
5 5.4 3.9 1.7 0.4 2.080387 4.900416 0
6 4.6 3.4 1.4 0.3 2.303101 5.213064 0
7 5.0 3.4 1.5 0.2 2.345555 5.133398 0
8 4.4 2.9 1.4 0.2 2.390890 5.231474 0
9 4.9 3.1 1.5 0.1 2.435920 5.154034 0
In [12]:
iris["cluster"].value_counts()
Out[12]:
0    79
1    71
Name: cluster, dtype: int64
In [13]:
def plot_centers_and_colorized_data(iris, centers):
    current_palette = sns.color_palette()[0:len(centers)]
    sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", hue="cluster", palette=current_palette)  
    for center in centers:
        plt.plot(center.coordinates[0], center.coordinates[1], '*', markersize = 10)
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    legend_text = ['c' + str(i) for i in range(1, len(centers) + 1)]
    plt.legend(legend_text)
In [14]:
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_initial_placement_colored.png', dpi = 300, bbox_inches = "tight")
In [15]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [16]:
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_2.png', dpi = 300, bbox_inches = "tight")
In [17]:
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
In [18]:
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_2_colorized.png', dpi = 300, bbox_inches = "tight")
In [19]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [20]:
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_3.png', dpi = 300, bbox_inches = "tight")
In [21]:
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
In [22]:
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_3_colorized.png', dpi = 300, bbox_inches = "tight")
In [23]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [24]:
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_4.png', dpi = 300, bbox_inches = "tight")
In [25]:
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
In [26]:
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_4_colorized.png', dpi = 300, bbox_inches = "tight")
In [27]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [28]:
plot_centers_and_black_data(iris, (c1, c2))
#plt.savefig('2means_demo_center_position_5.png', dpi = 300, bbox_inches = "tight")
In [29]:
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
In [30]:
plot_centers_and_colorized_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_5_colorized.png', dpi = 300, bbox_inches = "tight")
In [31]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [32]:
plot_centers_and_black_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_6.png', dpi = 300, bbox_inches = "tight")
In [33]:
average_c1_length = np.mean(iris[iris["cluster"] == 0])["petal_length"]
average_c1_width = np.mean(iris[iris["cluster"] == 0])["petal_width"]
c1.coordinates = (average_c1_length, average_c1_width)

average_c2_length = np.mean(iris[iris["cluster"] == 1])["petal_length"]
average_c2_width = np.mean(iris[iris["cluster"] == 1])["petal_width"]
c2.coordinates = (average_c2_length, average_c2_width)
In [34]:
plot_centers_and_colorized_data(iris, (c1, c2))
# plt.savefig('2means_demo_center_position_6_colorized.png', dpi = 300, bbox_inches = "tight")

Example for K > 2¶

In [35]:
import copy
def compute_centers_after_N_iterations(data, column_names, centers, N):
    centers = copy.deepcopy(centers)
    
    for i in range(N):
        #recompute clusters        
        dist_names = []
        for center_num in range(len(centers)):        
            data["dist" + str(center_num)] = centers[center_num].dist(data[column_names])
            dist_names.append("dist" + str(center_num))
        
        data["cluster"] = data[dist_names].apply(get_cluster_number, axis = 1)    
        
        #update centers
        for center_num in range(len(centers)):
            for col_num in range(len(column_names)):
                col_name = column_names[col_num]
    
                centers[center_num].coordinates[col_num] = np.mean(data[data["cluster"] == center_num])[col_name]

    return centers
In [36]:
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
In [37]:
iris
Out[37]:
sepal_length sepal_width petal_length petal_width dist1 dist2 cluster
0 5.1 3.5 1.4 0.2 0.112040 3.824028 0
1 4.9 3.0 1.4 0.2 0.112040 3.824028 0
2 4.7 3.2 1.3 0.2 0.203412 3.916407 0
3 4.6 3.1 1.5 0.2 0.061068 3.732042 0
4 5.0 3.6 1.4 0.2 0.112040 3.824028 0
... ... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 4.229889 0.676487 1
146 6.3 2.5 5.0 1.9 3.870173 0.230631 1
147 6.5 3.0 5.2 2.0 4.093707 0.420388 1
148 6.2 3.4 5.4 2.3 4.406168 0.779445 1
149 5.9 3.0 5.1 1.8 3.920660 0.210959 1

150 rows × 7 columns

In [38]:
def inertia(data, centers):
    total_inertia = 0
    for center_num in range(len(centers)):
        data_in_this_cluster = data[data["cluster"] == center_num]        
        total_inertia += np.sum(centers[center_num].dist(data_in_this_cluster[["petal_length", "petal_width"]]))
    return total_inertia
In [39]:
def distortion(data, centers):
    total_distortion = 0
    for center_num in range(len(centers)):
        data_in_this_cluster = data[data["cluster"] == center_num]        
        total_distortion += np.sum(centers[center_num].dist(data_in_this_cluster[["petal_length", "petal_width"]]))/len(data_in_this_cluster)
    return total_distortion
In [40]:
random.seed(25)
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c3 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c4 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)})")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example1.png", bbox_inches = "tight", dpi=300)
inertia: 44.95723374524917, distortion: 1.254497095901379)
In [41]:
random.seed(29)
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c3 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c4 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)})")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example2.png", bbox_inches = "tight", dpi=300)
inertia: 45.94868633864745, distortion: 1.3083110705058751)
In [42]:
random.seed(40)
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c3 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c4 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)})")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example3.png", bbox_inches = "tight", dpi=300)
inertia: 54.34774261570242, distortion: 1.50090424867691)
In [43]:
random.seed(75)
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c3 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c4 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3, c4], 12)
print(f"inertia: {inertia(iris, new_centers)}, distortion: {distortion(iris, new_centers)})")
plot_centers_and_colorized_data(iris, new_centers)
# plt.savefig("k4_example4.png", bbox_inches = "tight", dpi=300)
inertia: 44.95723374524917, distortion: 1.2544970959013793)
In [44]:
random.seed(20)
np.random.seed(20)
iris_small = iris.sample(7)
c1 = Center(iris_small.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris_small.loc[:, ['petal_length', 'petal_width']].values)
new_centers2 = compute_centers_after_N_iterations(iris_small, ['petal_length', 'petal_width'], [c1, c2], 12)
plot_centers_and_colorized_data(iris_small, new_centers2)
# plt.savefig("distortion_computation.png", dpi = 300, bbox_inches = "tight")
In [45]:
def print_distances_squared(data, centers):
    for center_num in range(len(centers)):
        data_in_this_cluster = data[data["cluster"] == center_num]        
        print(centers[center_num].dist(data_in_this_cluster[["petal_length", "petal_width"]])**2)

print_distances_squared(iris_small, new_centers2)
73     0.837778
129    0.121111
143    0.547778
dtype: float64
47    4.765
74    0.845
67    0.425
89    0.425
dtype: float64
In [46]:
inertia(iris_small, new_centers2)
Out[46]:
6.409399633729917
In [47]:
distortion(iris_small, new_centers2)
Out[47]:
1.7693026035868602
In [48]:
iris_small
Out[48]:
sepal_length sepal_width petal_length petal_width dist1 dist2 cluster dist0 dist3
47 4.6 3.2 1.4 0.2 2.182888 4.783274 1 4.334487 3.637045
73 6.1 2.8 4.7 1.2 1.274755 1.415877 0 0.915302 0.401564
74 6.4 2.9 4.3 1.3 0.919239 1.691724 1 1.233333 0.545981
129 7.2 3.0 5.8 1.6 2.438237 0.508523 0 0.348010 1.042109
67 5.8 2.7 4.1 1.0 0.651920 2.017764 1 1.535506 0.888636
89 5.5 2.5 4.0 1.3 0.651920 1.960509 1 1.520234 0.814145
143 6.8 3.2 5.9 2.3 2.797320 0.222950 0 0.740120 1.340931

Example of Inertia Failing to Match Intuition¶

In [49]:
c1.coordinates = [1.2, 0.15]
c2.coordinates = [4.906000000000001, 1.6760000000000006]
iris["dist1"] = c1.dist(iris[["petal_length", "petal_width"]])
iris["dist2"] = c2.dist(iris[["petal_length", "petal_width"]])
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis = 1)
In [50]:
plot_centers_and_colorized_data(iris, (c1, c2))
#plt.savefig("intuitive_clustering.png", dpi=300, bbox_inches = "tight")