Lecture 27 Supplemental Notebook¶

Data 100, Spring 2023

Acknowledgments Page

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn import datasets

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load only the feature table (the target labels are discarded into `_`) and
# give the four measurement columns short snake_case names.
iris, _ = datasets.load_iris(return_X_y=True, as_frame=True)
iris = iris.rename(columns={
    "sepal length (cm)": "sepal_length",
    "sepal width (cm)": "sepal_width",
    "petal length (cm)": "petal_length",
    "petal width (cm)": "petal_width",
})
iris.sample(10)
Out[2]:
sepal_length sepal_width petal_length petal_width
68 6.2 2.2 4.5 1.5
49 5.0 3.3 1.4 0.2
10 5.4 3.7 1.5 0.2
122 7.7 2.8 6.7 2.0
47 4.6 3.2 1.4 0.2
103 6.3 2.9 5.6 1.8
69 5.6 2.5 3.9 1.1
132 6.4 2.8 5.6 2.2
64 5.6 2.9 3.6 1.3
39 5.1 3.4 1.5 0.2

K-Means Clustering¶

In this section, we will cluster the iris dataset using two features: petal_length and petal_width. Summary of the K-Means algorithm:

  • Repeat until convergence:
    • Color points according to the closest center.
    • Move center for each color to center of points with that color.
In [3]:
# Scatter of the two clustering features before any centers are drawn.
# The axes are relabelled generically as x / y here; the later plots use the
# actual column names.
sns.scatterplot(data = iris, x = "petal_length", y= "petal_width", color="black")
plt.xlabel('x')
plt.ylabel('y');
In [4]:
class Center():
    """A movable cluster center for the K-Means walkthrough.

    The position is kept in ``self.coordinates`` (one value per feature
    dimension) and is reassigned from outside the class as the algorithm moves
    the center.
    """

    def __init__(self, data):
        """Place the center uniformly at random inside the bounding box of ``data``.

        Parameters
        ----------
        data : 2-D NumPy array
            One row per point, one column per dimension (it is indexed as
            ``data[:, i]``, so a DataFrame must be converted with ``.values``).
        """
        num_dimensions = data.shape[1]
        # One draw per dimension, in column order, from Python's global
        # `random` generator -- so `random.seed(...)` controls the result.
        self.coordinates = np.array([
            random.uniform(np.min(data[:, i]), np.max(data[:, i]))
            for i in range(num_dimensions)
        ])

    def __str__(self):
        return str(self.coordinates)

    def __repr__(self):
        return repr(self.coordinates)

    def dist(self, data_point):
        """Euclidean distance from this center to one point or to each row of a table.

        Returns a scalar for a single 1-D point, or one distance per row for
        2-D input (a Series when ``data_point`` is a DataFrame).
        """
        return np.sqrt(self.dist_sq(data_point))

    def dist_sq(self, data_point):
        """Squared Euclidean distance; same input/output shapes as ``dist``."""
        squared_diffs = (np.asarray(self.coordinates) - data_point) ** 2
        # Reduce over the feature axis, i.e. the last one: axis 1 for a table of
        # points (as before) and axis 0 for a single 1-D point, which the
        # previous hard-coded axis=1 rejected.
        return np.sum(squared_diffs, axis=np.ndim(squared_diffs) - 1)
In [5]:
# Create two centers. The constructor places each one uniformly at random
# inside the bounding box of the two petal features (passed as a NumPy array).
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
In [6]:
# Force coordinates from the lecture demo
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
In [7]:
def plot_centers_and_black_data(iris, centers):
    """Draw every center as a star, then the petal measurements as black dots.

    iris: DataFrame with "petal_length" and "petal_width" columns.
    centers: sequence of objects exposing a ``coordinates`` attribute; the
        first two coordinates are used as x and y.
    The legend names the centers c1, c2, ... followed by a 'data' entry.
    """
    for c in centers:
        x_coord, y_coord = c.coordinates[0], c.coordinates[1]
        plt.plot(x_coord, y_coord, '*', markersize=10)
    sns.scatterplot(data=iris, x="petal_length", y="petal_width", color="black")
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    labels = ['c' + str(k + 1) for k in range(len(centers))] + ['data']
    plt.legend(labels)
In [8]:
plot_centers_and_black_data(iris, (c1, c2))
In [9]:
def get_cluster_number(dists):
    """Return the position of the smallest entry in ``dists``.

    When several entries tie for the minimum, the first (lowest) position wins.
    """
    smallest = np.min(dists)
    positions_of_smallest = np.where(dists == smallest)[0]
    return positions_of_smallest[0]
In [10]:
# Assignment step: distance from every flower to each of the two centers, then
# label each row 0 (c1 is nearer) or 1 (c2 is nearer).
petal_features = iris[["petal_length", "petal_width"]]
iris["dist1"] = c1.dist(petal_features)
iris["dist2"] = c2.dist(petal_features)
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis=1)
iris.head(10)
Out[10]:
sepal_length sepal_width petal_length petal_width dist1 dist2 cluster
0 5.1 3.5 1.4 0.2 2.390890 5.231474 0
1 4.9 3.0 1.4 0.2 2.390890 5.231474 0
2 4.7 3.2 1.3 0.2 2.439484 5.329623 0
3 4.6 3.1 1.5 0.2 2.345555 5.133398 0
4 5.0 3.6 1.4 0.2 2.390890 5.231474 0
5 5.4 3.9 1.7 0.4 2.080387 4.900416 0
6 4.6 3.4 1.4 0.3 2.303101 5.213064 0
7 5.0 3.4 1.5 0.2 2.345555 5.133398 0
8 4.4 2.9 1.4 0.2 2.390890 5.231474 0
9 4.9 3.1 1.5 0.1 2.435920 5.154034 0
In [11]:
iris["cluster"].value_counts()
Out[11]:
0    79
1    71
Name: cluster, dtype: int64
In [12]:
def plot_centers_and_colorized_data(iris, centers):
    """Open a new figure showing the centers as stars and the points coloured by cluster.

    iris: DataFrame with "petal_length", "petal_width" and "cluster" columns.
    centers: sequence of objects exposing a ``coordinates`` attribute; the
        first two coordinates are used as x and y.
    The point palette is the first ``len(centers)`` colours of seaborn's
    current palette; the legend names the centers c1, c2, ... followed by a
    'data' entry.
    """
    plt.figure()
    for c in centers:
        plt.plot(c.coordinates[0], c.coordinates[1],
                 marker='*', markersize=10, linestyle="None")
    num_centers = len(centers)
    cluster_palette = sns.color_palette()[0:num_centers]
    sns.scatterplot(data=iris, x="petal_length", y="petal_width",
                    hue="cluster", palette=cluster_palette)
    plt.xlabel('petal_length')
    plt.ylabel('petal_width')
    labels = ['c' + str(k + 1) for k in range(num_centers)] + ['data']
    plt.legend(labels)
In [13]:
plot_centers_and_colorized_data(iris, (c1, c2))
In [14]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [15]:
plot_centers_and_black_data(iris, (c1, c2))
In [16]:
# Assignment step: recompute each flower's distance to the moved centers and
# relabel it 0 (c1 is nearer) or 1 (c2 is nearer).
petal_features = iris[["petal_length", "petal_width"]]
iris["dist1"] = c1.dist(petal_features)
iris["dist2"] = c2.dist(petal_features)
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis=1)
In [17]:
plot_centers_and_colorized_data(iris, (c1, c2))
In [18]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [19]:
plot_centers_and_black_data(iris, (c1, c2))
In [20]:
# Assignment step: recompute each flower's distance to the moved centers and
# relabel it 0 (c1 is nearer) or 1 (c2 is nearer).
petal_features = iris[["petal_length", "petal_width"]]
iris["dist1"] = c1.dist(petal_features)
iris["dist2"] = c2.dist(petal_features)
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis=1)
In [21]:
plot_centers_and_colorized_data(iris, (c1, c2))
In [22]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [23]:
plot_centers_and_black_data(iris, (c1, c2))
In [24]:
# Assignment step: recompute each flower's distance to the moved centers and
# relabel it 0 (c1 is nearer) or 1 (c2 is nearer).
petal_features = iris[["petal_length", "petal_width"]]
iris["dist1"] = c1.dist(petal_features)
iris["dist2"] = c2.dist(petal_features)
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis=1)
In [25]:
plot_centers_and_colorized_data(iris, (c1, c2))
In [26]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [27]:
plot_centers_and_black_data(iris, (c1, c2))
In [28]:
# Assignment step: recompute each flower's distance to the moved centers and
# relabel it 0 (c1 is nearer) or 1 (c2 is nearer).
petal_features = iris[["petal_length", "petal_width"]]
iris["dist1"] = c1.dist(petal_features)
iris["dist2"] = c2.dist(petal_features)
iris["cluster"] = iris[["dist1", "dist2"]].apply(get_cluster_number, axis=1)
In [29]:
plot_centers_and_colorized_data(iris, (c1, c2))
In [30]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [31]:
plot_centers_and_black_data(iris, (c1, c2))
In [32]:
# Update step: move each center to the mean petal_length / petal_width of the
# points currently assigned to it. The mean is taken column-wise on just the
# two petal features; np.mean(frame)[col] averages every column and, on
# pandas >= 2.0, collapses the frame to one scalar that cannot be indexed by
# column name.
# NOTE(review): the preceding code cells already ran this same update and the
# "cluster" column has not been reassigned since, so this recomputes identical
# centers. The earlier iterations alternate update -> reassign; this cell was
# possibly meant to be the reassignment step -- confirm.
cluster0_means = iris.loc[iris["cluster"] == 0, ["petal_length", "petal_width"]].mean()
average_c1_length = cluster0_means["petal_length"]
average_c1_width = cluster0_means["petal_width"]
# Stored as an ndarray (not a tuple) to match how Center keeps its coordinates.
c1.coordinates = np.array([average_c1_length, average_c1_width])

cluster1_means = iris.loc[iris["cluster"] == 1, ["petal_length", "petal_width"]].mean()
average_c2_length = cluster1_means["petal_length"]
average_c2_width = cluster1_means["petal_width"]
c2.coordinates = np.array([average_c2_length, average_c2_width])
In [33]:
plot_centers_and_colorized_data(iris, (c1, c2))

Example for K > 2¶

In [34]:
import copy
def compute_centers_after_N_iterations(data, column_names, centers, N):
    """Run N rounds of K-Means (assign, then update) and return the moved centers.

    Parameters
    ----------
    data : DataFrame
        Holds the feature columns. It is modified in place: one "dist<k>"
        column per center (k counted from 0) and a "cluster" column are
        rewritten on every iteration, so after the call they reflect the last
        assignment step.
    column_names : list of str
        Feature columns used for distances and for the center coordinates.
    centers : sequence of Center
        Starting centers. They are deep-copied, so the caller's objects are
        left untouched.
    N : int
        Number of assign/update rounds.

    Returns
    -------
    The deep copy of ``centers`` after N rounds.
    """
    centers = copy.deepcopy(centers)
    dist_names = ["dist" + str(center_num) for center_num in range(len(centers))]

    for _ in range(N):
        # Assignment step: distance to every center, then the index of the nearest.
        for dist_name, center in zip(dist_names, centers):
            data[dist_name] = center.dist(data[column_names])
        data["cluster"] = data[dist_names].apply(get_cluster_number, axis=1)

        # Update step: each center moves to the column-wise mean of its points.
        # The mean is taken over the feature columns only, with an explicit
        # axis: np.mean(frame)[col] averages every column and, on
        # pandas >= 2.0, collapses the frame to a scalar that cannot be indexed.
        # A center with no assigned points ends up with NaN coordinates
        # (mean of an empty selection), as before.
        for center_num, center in enumerate(centers):
            in_cluster = data["cluster"] == center_num
            center.coordinates = data.loc[in_cluster, column_names].mean(axis=0).to_numpy()

    return centers
In [35]:
# Start over with two fresh centers and overwrite their random positions with
# the same fixed starting coordinates used in the manual walkthrough above.
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c1.coordinates = np.array([2.52364007, 2.31040024])
c2.coordinates = np.array([6.53276402, 1.211463])
In [36]:
iris
Out[36]:
sepal_length sepal_width petal_length petal_width dist1 dist2 cluster
0 5.1 3.5 1.4 0.2 0.111489 3.824028 0
1 4.9 3.0 1.4 0.2 0.111489 3.824028 0
2 4.7 3.2 1.3 0.2 0.202142 3.916407 0
3 4.6 3.1 1.5 0.2 0.063233 3.732042 0
4 5.0 3.6 1.4 0.2 0.111489 3.824028 0
... ... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 4.230663 0.676487 1
146 6.3 2.5 5.0 1.9 3.871120 0.230631 1
147 6.5 3.0 5.2 2.0 4.094650 0.420388 1
148 6.2 3.4 5.4 2.3 4.407000 0.779445 1
149 5.9 3.0 5.1 1.8 3.921694 0.210959 1

150 rows × 7 columns

In [37]:
def inertia(data, centers, column_names=("petal_length", "petal_width")):
    """Inertia of a clustering: the sum of squared distances from each point to its center.

    Parameters
    ----------
    data : DataFrame
        Must contain the feature columns and a "cluster" column whose value k
        means "assigned to centers[k]".
    centers : sequence of Center
        Indexed by cluster number.
    column_names : sequence of str, optional
        Feature columns the centers live in; defaults to the two petal
        features used throughout this notebook.

    Returns
    -------
    The total over all clusters (0 if there are no centers).
    """
    feature_columns = list(column_names)
    total_inertia = 0
    for center_num, center in enumerate(centers):
        points_in_cluster = data.loc[data["cluster"] == center_num, feature_columns]
        # Squared distance, not plain distance: inertia is a sum of squares.
        total_inertia += np.sum(center.dist_sq(points_in_cluster))
    return total_inertia
In [38]:
def distortion(data, centers, column_names=("petal_length", "petal_width")):
    """Distortion of a clustering: per-cluster mean squared distance to the center, summed.

    For each cluster the squared distances from its points to its center are
    added up and divided by the number of points in that cluster; the
    per-cluster values are then summed.

    Parameters
    ----------
    data : DataFrame
        Must contain the feature columns and a "cluster" column whose value k
        means "assigned to centers[k]".
    centers : sequence of Center
        Indexed by cluster number.
    column_names : sequence of str, optional
        Feature columns the centers live in; defaults to the two petal
        features used throughout this notebook.

    Returns
    -------
    The total over all non-empty clusters (0 if there are none).
    """
    feature_columns = list(column_names)
    total_distortion = 0
    for center_num, center in enumerate(centers):
        points_in_cluster = data.loc[data["cluster"] == center_num, feature_columns]
        cluster_size = len(points_in_cluster)
        if cluster_size == 0:
            # An empty cluster has no points to average over; skipping it
            # avoids a 0/0 that would turn the whole total into NaN.
            continue
        # Squared distance, not plain distance, to match the definition.
        total_distortion += np.sum(center.dist_sq(points_in_cluster)) / cluster_size
    return total_distortion
In [39]:
random.seed(25)
c1 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c2 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c3 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
c4 = Center(iris.loc[:, ['petal_length', 'petal_width']].values)
new_centers = compute_centers_after_N_iterations(iris, ['petal_length', 'petal_width'], [c1, c2, c3,