import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from ds100_utils import *
np.random.seed(23) #kallisti
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
sns.set()
We will be using the Fashion-MNIST dataset, a small dataset of 28x28 grayscale images of articles of clothing.
Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms. Han Xiao, Kashif Rasul, Roland Vollgraf. arXiv:1708.07747 https://github.com/zalandoresearch/fashion-mnist
import fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
Downloading... Done! Downloading... Done! Downloading... Done! Downloading... Done!
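For reference, the full Fashion-MNIST dataset ships with 60,000 training and 10,000 test images; a quick shape check (a one-line sanity sketch) confirms this before we truncate:
# shapes of the full dataset before truncation
print(train_images.shape, test_images.shape)  # expect (60000, 28, 28) (10000, 28, 28)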
Truncate Dataset: For the purposes of this demo, we're going to randomly sample 10,000 training images and 1,000 test images:
rng = np.random.default_rng(42)
n_train, n_test = 10000, 1000
train_samples = rng.choice(np.arange(len(train_images)), size=n_train, replace=False)
test_samples = rng.choice(np.arange(len(test_images)), size=n_test, replace=False)
train_images, train_labels = train_images[train_samples,:,:], train_labels[train_samples]
test_images, test_labels = test_images[test_samples,:,:], test_labels[test_samples]
train_images.shape, test_images.shape
((10000, 28, 28), (1000, 28, 28))
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
class_dict = {i:class_name for i,class_name in enumerate(class_names)}
def show_train_image(index):
    plt.figure()
    # cmap=plt.cm.binary allows us to show the picture in grayscale
    plt.imshow(train_images[index], cmap=plt.cm.binary)
    plt.title(class_names[train_labels[index]])
    plt.colorbar()  # adds a bar to the side with values
    plt.show()
# Simply run this cell
show_train_image(0)
Let's see what kind of images we have overall.
There are 10 classes:
# there are 10 classes
print(len(class_names))
print(sorted(class_names))
10 ['Ankle boot', 'Bag', 'Coat', 'Dress', 'Pullover', 'Sandal', 'Shirt', 'Sneaker', 'T-shirt/top', 'Trouser']
# Simply run this cell
# see documentation for subplot here:
# https://matplotlib.org/3.2.1/api/_as_gen/matplotlib.pyplot.subplot.html
plt.figure(figsize=(10,10))
for i in range(25):
    plt.subplot(5, 5, i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[i]])
Suppose we would like to train a logistic regression classifier to distinguish between two specific classes of clothes. Note that logistic regression is a binary classifier; if you are interested in multi-class classification beyond this course, check out this sklearn page.
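To make this concrete, here is a minimal sketch of such a binary fit, assuming we (arbitrarily) pick Sneaker (label 7) vs. Ankle boot (label 9); the flattening and scaling steps are previewed here and explained below:
from sklearn.linear_model import LogisticRegression

# hypothetical example: distinguish Sneaker (7) from Ankle boot (9)
mask = np.isin(train_labels, [7, 9])
X_bin = train_images[mask].reshape(-1, 28 * 28) / 255  # flatten and scale to [0, 1]
y_bin = (train_labels[mask] == 9).astype(int)          # 1 = Ankle boot
clf = LogisticRegression(max_iter=1000)
clf.fit(X_bin, y_bin)
print(f"Training accuracy: {clf.score(X_bin, y_bin):.3f}")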
First, let's normalize the pixel values from [0, 255] to [0, 1] and check two things: the resulting min and max of both the train and test sets:
# just run this cell
train_images = train_images/255
test_images = test_images/255
print(f'Train Min:{train_images.min()} Max:{train_images.max()}')
print(f'Test Min:{test_images.min()} Max:{test_images.max()}')
show_train_image(0)
Train Min:0.0 Max:1.0 Test Min:0.0 Max:1.0
Recall that logistic regression expects each observation's features as a 1-D vector, because we are trying to fit the model:
$$\hat{P}_{\theta}(Y = 1 \mid X = x) = \sigma(x^T \theta)$$
Using np.reshape, we reshape both the train and test sets and convert them to DataFrames:
# reshape pixels
train_images_vectors = np.reshape(train_images, (len(train_images), -1))
test_images_vectors = np.reshape(test_images, (len(test_images), -1))
train_images_vectors.shape, test_images_vectors.shape
((10000, 784), (1000, 784))
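As a quick sanity check (optional sketch), flattening is lossless: reshaping a row vector back to 28x28 recovers the original image exactly:
# reshape the first vector back to 28x28 and compare to the original image
img0 = train_images_vectors[0].reshape(28, 28)
print(np.array_equal(img0, train_images[0]))  # expect True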
# then, add class/label to DataFrame
train_df = pd.DataFrame(train_images_vectors)
train_df['label'] = train_labels
train_df['class'] = train_df['label'].map(class_dict)
# reorder columns just so it's easier on the eyes
PIXEL_COLS = train_df.columns.tolist()[:-2]
LABEL_COLS = ['label', 'class']
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]
train_df
 | label | class | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | ... | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Trouser | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.003922 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
1 | 3 | Dress | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.243137 | 0.160784 | 0.000000 | 0.000000 | 0.007843 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
2 | 3 | Dress | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.596078 | 0.584314 | 0.203922 | 0.196078 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
3 | 7 | Sneaker | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
4 | 8 | Bag | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.576471 | 0.568627 | 0.509804 | 0.470588 | 0.556863 | 0.168627 | 0.000000 | 0.0 | 0.0 | 0.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9995 | 2 | Pullover | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.600000 | 0.223529 | 0.047059 | 0.0 | 0.0 | 0.0 |
9996 | 0 | T-shirt/top | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.000000 | 0.317647 | 0.137255 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
9997 | 0 | T-shirt/top | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.007843 | ... | 0.494118 | 0.619608 | 0.003922 | 0.000000 | 0.011765 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
9998 | 6 | Shirt | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
9999 | 4 | Coat | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | ... | 0.003922 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 |
10000 rows × 786 columns
# do the same for test dataset
test_df = pd.DataFrame(test_images_vectors)
test_df['label'] = test_labels
test_df['class'] = test_df['label'].map(class_dict)
cols_reorder = LABEL_COLS + PIXEL_COLS
test_df = test_df[cols_reorder]
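Before modeling, one might also check how balanced the random sample is across classes (a quick optional sketch):
# how many sampled training images fall in each class?
print(train_df['class'].value_counts())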
How would we visualize how the features (i.e., pixels) change across classes? Would we have to pick random pixels to compare? Probably not: as humans, we perceive the differences through higher-order shapes and interactions between pixels, not through individual pixel values.
Enter PCA.
Here I use sklearn.decomposition.PCA, which uses SVD under the hood:
Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.
Let's look at the train set. We'll run PCA to get the first 50 components:
from sklearn.decomposition import PCA
n_comps = 50
PCA_COLS = [f"pc{i+1}" for i in range(n_comps)]
pca = PCA(n_components=n_comps)
pca.fit(train_df[PIXEL_COLS])
principal_components = pca.transform(train_df[PIXEL_COLS])
# The first 50 components
principal_components.shape
(10000, 50)
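To make the "SVD under the hood" claim concrete, here is a sketch that reproduces the component scores with a plain SVD on the centered data. Signs of components are arbitrary, and sklearn may use a randomized solver, so we compare only the two leading components in absolute value with a loose tolerance:
# center the data, as sklearn's PCA does, then take the SVD
X_pix = train_df[PIXEL_COLS].to_numpy()
X_centered = X_pix - X_pix.mean(axis=0)
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
manual_scores = X_centered @ Vt[:2].T  # scores for PC1, PC2
# component signs are arbitrary, so compare absolute values
print(np.allclose(np.abs(manual_scores), np.abs(principal_components[:, :2]), atol=1e-6))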
Note that sklearn.decomposition.PCA has an attribute called explained_variance_ratio_:
Percentage of variance explained by each of the selected components.
The first 50 components account for a reasonable amount of the total variance:
pca.explained_variance_ratio_
array([0.29204903, 0.17695613, 0.0596195 , 0.05026812, 0.03859675, 0.03463695, 0.0236638 , 0.0184821 , 0.01302936, 0.01288355, 0.00984057, 0.00914692, 0.00767584, 0.00674526, 0.00612661, 0.00584359, 0.00559562, 0.00532868, 0.00458649, 0.00443497, 0.00439464, 0.00410487, 0.00386062, 0.00371124, 0.00360238, 0.00348955, 0.00331063, 0.0031289 , 0.00307494, 0.00285894, 0.00272438, 0.00265853, 0.00263831, 0.00253123, 0.00251117, 0.00238949, 0.00230739, 0.00224068, 0.00218739, 0.0020943 , 0.00204936, 0.00197996, 0.00193978, 0.00182668, 0.00175013, 0.001736 , 0.00169792, 0.00165915, 0.00161304, 0.00154818])
np.sum(pca.explained_variance_ratio_)
0.8631295806642582
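A common follow-up question (a quick sketch): how many components does it take to reach a given share of the variance, say 80%?
# cumulative explained variance; +1 because components are 1-indexed
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_80 = np.argmax(cum_var >= 0.80) + 1
print(f"{n_80} components explain {cum_var[n_80 - 1]:.1%} of the variance")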
The first two components account for a little less than half of the variance:
# PC1, PC2 component scores
np.sum(pca.explained_variance_ratio_[:2])
0.4690051583750244
Seem reasonable? Let's check out the scree plot:
plt.plot(np.arange(n_comps) + 1,
         100 * pca.explained_variance_ratio_,
         marker='.')
plt.ylabel("% variance")
plt.xlabel("Component Number");
Visually, the elbow looks closer to component 3 or 4, so the first two components alone may not capture that much structure. Still, let's plot them and see what we get:
def build_comps_df(components, label_df, colnames):
    # package the component scores with the class/label columns for plotting
    df = pd.DataFrame(data=components, columns=colnames)
    df["class"] = label_df["class"]
    df["label"] = label_df["label"]
    return df
pca_df = build_comps_df(principal_components, train_df, PCA_COLS)
# plot the first two principal components; uncomment `hue` to color by class
sns.lmplot(x='pc1',
           y='pc2',
           data=pca_df,
           fit_reg=False,
           # hue='class',
           height=9,
           scatter_kws={"s": 50, "alpha": 0.2})
plt.title("PCA visualization (PC2 vs. PC1)");