import pandas as pd
import numpy as np
from ds100_utils import *
import plotly.express as px

import fashion_mnist

# Load the Fashion-MNIST train/test splits through the project's
# fashion_mnist helper; each split is an (images, labels) pair.
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

# Report the shape of each image array.
for split_name, split_images in (("Training images", train_images),
                                 ("Test images", test_images)):
    print(split_name, split_images.shape)

Using cached version that was downloaded (UTC): Thu Apr 18 05:53:05 2024
Using cached version that was downloaded (UTC): Thu Apr 18 05:53:06 2024
Using cached version that was downloaded (UTC): Thu Apr 18 05:53:06 2024
Using cached version that was downloaded (UTC): Thu Apr 18 05:53:06 2024
Training images (60000, 28, 28)
Test images (10000, 28, 28)

# Human-readable class names; the position in the list is the integer label.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
class_dict = dict(enumerate(class_names))

# Draw a reproducible random subset (without replacement) of the training set.
rng = np.random.default_rng(42)
n = 5000
sample_idx = rng.choice(np.arange(len(train_images)), size=n, replace=False)
sample_labels = train_labels[sample_idx]

# Invert and normalize the images so they look better.
# Cast to float *before* negating: if the pixel array has an unsigned integer
# dtype (presumably uint8 -- confirm against fashion_mnist.load_data), then
# `-1 * array` raises OverflowError under NumPy >= 2 promotion rules. The
# min-max scaled result is the same float64 array either way.
img_mat = -train_images[sample_idx].astype(np.float64)
img_mat = (img_mat - img_mat.min())/(img_mat.max() - img_mat.min())

# One row per sampled image: nested-list pixels, integer label, class name.
images = pd.DataFrame({"images": img_mat.tolist(),
                       "labels": sample_labels,
                       "class": [class_dict[x] for x in sample_labels]})
images.head()

def show_images(images, ncols=5, max_images=30):
    """Plot a grid of images, each titled with its class name.

    Parameters
    ----------
    images : DataFrame with an 'images' column (one 2-D nested list of
        pixel values per row) and a 'class' column (the title to show).
    ncols : number of facet columns in the grid.
    max_images : at most this many rows (taken from the top) are drawn.

    Returns
    -------
    The plotly figure produced by ``px.imshow``.
    """
    # Only the first `max_images` rows are drawn; use this same subset for
    # the pixel array, the figure height and the facet titles so they agree.
    shown = images.head(max_images)

    # Convert the subset of images into an (n, height, width) array for
    # facet visualization.
    img_mat = np.array(shown['images'].to_list())

    # Size the figure by the number of rows actually drawn, not by the
    # length of the (possibly longer) input frame.
    n_rows = int(np.ceil(len(shown) / ncols))
    fig = px.imshow(img_mat, color_continuous_scale='gray',
                    facet_col=0, facet_col_wrap=ncols,
                    height=220 * n_rows)
    fig.update_layout(coloraxis_showscale=False)

    # Extract the facet number (the text after the last "=") and convert it
    # back to the class label.
    class_labels = shown['class'].to_list()
    fig.for_each_annotation(
        lambda a: a.update(text=class_labels[int(a.text.split("=")[-1])]))
    return fig

# Show the first 20 sampled images.
preview = images.head(20)
show_images(preview)

# Show two randomly chosen examples from every class, six per row.
per_class_examples = images.groupby('class', as_index=False).sample(2)
show_images(per_class_examples, ncols=6)

# Stack the per-row nested lists back into a single array.
X = np.array(images['images'].tolist())
X.shape

(5000, 28, 28)

# Flatten every image into a single row vector, keeping one row per image.
n_images = X.shape[0]
X = X.reshape(n_images, -1)
X.shape

(5000, 784)

# Center every pixel column at zero before the decomposition.
pixel_means = X.mean(axis=0)
X = X - pixel_means

from sklearn.decomposition import PCA

# Keep the first 50 principal components of the centered pixel matrix.
n_comps = 50
pca = PCA(n_components=n_comps)
pca.fit(X)

PCA(n_components=50)

PCA(n_components=50)

# Line plot (with markers) of the percentage of variance explained by each
# fitted component.
explained_pct = 100 * pca.explained_variance_ratio_
px.line(y=explained_pct, markers=True)

# Store the first three principal-component coordinates of every image.
pcs = pca.transform(X)
images[['z1', 'z2', 'z3']] = pcs[:, :3]

# First two components, uncolored.
px.scatter(
    images, x='z1', y='z2',
    hover_data=['labels'],
    width=800, height=800,
)

# Same view, colored by class name.
px.scatter(
    images, x='z1', y='z2', color='class',
    hover_data=['labels'],
    width=800, height=800,
)

# First three components in 3-D, colored by class name.
fig = px.scatter_3d(
    images, x='z1', y='z2', z='z3', color='class',
    hover_data=['labels'],
    width=1000, height=800,
)
# set marker size to 5
fig.update_traces(marker={'size': 5})

# Baseline for comparison with PCA: project the centered pixels onto three
# random Gaussian directions. A dedicated seeded generator keeps the figure
# reproducible across runs, and the basis height follows X instead of a
# hard-coded pixel count.
proj_rng = np.random.default_rng(0)
rand_basis = proj_rng.standard_normal((X.shape[1], 3))
images[['z1_rand', 'z2_rand', 'z3_rand']] = X @ rand_basis
px.scatter_3d(images, x='z1_rand', y='z2_rand', z='z3_rand', color='class', hover_data=['labels'],
              width=1000, height=800).update_traces(marker=dict(size=5))

from sklearn.manifold import TSNE

# Three-dimensional t-SNE embedding of the centered pixel matrix.
# The iteration count is left at the library default (1000, the value that
# was previously passed explicitly): the `n_iter` keyword was deprecated in
# scikit-learn 1.5 and later removed, so omitting it keeps the same behaviour
# on both old and new versions.
tsne_model = TSNE(n_components=3, random_state=0, perplexity=30, learning_rate=200)
tsne_comps = tsne_model.fit_transform(X)
images[['tsne1', 'tsne2', 'tsne3']] = tsne_comps

# First two t-SNE coordinates, colored by class name.
px.scatter(images, x='tsne1', y='tsne2', color='class', hover_data=['labels'],
           width=1000, height=800)

# All three t-SNE coordinates in 3-D.
px.scatter_3d(images, x='tsne1', y='tsne2', z='tsne3', color='class', hover_data=['labels'],
              width=1000, height=800).update_traces(marker=dict(size=5))

# Restrict to two classes and work on an independent copy of those rows.
classes = ['Coat', 'Pullover']
is_tough = images['class'].isin(classes)
tough_images = images.loc[is_tough].copy()
show_images(tough_images.sample(20))

# Rebuild the pixel matrix for the subset only: stack, flatten, center.
X = np.array(tough_images['images'].tolist())
X = X.reshape(len(X), -1)
X = X - X.mean(axis=0)

# Fit a fresh 3-component PCA on the subset and overwrite the z1..z3 columns
# that were copied over from the full frame.
tough_pca = PCA(n_components=3)
zs = tough_pca.fit_transform(X)
tough_images[['z1', 'z2', 'z3']] = zs

tough_fig = px.scatter_3d(
    tough_images, x='z1', y='z2', z='z3', color='class',
    hover_data=['labels'],
    width=1000, height=800,
)
tough_fig.update_traces(marker={'size': 5})

import sklearn.linear_model as lm

# Coat-vs-Pullover classifier fitted on the three PCA coordinates only.
model = lm.LogisticRegression(max_iter=1000)
y = tough_images['class'].eq("Coat")
model.fit(zs, y)

# Fraction of correct predictions on the same rows the model was fitted on
# (i.e. training accuracy, not a held-out estimate).
is_correct = model.predict(zs) == y
np.mean(is_correct)

0.6905444126074498

import sklearn.linear_model as lm

# Same Coat-vs-Pullover classifier, now fitted on the full centered pixel
# matrix of the subset instead of the three PCA coordinates.
model = lm.LogisticRegression(max_iter=1000)
y = tough_images['class'].eq("Coat")
model.fit(X, y)

# Fraction of correct predictions on the same rows the model was fitted on
# (i.e. training accuracy, not a held-out estimate).
is_correct = model.predict(X) == y
np.mean(is_correct)

0.9551098376313276

Lecture 25 – Data 100, Spring 2024

Load the Fashion-MNIST dataset

Load data

Visualizing images

PCA

Examining PCA Results

Trying other methods

Apply PCA to a subset of the data

Logistic Regression on these hard images

	images	labels	class
0	[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...	3	Dress
1	[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...	4	Coat
2	[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...	0	T-shirt/top
3	[[1.0, 1.0, 1.0, 1.0, 1.0, 0.996078431372549, ...	2	Pullover
4	[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...	1	Trouser