import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from ds100_utils import *

# Seed NumPy's global random state so the run is repeatable.
np.random.seed(23)

# Figure defaults: small canvas, high resolution.
plt.rcParams.update({
    'figure.figsize': (4, 4),
    'figure.dpi': 150,
})
sns.set()


import fashion_mnist

# load_data() yields a (train, test) pair; each half is an (images, labels) pair.
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

Using cached version that was downloaded (UTC): Thu Aug  3 21:04:31 2023
Using cached version that was downloaded (UTC): Thu Aug  3 21:04:31 2023
Using cached version that was downloaded (UTC): Thu Aug  3 21:04:30 2023
Using cached version that was downloaded (UTC): Thu Aug  3 21:04:30 2023


# Draw a fixed-seed random subset (without replacement) of each split.
n_train, n_test = 10000, 1000
rng = np.random.default_rng(42)
train_samples = rng.choice(np.arange(len(train_images)), size=n_train, replace=False)
test_samples = rng.choice(np.arange(len(test_images)), size=n_test, replace=False)

# Keep only the sampled rows; images and labels are indexed identically.
train_images = train_images[train_samples]
train_labels = train_labels[train_samples]
test_images = test_images[test_samples]
test_labels = test_labels[test_samples]

train_images.shape, test_images.shape

((10000, 28, 28), (1000, 28, 28))


# Class names, positioned so that list index == integer label.
class_names = [
    'T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
    'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot',
]
# Integer label -> class name lookup.
class_dict = dict(enumerate(class_names))

def show_train_image(index):
    """Display one training image in a new figure, titled with its class name.

    `index` selects the entry of the module-level `train_images` and
    `train_labels` arrays to show.
    """
    plt.figure()
    image = train_images[index]
    # The binary colormap renders the picture in grayscale.
    plt.imshow(image, cmap=plt.cm.binary)
    caption = class_names[train_labels[index]]
    plt.title(caption)
    plt.colorbar()  # side bar showing the value each shade stands for
    plt.show()


# Simply run this cell
# Shows the first image of the sampled training set, titled with its class name.
show_train_image(0)


# Report the number of classes, then the class names in alphabetical order.
print(len(class_names), sorted(class_names), sep="\n")

10
['Ankle boot', 'Bag', 'Coat', 'Dress', 'Pullover', 'Sandal', 'Shirt', 'Sneaker', 'T-shirt/top', 'Trouser']


# Simply run this cell
# Draws the first 25 training images in a 5x5 grid, each labelled with its class name.
# see documentation for subplot here:
# https://matplotlib.org/3.2.1/api/_as_gen/matplotlib.pyplot.subplot.html
plt.figure(figsize=(10,10))
for i in range(25):
    # subplot positions are 1-based, hence i+1
    plt.subplot(5,5,i+1)
    # hide ticks and grid lines so only the picture is visible
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(train_images[i], cmap=plt.cm.binary)
    plt.xlabel(class_names[train_labels[i]])


# just run this cell
# Divide pixel intensities by 255, then print each split's min/max as a check.
train_images = train_images / 255
test_images = test_images / 255

for split_name, split_images in (('Train', train_images), ('Test', test_images)):
    print(f'{split_name} Min:{split_images.min()} Max:{split_images.max()}')

show_train_image(0)

Train Min:0.0 Max:1.0
Test Min:0.0 Max:1.0


# Flatten every image into a single row of pixel values.
train_images_vectors = train_images.reshape(len(train_images), -1)
test_images_vectors = test_images.reshape(len(test_images), -1)
train_images_vectors.shape, test_images_vectors.shape

((10000, 784), (1000, 784))


# One row per training image: pixel columns plus integer label and class name.
LABEL_COLS = ['label', 'class']

train_df = pd.DataFrame(train_images_vectors)
# Capture the pixel column names before the label columns are appended.
PIXEL_COLS = train_df.columns.tolist()

train_df['label'] = train_labels
train_df['class'] = train_df['label'].map(class_dict)

# Put the label columns first so they are easy to see when the frame is displayed.
cols_reorder = LABEL_COLS + PIXEL_COLS
train_df = train_df[cols_reorder]
train_df


# Same layout for the test split: label columns first, then pixel columns.
test_df = pd.DataFrame(test_images_vectors)
test_df['label'] = test_labels
test_df['class'] = test_df['label'].map(class_dict)

cols_reorder = LABEL_COLS + PIXEL_COLS
test_df = test_df.loc[:, cols_reorder]


from sklearn.decomposition import PCA

n_comps = 50
PCA_COLS = [f"pc{k}" for k in range(1, n_comps + 1)]

# Fit PCA on the training pixels, then project those same rows onto the components.
train_pixels = train_df[PIXEL_COLS]
pca = PCA(n_components=n_comps).fit(train_pixels)
principal_components = pca.transform(train_pixels)


# Training images projected onto the first 50 principal components:
# one row per image, one column per component.
principal_components.shape

(10000, 50)


# Fraction of the total variance explained by each fitted component.
pca.explained_variance_ratio_

array([0.29204903, 0.17695613, 0.0596195 , 0.05026812, 0.03859675,
       0.03463695, 0.0236638 , 0.0184821 , 0.01302936, 0.01288355,
       0.00984057, 0.00914692, 0.00767584, 0.00674526, 0.00612661,
       0.00584359, 0.00559562, 0.00532868, 0.00458649, 0.00443497,
       0.00439464, 0.00410487, 0.00386062, 0.00371124, 0.00360238,
       0.00348955, 0.00331063, 0.0031289 , 0.00307494, 0.00285894,
       0.00272438, 0.00265853, 0.00263831, 0.00253123, 0.00251117,
       0.00238949, 0.00230739, 0.00224068, 0.00218739, 0.0020943 ,
       0.00204936, 0.00197996, 0.00193978, 0.00182668, 0.00175013,
       0.001736  , 0.00169792, 0.00165915, 0.00161304, 0.00154818])


# Fraction of the total variance captured by all n_comps components together.
np.sum(pca.explained_variance_ratio_)

0.8631295806642588


# Fraction of the total variance explained by PC1 and PC2 together
np.sum(pca.explained_variance_ratio_[:2])

0.46900515837502466


# Scree plot: percentage of variance explained by each component (numbered from 1).
component_numbers = np.arange(1, n_comps + 1)
percent_variance = 100 * pca.explained_variance_ratio_
plt.plot(component_numbers, percent_variance, marker='.')
plt.ylabel("% variance")
plt.xlabel("Component Number");


def build_comps_df(components, label_df, colnames):
    """Wrap a component matrix in a DataFrame and attach class/label columns.

    Parameters
    ----------
    components : 2-D array-like
        One row per sample, one column per component.
    label_df : DataFrame
        Must have "class" and "label" columns, with rows in the same order
        as `components`.
    colnames : list of str
        Column names for the component columns.

    Returns
    -------
    DataFrame
        The component columns followed by "class" and "label".

    Raises
    ------
    ValueError
        If `label_df` does not have the same number of rows as `components`.
    """
    df = pd.DataFrame(data=components,
                      columns=colnames)
    # Assign by position, not by index: `components` is a plain array whose
    # rows line up with label_df's rows in order. Index-aligned assignment
    # would yield NaN or shifted labels if label_df had a non-default index.
    df["class"] = np.asarray(label_df["class"])
    df["label"] = np.asarray(label_df["label"])
    return df


pca_df = build_comps_df(principal_components, train_df, PCA_COLS)
# Scatter of the first two principal components; uncomment `hue` to color by class.
pca_scatter_style = {"s": 50, "alpha": 0.2}
sns.lmplot(data=pca_df,
           x='pc1',
           y='pc2',
           # hue='class',
           fit_reg=False,
           height=9,
           scatter_kws=pca_scatter_style)
plt.title("PCA visualization (PC2 vs. PC1)");


from sklearn.manifold import TSNE


# Simply run this cell
TSNE_COLS = ["z1", "z2"]

# 2-D t-SNE with a fixed seed. The iteration count is left at scikit-learn's
# default (1000) rather than passed as `n_iter=1000`: newer releases renamed
# that keyword to `max_iter`, so omitting it gives the same setting on both
# old and new versions.
tsne_model = TSNE(n_components=2, random_state=0, perplexity=30, learning_rate=200)


# Embed the PCA-projected training set (not the raw pixels) into 2-D with t-SNE.
tsne_comps = tsne_model.fit_transform(principal_components)


tsne_df = build_comps_df(tsne_comps, train_df, TSNE_COLS)

# Scatter of the 2-D t-SNE embedding.
# Uncomment `hue` to see how the clusters correspond to class labels.
tsne_scatter_style = {"s": 50, "alpha": 0.4}
sns.lmplot(data=tsne_df,
           x='z1',
           y='z2',
           # hue='class',
           fit_reg=False,
           height=9,
           scatter_kws=tsne_scatter_style)
plt.title("t-SNE embedding visualization");


def get_data_subset_binary(df, class0, class1, featnames, shuffle=True, shuffle_seed=42):
    """Extract a two-class subset of `df` as feature and binary-label arrays.

    Rows whose "class" column equals `class0` or `class1` are kept in their
    original order. `class0` rows get label 0 and `class1` rows get label 1.
    `df` itself is not modified.

    Parameters
    ----------
    df : DataFrame
        Must have a "class" column plus every column named in `featnames`.
    class0, class1
        Class values mapped to binary labels 0 and 1 respectively.
    featnames : list
        Feature column names to return.
    shuffle, shuffle_seed
        Accepted for interface compatibility but currently unused: rows are
        never shuffled.

    Returns
    -------
    (data, labels)
        `data` is the feature matrix of the kept rows; `labels` is the
        matching 0/1 array.
    """
    classes = df["class"]
    df_filtered = df[(classes == class0) | (classes == class1)].copy()
    # Derive the label from the filtered frame's own column, as a plain array,
    # so nothing depends on aligning a mask built from the unfiltered frame
    # (that alignment breaks when the index has duplicate labels).
    is_class1 = (df_filtered["class"] == class1).to_numpy()
    df_filtered["binary_label"] = is_class1.astype(int)
    data = df_filtered[featnames].values
    labels = df_filtered["binary_label"].values
    return data, labels


# Training subset for classifier 1: Trouser (0) vs. Ankle boot (1) on PCA features.
data_bottoms, labels_bottoms = get_data_subset_binary(
    pca_df, "Trouser", "Ankle boot", PCA_COLS
)
print(f"{len(data_bottoms)} datapoints")

1972 datapoints


from sklearn.linear_model import LogisticRegression

# Classifier 1: logistic regression fitted with the SAG solver, fixed seed.
model_bottoms = LogisticRegression(solver="sag", random_state=42)


model_bottoms.fit(data_bottoms, labels_bottoms)

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

LogisticRegression(random_state=42, solver='sag')

LogisticRegression(random_state=42, solver='sag')


from sklearn.metrics import accuracy_score

# Training accuracy: the model is scored on the same data it was fitted on.
preds_bottoms = model_bottoms.predict(data_bottoms)
accuracy_score(labels_bottoms, preds_bottoms)

1.0


# Plot the Trouser-vs-Ankle-boot training points on the first two principal
# components, colored by binary label (0 = Trouser, 1 = Ankle boot).
visualize_df = pd.DataFrame(
    {"pc1": data_bottoms[:,0],
     "pc2": data_bottoms[:,1],
     "label": labels_bottoms
    })
sns.lmplot(x="pc1",
           y="pc2",
           fit_reg=False,
           data=visualize_df,
           hue="label",
           height=4,
           scatter_kws={"s":50,"alpha":0.2})
plt.title("PCA visualization (PC2 vs. PC1)");


# 1. project test data into "principal components" space, using the PCA that
#    was fitted on the training set. The result gets its own name so the
#    training-set `principal_components` is not overwritten (re-running an
#    earlier cell would otherwise silently pick up test data).
principal_components_test = pca.transform(test_df[PIXEL_COLS])
pca_test_df = build_comps_df(principal_components_test, test_df, PCA_COLS)

# 2. subset with target binary labels
data_bottoms_test, labels_bottoms_test = \
    get_data_subset_binary(pca_test_df, "Trouser", "Ankle boot", PCA_COLS)
print(len(data_bottoms_test), "datapoints")

209 datapoints


# 3. predict using logistic regression
# Test accuracy of classifier 1 on the held-out Trouser/Ankle boot subset.
preds_bottoms_test = model_bottoms.predict(data_bottoms_test)
accuracy_score(labels_bottoms_test, preds_bottoms_test)

1.0


# Training subset for classifier 2: Pullover (0) vs. Coat (1) on PCA features.
data_tops, labels_tops = get_data_subset_binary(
    pca_df, "Pullover", "Coat", PCA_COLS
)
print(f"{len(data_tops)} datapoints")

2003 datapoints


from sklearn.linear_model import LogisticRegression

# Classifier 2: same solver and seed as classifier 1, fitted on Pullover vs. Coat.
model_tops = LogisticRegression(solver="sag", random_state=42, verbose=False)
model_tops.fit(data_tops, labels_tops)

LogisticRegression(random_state=42, solver='sag', verbose=False)

LogisticRegression(random_state=42, solver='sag', verbose=False)


# Training accuracy of classifier 2 (scored on the data it was fitted on).
preds_tops = model_tops.predict(data_tops)
accuracy_score(labels_tops, preds_tops)

0.8642036944583126


# 1. (done earlier) test data is already projected into "principal components" space
# 2. subset with target binary labels
data_tops_test, labels_tops_test = get_data_subset_binary(
    pca_test_df, "Pullover", "Coat", PCA_COLS
)
print(f"{len(data_tops_test)} datapoints")

190 datapoints


# 3. predict using logistic regression
# Test accuracy of classifier 2 on the held-out Pullover/Coat subset.
preds_tops_test = model_tops.predict(data_tops_test)
accuracy_score(labels_tops_test, preds_tops_test)

0.8736842105263158


# Plot the Pullover-vs-Coat *training* points on the first two principal
# components, colored by binary label (0 = Pullover, 1 = Coat).
visualize_df = pd.DataFrame(
    {"pc1": data_tops[:,0],
     "pc2": data_tops[:,1],
     "label": labels_tops
    })
sns.lmplot(x="pc1",
           y="pc2",
           fit_reg=False,
           data=visualize_df,
           hue="label",
           height=4,
           scatter_kws={"s":50,"alpha":0.2})
# The data shown is the training subset, so the title carries no "(test)" tag.
plt.title("PCA visualization (PC2 vs. PC1)");


# Training subset for classifier 3: Trouser (0) vs. Ankle boot (1) on raw pixel columns.
data3, labels3 = get_data_subset_binary(
    train_df, "Trouser", "Ankle boot", PIXEL_COLS
)
print(f"{len(data3)} datapoints")

1972 datapoints


from sklearn.linear_model import LogisticRegression

# Classifier 3: SAG logistic regression on raw pixels, capped at 30 iterations.
model3 = LogisticRegression(solver="sag", max_iter=30, random_state=42, verbose=False)
model3.fit(data3, labels3)

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/linear_model/_sag.py:350: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
  warnings.warn(

LogisticRegression(max_iter=30, random_state=42, solver='sag', verbose=False)

LogisticRegression(max_iter=30, random_state=42, solver='sag', verbose=False)


# Training accuracy of classifier 3 (scored on the data it was fitted on).
preds3 = model3.predict(data3)
accuracy_score(labels3, preds3)

1.0


# Evaluate classifier 3 on held-out data: the same two classes and raw-pixel
# features it was trained on, taken from the test split.
data3_test, labels3_test = get_data_subset_binary(test_df, "Trouser", "Ankle boot", PIXEL_COLS)
print(len(data3_test), "datapoints")

preds3_test = model3.predict(data3_test)
accuracy_score(labels3_test, preds3_test)

2003 datapoints

1.0

	label	class	0	1	2	3	4	5	6	7	...	774	775	776	777	778	779	780	781	782	783
0	1	Trouser	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.003922	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0
1	3	Dress	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.243137	0.160784	0.000000	0.000000	0.007843	0.000000	0.000000	0.0	0.0	0.0
2	3	Dress	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.596078	0.584314	0.203922	0.196078	0.000000	0.000000	0.000000	0.0	0.0	0.0
3	7	Sneaker	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0
4	8	Bag	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.576471	0.568627	0.509804	0.470588	0.556863	0.168627	0.000000	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9995	2	Pullover	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.600000	0.223529	0.047059	0.0	0.0	0.0
9996	0	T-shirt/top	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.000000	0.317647	0.137255	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0
9997	0	T-shirt/top	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.007843	...	0.494118	0.619608	0.003922	0.000000	0.011765	0.000000	0.000000	0.0	0.0	0.0
9998	6	Shirt	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0
9999	4	Coat	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.000000	...	0.003922	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.0	0.0	0.0

Fashion MNIST using PCA

Load the Fashion-MNIST dataset

Load data

Visualizing images

Goals of this demo

Preprocess:

Normalize to 1

Reshape features into 1-D

PCA

Explained Variance from PCs

EDA: visualizations

Optional: t-SNE: a random embedding

Logistic Regression

Classifier 1: Trouser vs. Ankle boot

Classifier 1: Linearly separable?

Classifier 1: Test performance

Classifier 2: Pullover vs. Coat

Classifier 2: Linearly Separable?

Classifier 3: Original Pixels, Trousers vs. Ankle Boots