Lecture 25 – Data 100, Fall 2024¶


In [47]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import yaml
from datetime import datetime
from ds100_utils import *
import plotly.express as px

PCA with SVD¶

In [48]:
rectangle = pd.read_csv("data/rectangle_data.csv")
rectangle
Out[48]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
... ... ... ... ...
95 8 5 40 26
96 8 7 56 30
97 1 4 4 10
98 1 6 6 14
99 2 6 12 16

100 rows × 4 columns
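Before running PCA, it's worth noting that the columns are not independent: each rectangle's perimeter is exactly 2 * (width + height), and its area is width * height. A quick sanity check on the DataFrame we just loaded:

# perimeter is an exact linear combination of width and height
np.allclose(rectangle["perimeter"], 2 * (rectangle["width"] + rectangle["height"]))

This linear dependence will resurface when we look at the singular values.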

Step 1: Center the Data Matrix $X$¶

In [49]:
X = rectangle - np.mean(rectangle, axis = 0)
X.head(10)
Out[49]:
width height area perimeter
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
5 -2.03 -3.65 -20.22 -11.36
6 -1.03 -2.65 -15.22 -7.36
7 0.97 0.35 6.78 2.64
8 1.97 -3.65 -16.22 -3.36
9 2.97 -2.65 -7.22 0.64

In situations where the columns have different units or scales, it is useful to normalize the data before performing the SVD. This can be done by dividing each column by its standard deviation.

In [50]:
X = X / np.std(X, axis = 0)

Step 2: Get the SVD of centered $X$¶

In [51]:
U, S, Vt = np.linalg.svd(X, full_matrices = False)
In [52]:
print("Shape of U", U.shape)
print("Shape of S", S.shape)
print("Shape of Vt", Vt.shape)
Shape of U (100, 4)
Shape of S (4,)
Shape of Vt (4, 4)

$S$ is a little different in NumPy. Since the only useful values in the diagonal matrix $S$ are the singular values on the diagonal, NumPy returns only those values, stored in a 1-D array.

If we want the diagonal elements:

In [53]:
Sm = np.diag(S)
Sm
Out[53]:
array([[1.69892862e+01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.01345580e+01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.94191917e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.64206029e-15]])

Hmm, it looks like all four diagonal entries are nonzero. What happened?

It turns out there were some numerical rounding errors, but the last value is so small ($10^{-15}$) that it's practically $0$. That is what we should expect: as we saw above, perimeter is an exact linear combination of width and height, so the centered data matrix has rank 3, and its fourth singular value is (numerically) zero.

In [54]:
np.isclose(S[3], 0)
Out[54]:
True
In [55]:
S.round(5)
Out[55]:
array([16.98929, 10.13456,  2.94192,  0.     ])
In [56]:
pd.DataFrame(np.round(np.diag(S),3))
Out[56]:
0 1 2 3
0 16.989 0.000 0.000 0.0
1 0.000 10.135 0.000 0.0
2 0.000 0.000 2.942 0.0
3 0.000 0.000 0.000 0.0
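As a sanity check, the three factors should multiply back to the centered, standardized data matrix; a quick check using the variables defined above:

# U @ diag(S) @ Vt reproduces X up to floating-point error
np.allclose(U @ Sm @ Vt, X)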

Computing the contribution to the total variance:

In [57]:
pd.DataFrame(np.round(S**2 / X.shape[0], 3))
Out[57]:
0
0 2.886
1 1.027
2 0.087
3 0.000

Now we see that most of the variance is in the first two dimensions, which makes sense since rectangles are largely described by two numbers.
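To make this precise, we can look at the cumulative fraction of the total variance captured by the first $k$ components:

# cumulative fraction of total variance; the first two components capture nearly all of it
np.cumsum(S**2) / np.sum(S**2)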

Step 3: Computing Approximations to the Data¶

Let's try to approximate this data in two dimensions.

Using $Z = U * S$¶

In [58]:
Z = U[:, :2] @ np.diag(S[:2])
pd.DataFrame(Z).head()
Out[58]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623

Using $Z = X * V$¶

In [59]:
Z = X.to_numpy() @ Vt.T[:,:2]
pd.DataFrame(Z).head()
Out[59]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623
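The two formulations agree, since $XV = USV^\top V = US$ ($V$ has orthonormal columns). A quick numerical check with the variables defined above:

# both ways of computing the first two principal components match
np.allclose(U[:, :2] @ np.diag(S[:2]), X.to_numpy() @ Vt.T[:, :2])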
In [60]:
px.scatter(x=Z[:, 0], y=Z[:, 1], render_mode="svg")

Comparing to scikit-learn

In [61]:
from sklearn.decomposition import PCA
pca = PCA(2)
pd.DataFrame(pca.fit_transform(X)).head(5)
Out[61]:
0 1
0 2.165116 0.250465
1 -1.659346 -0.502647
2 -2.462645 -0.416318
3 0.856182 1.483758
4 3.884287 -0.182623
In [62]:
pd.DataFrame(Z).head()
Out[62]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623
In [63]:
pd.DataFrame(pca.fit_transform(X)).head(5)
Out[63]:
0 1
0 2.165116 0.250465
1 -1.659346 -0.502647
2 -2.462645 -0.416318
3 0.856182 1.483758
4 3.884287 -0.182623

Notice that scikit-learn's projection matches ours up to the sign of each column; principal components are only determined up to sign. Also notice that the covariance matrix of the transformed data is diagonal.

In [64]:
pd.DataFrame(np.cov(Z.T))
Out[64]:
0 1
0 2.915514e+00 -1.152324e-16
1 -1.152324e-16 1.037467e+00
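The diagonal entries are the squared singular values scaled by $n-1$ (np.cov divides by $n-1$ by default); a quick check:

# the variance along each principal component is S_i^2 / (n - 1)
np.allclose(np.diag(np.cov(Z.T)), S[:2]**2 / (Z.shape[0] - 1))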

Lower Rank Approximation of X¶

Let's now try to recover X from our approximation:

In [65]:
rectangle.head()
Out[65]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
In [66]:
k = 2
U, S, Vt = np.linalg.svd(X, full_matrices = False)

## Construct the latent factors
Z = U[:,:k] @ np.diag(S[:k])

## Reconstruct the original rectangle using the factors Z and the principal components
rectangle_hat = pd.DataFrame(Z @ Vt[:k, :], columns = rectangle.columns)

## Scale and shift the factors back to the original coordinate system
rectangle_hat = rectangle_hat * np.std(rectangle, axis = 0) + np.mean(rectangle, axis = 0)


## Plot the data
fig = px.scatter_3d(rectangle, x="width", y="height", z="area",
                    width=800, height=600)
fig.add_scatter3d(x=rectangle_hat["width"], 
                  y=rectangle_hat["height"], 
                  z=rectangle_hat["area"], 
                  mode="markers", name = "approximation")




Congressional Vote Records¶

Let's examine how the House of Representatives (of the 116th Congress, 1st session) voted in the month of September 2019.

From the U.S. Senate website:

Roll call votes occur when a representative or senator votes "yea" or "nay," so that the names of members voting on each side are recorded. A voice vote is a vote in which those in favor or against a measure say "yea" or "nay," respectively, without the names or tallies of members voting on each side being recorded.

The data, compiled from a ProPublica source, is a "skinny" table where each record is a single vote by a member on a roll call in the 116th Congress, 1st session, as downloaded in February 2020. Each member of the House, whom we'll call a legislator, is identified by their alphanumeric bioguide ID from http://bioguide.congress.gov/.

In [67]:
# September 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('data/votes.csv')
votes = votes.astype({"roll call": str}) 
votes
Out[67]:
chamber session roll call member vote
0 House 1 555 A000374 Not Voting
1 House 1 555 A000370 Yes
2 House 1 555 A000055 No
3 House 1 555 A000371 Yes
4 House 1 555 A000372 No
... ... ... ... ... ...
17823 House 1 515 Y000062 Yes
17824 House 1 515 Y000065 No
17825 House 1 515 Y000033 Yes
17826 House 1 515 Z000017 Yes
17827 House 1 515 P000197 Speaker

17828 rows × 5 columns

Suppose we pivot this table to group each legislator and their voting pattern across every (roll call) vote in this month. We mark 1 if the legislator voted Yes (yea), and 0 otherwise (No/nay, no vote, speaker, etc.).

In [68]:
def was_yes(s):
    return 1 if s.iloc[0] == "Yes" else 0    
vote_pivot = votes.pivot_table(index='member', 
                                columns='roll call', 
                                values='vote', 
                                aggfunc=was_yes, 
                                fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()    
(441, 41)
Out[68]:
roll call 515 516 517 518 519 520 521 522 523 524 ... 546 547 548 549 550 551 552 553 554 555
member
A000055 1 0 0 0 1 1 0 1 1 1 ... 0 0 1 0 0 1 0 0 1 0
A000367 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 1 1 0 1 1 0 1
A000369 1 1 0 0 1 1 0 1 1 1 ... 0 0 1 0 0 1 0 0 1 0
A000370 1 1 1 1 1 0 1 0 0 0 ... 1 1 1 1 1 0 1 1 1 1
A000371 1 1 1 1 1 0 1 0 0 0 ... 1 1 1 1 1 0 1 1 1 1

5 rows × 41 columns

How do we analyze this data?

While we could consider loading information about the legislator, such as their party, and see how this relates to their voting pattern, it turns out that we can do a lot with PCA to cluster legislators by how they vote.

PCA¶

In [69]:
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered
Out[69]:
roll call 515 516 517 518 519 520 521 522 523 524 ... 546 547 548 549 550 551 552 553 554 555
member
A000055 0.129252 -0.668934 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
A000367 -0.870748 -0.668934 -0.526077 -0.52381 -0.950113 -0.412698 -0.562358 -0.365079 -0.405896 -0.439909 ... -0.521542 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 -0.913832 0.496599
A000369 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
A000370 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
A000371 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
W000827 -0.870748 -0.668934 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 -0.954649 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Y000033 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Y000062 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
Y000065 -0.870748 -0.668934 -0.526077 -0.52381 -0.950113 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Z000017 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401

441 rows × 41 columns
In [70]:
vote_pivot_centered.shape
Out[70]:
(441, 41)
In [71]:
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)
In [72]:
print("u.shape", u.shape)
print("s.shape", s.shape)
print("vt.shape", vt.shape)
u.shape (441, 41)
s.shape (41,)
vt.shape (41, 41)

PCA plot¶

In [73]:
vote_2d = pd.DataFrame(index = vote_pivot_centered.index)
vote_2d[["z1", "z2", "z3"]] = (u * s)[:, :3]
px.scatter(vote_2d, x='z1', y='z2', title='Vote Data', width=800, height=600, render_mode="svg")

It would be interesting to see the political affiliation of each legislator.

Component Scores¶

If the first two singular values are large and all others are small, then two dimensions are enough to describe most of what distinguishes one observation from another. If not, then a PCA scatter plot is omitting lots of information.

An equivalent way to evaluate this is to determine the variance ratios, i.e., the fraction of the variance each PC contributes to total variance.
In [74]:
np.round(s**2 / sum(s**2), 2)
Out[74]:
array([0.8 , 0.05, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])
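The first component alone accounts for 80% of the variance, and the first two together about 85%. A cumulative sum makes this explicit:

# cumulative fraction of variance explained by the leading components
np.round(np.cumsum(s**2 / sum(s**2)), 2)[:5]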

Scree plot¶

A scree plot (and where its "elbow" is located) is a visual way of checking the distribution of variance.

In [75]:
fig = px.line(y=s**2 / sum(s**2), title='Variance Explained', width=700, height=400, markers=True)
fig.update_xaxes(title_text='Principal Component')
fig.update_yaxes(title_text='Proportion of Variance Explained')
In [76]:
fig = px.scatter_3d(vote_2d, x='z1', y='z2', z='z3', title='Vote Data', width=800, height=600)
fig.update_traces(marker=dict(size=5))

Based on the plot above, it looks like there are two clusters of data points. What do you think these correspond to?

Incorporating Member Information¶

Suppose we load in more member information, from https://github.com/unitedstates/congress-legislators. This includes each legislator's political party.

In [77]:
# You can get current information about legislators with this code. In our case, we'll use
# a static copy of the 2019 membership roster to properly match our voting data.

# base_url = 'https://raw.githubusercontent.com/unitedstates/congress-legislators/main/'
# legislators_path = 'legislators-current.yaml'
# f = fetch_and_cache(base_url + legislators_path, legislators_path)

# Use 2019 data copy
legislators_data = yaml.safe_load(open('data/legislators-2019.yaml'))

def to_date(s):
    return datetime.strptime(s, '%Y-%m-%d')

legs = pd.DataFrame(
    columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
    data=[[x['id']['bioguide'], 
           x['name']['first'],
           x['name']['last'],
           x['bio']['gender'],
           x['terms'][-1]['state'],
           x['terms'][-1]['type'],
           x['terms'][-1]['party'],
           to_date(x['bio']['birthday'])] for x in legislators_data])
legs['age'] = 2024 - legs['birthday'].dt.year
legs.sort_index()
Out[77]:
leg_id first last gender state chamber party birthday age
0 B000944 Sherrod Brown M OH sen Democrat 1952-11-09 72
1 C000127 Maria Cantwell F WA sen Democrat 1958-10-13 66
2 C000141 Benjamin Cardin M MD sen Democrat 1943-10-05 81
3 C000174 Thomas Carper M DE sen Democrat 1947-01-23 77
4 C001070 Robert Casey M PA sen Democrat 1960-04-13 64
... ... ... ... ... ... ... ... ... ...
534 M001197 Martha McSally F AZ sen Republican 1966-03-22 58
535 G000592 Jared Golden M ME rep Democrat 1982-07-25 42
536 K000395 Fred Keller M PA rep Republican 1965-10-23 59
537 B001311 Dan Bishop M NC rep Republican 1964-07-01 60
538 M001210 Gregory Murphy M NC rep Republican 1963-03-05 61

539 rows × 9 columns

We can combine the vote data projected onto the principal components with the biographic data.

In [78]:
vote_2d = vote_2d.join(legs.set_index('leg_id')).dropna()

Then we can visualize this data all at once.

In [79]:
px.scatter(vote_2d, x='z1', y='z2', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'],
           render_mode="svg")

There seems to be a bunch of overplotting, so let's jitter a bit.

In [80]:
np.random.seed(42)
vote_2d['z1_jittered'] = vote_2d['z1'] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d['z2_jittered'] = vote_2d['z2'] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d['z3_jittered'] = vote_2d['z3'] + np.random.normal(0, 0.1, len(vote_2d))
In [81]:
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])
In [82]:
px.scatter_3d(
    vote_2d, x='z1_jittered', y='z2_jittered', z='z3_jittered', 
    color='party', symbol="gender", size='age',
    title='Vote Data', width=800, height=600, size_max=10,
    opacity = 0.7,
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age']
        )

Analysis: Regular Voters¶

Not everyone voted all the time. Let's examine the frequency of voting.

First, let's count each legislator's recorded votes, considering only "Yes"/"No" votes and ignoring "Not Voting" and other entries.

In [83]:
vote_2d["num votes"] = (
    votes[votes["vote"].isin(["Yes", "No"])]
        .groupby("member").size()
)
vote_2d.dropna(inplace=True)
vote_2d.head()
Out[83]:
z1 z2 z3 first last gender state chamber party birthday age z1_jittered z2_jittered z3_jittered num votes
member
A000055 3.061356 0.364191 0.194493 Robert Aderholt M AL rep Republican 1965-07-22 59.0 3.111028 0.358637 0.192851 40.0
A000367 0.188870 -2.433565 0.283280 Justin Amash M MI rep Independent 1980-04-18 44.0 0.175043 -2.395158 0.402119 41.0
A000369 2.844370 0.821619 -0.476126 Mark Amodei M NV rep Republican 1958-06-12 66.0 2.909139 0.818350 -0.223433 41.0
A000370 -2.607536 0.127977 0.031377 Alma Adams F NC rep Democrat 1946-05-27 78.0 -2.455233 -0.078767 -0.021710 41.0
A000371 -2.607536 0.127977 0.031377 Pete Aguilar M CA rep Democrat 1979-06-19 45.0 -2.630951 0.119065 -0.017567 41.0
In [84]:
# histogram of the number of votes cast by each member (log-scale x-axis)
px.histogram(vote_2d, x="num votes", log_x=True, width=800, height=600)
In [85]:
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
           title='Vote Data (Size is Number of Votes)', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'], 
           render_mode="svg")

Exploring the Principal Components¶

We can also look at Vt directly to try to gain insight into what each principal component represents.

In [86]:
fig_eig = px.bar(x=vote_pivot_centered.columns, y=vt[0,:])
# each bar is the weight of the first principal direction on one roll call
fig_eig

We have the party affiliation labels, so we can see whether this eigenvector aligns with one of the parties.

In [87]:
party_line_votes = (
    vote_pivot_centered.join(legs.set_index("leg_id")['party'])
                       .groupby("party").mean()
                       .T.reset_index()
                       .rename(columns={"index": "call"})
                       .melt("call")
)
fig = px.bar(
    party_line_votes,
    x="call", y="value", facet_row = "party", color="party",
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"})
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
In [88]:
fig_eig

Biplot¶

In [89]:
loadings = pd.DataFrame(
    {
    "pc1": np.sqrt(s[0]) * vt[0,:], 
    "pc2": np.sqrt(s[1]) * vt[1,:]
    }, 
    index=vote_pivot_centered.columns)   
loadings.head()
Out[89]:
pc1 pc2
roll call
515 -0.215368 1.147391
516 -0.846835 0.930164
517 -1.375641 0.180715
518 -1.371606 0.197301
519 -0.039021 0.827081
In [90]:
fig = px.scatter(
    vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
    title='Biplot', width=800, height=600, size_max=10,
    opacity = 0.7,
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age'],
    render_mode="svg")

for (call, pc1, pc2) in loadings.head(50).itertuples():
    fig.add_scatter(x=[0,pc1], y=[0,pc2], name=call, 
                    mode='lines+markers', textposition='top right',
                    marker= dict(size=10,symbol= "arrow-bar-up", angleref="previous"))
fig

Each roll call from the 116th Congress - 1st Session: https://clerk.house.gov/evs/2019/ROLL_500.asp

  • 555: Raising a question of the privileges of the House (H.Res.590)
  • 553: https://www.congress.gov/bill/116th-congress/senate-joint-resolution/54/actions
  • 527: On Agreeing to the Amendment H.R.1146 - Arctic Cultural and Coastal Plain Protection Act

Fashion-MNIST dataset¶

We will be using the Fashion-MNIST dataset, a cool little dataset of 28x28 grayscale images of articles of clothing.

Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms. Han Xiao, Kashif Rasul, Roland Vollgraf. arXiv:1708.07747 https://github.com/zalandoresearch/fashion-mnist

Load data¶

In [91]:
import fashion_mnist

(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
print("Training images", train_images.shape)
print("Test images", test_images.shape)
Downloading... Done!
Downloading... Done!
Downloading... Done!
Downloading... Done!
Training images (60000, 28, 28)
Test images (10000, 28, 28)

The class names for this data are:

In [92]:
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']
class_dict = {i:class_name for i, class_name in enumerate(class_names)}

We have loaded a lot of data, which you can play with later (try building a classifier).

For the purposes of this demo, let's take a small sample of the training data.

In [93]:
rng = np.random.default_rng(42)
n = 5000
sample_idx = rng.choice(np.arange(len(train_images)), size=n, replace=False)

# Invert and normalize the images so they look better
img_mat = -1. * train_images[sample_idx]
img_mat = (img_mat - img_mat.min())/(img_mat.max() - img_mat.min())

images = pd.DataFrame({"images": img_mat.tolist(), 
                   "labels": train_labels[sample_idx], 
                   "class": [class_dict[x] for x in train_labels[sample_idx]]})
images.head()
Out[93]:
images labels class
0 [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... 3 Dress
1 [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... 4 Coat
2 [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... 0 T-shirt/top
3 [[1.0, 1.0, 1.0, 1.0, 1.0, 0.996078431372549, ... 2 Pullover
4 [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,... 1 Trouser

Visualizing images¶

The following snippet of code visualizes the images:

In [94]:
def show_images(images, ncols=5, max_images=30):
    # convert the subset of images into an (n, 28, 28) array for facet visualization
    img_mat = np.array(images.head(max_images)['images'].to_list())
    fig = px.imshow(img_mat, color_continuous_scale='gray', 
                    facet_col = 0, facet_col_wrap=ncols,
                    height = 220*int(np.ceil(len(images)/ncols)))
    fig.update_layout(coloraxis_showscale=False)
    # Extract the facet number and convert it back to the class label.
    fig.for_each_annotation(lambda a: a.update(text=images.iloc[int(a.text.split("=")[-1])]['class']))
    return fig

show_images(images.head(20))

Let's look at each class:

In [95]:
show_images(images.groupby('class',as_index=False).sample(2), ncols=6)

PCA¶

How would we visualize the entire dataset? Let's use PCA to find a low dimensional representation of the images.

First, let's understand the high-dimensional representation. We will extract the matrix of images from the dataframe:

In [96]:
X = np.array(images['images'].to_list())
X.shape
Out[96]:
(5000, 28, 28)

We now "unroll" each image's pixels into a single row vector of 28*28 = 784 dimensions:

In [97]:
X = X.reshape(X.shape[0], -1)
X.shape
Out[97]:
(5000, 784)

Center the data:

In [98]:
X = X - X.mean(axis=0)

Run PCA (this time using scikit-learn):

In [99]:
from sklearn.decomposition import PCA
n_comps = 50 
pca = PCA(n_components=n_comps)
pca.fit(X)
Out[99]:
PCA(n_components=50)

Examining PCA Results¶

In [100]:
# make a line plot and show markers
px.line(y=pca.explained_variance_ratio_ *100, markers=True)

Most of the variance is concentrated in the first few dimensions.
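We can read the cumulative numbers directly off the fitted PCA object:

# cumulative percent of variance explained by the leading components
np.round(np.cumsum(pca.explained_variance_ratio_) * 100, 1)[:5]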

In [101]:
images[['z1', 'z2', 'z3']] = pca.transform(X)[:, :3]
In [102]:
px.scatter(images, x='z1', y='z2', hover_data=['labels'], opacity=0.7,
           width = 800, height = 600, render_mode="svg")
In [103]:
px.scatter(images, x='z1', y='z2', color='class', hover_data=['labels'], opacity=0.7, 
           width = 800, height = 600, render_mode="svg")
In [104]:
fig = px.scatter_3d(images, x='z1', y='z2', z='z3', color='class', hover_data=['labels'], 
                    width=1000, height=600)
# set marker size to 3
fig.update_traces(marker=dict(size=3))