Lecture 25 – Data 100, Spring 2024¶

Data 100, Spring 2024

Acknowledgments Page

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import yaml
from datetime import datetime
from ds100_utils import *
import plotly.express as px

PCA with SVD¶

In [5]:
rectangle = pd.read_csv("data/rectangle_data.csv")
rectangle
Out[5]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
... ... ... ... ...
95 8 5 40 26
96 8 7 56 30
97 1 4 4 10
98 1 6 6 14
99 2 6 12 16

100 rows × 4 columns

Step 1: Center the Data Matrix $X$¶

In [6]:
X = rectangle - np.mean(rectangle, axis = 0)
X.head(10)
Out[6]:
width height area perimeter
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
5 -2.03 -3.65 -20.22 -11.36
6 -1.03 -2.65 -15.22 -7.36
7 0.97 0.35 6.78 2.64
8 1.97 -3.65 -16.22 -3.36
9 2.97 -2.65 -7.22 0.64

In situations where the units are on different scales, it is useful to normalize the data before performing SVD. This can be done by dividing each centered column by its standard deviation.

In [7]:
Xstd = X / np.std(X, axis = 0)

Step 2: Get the SVD of centered $X$¶

In [8]:
U, S, Vt = np.linalg.svd(X, full_matrices = False)
In [9]:
print("Shape of U", U.shape)
print("Shape of S", S.shape)
print("Shape of Vt", Vt.shape)
Shape of U (100, 4)
Shape of S (4,)
Shape of Vt (4, 4)
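As a sanity check, the three factors should multiply back to the centered matrix. A minimal, self-contained sketch (using synthetic data in place of the rectangle matrix, so it runs without the CSV):

```python
import numpy as np

# Synthetic stand-in for the centered data matrix X.
rng = np.random.default_rng(0)
A = rng.normal(size=(100, 4))
Ac = A - A.mean(axis=0)  # center columns, as in Step 1

U, S, Vt = np.linalg.svd(Ac, full_matrices=False)

# U @ diag(S) @ Vt recovers the centered matrix (up to float error).
print(np.allclose(U @ np.diag(S) @ Vt, Ac))  # prints True
```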

$S$ is represented a little differently in NumPy. Since the only meaningful values in the diagonal matrix $S$ are the singular values on its diagonal, only those values are returned, stored in a 1-D array.

If we want the diagonal elements:

In [10]:
Sm = np.diag(S)
Sm
Out[10]:
array([[1.97388075e+02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.74346257e+01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.32626119e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.53898299e-15]])

Hmm, it looks like all four diagonal entries are nonzero. What happened?

It turns out there were some numerical rounding errors, but the last value is so small ($10^{-15}$) that it's practically $0$.

In [11]:
np.isclose(S[3], 0)
Out[11]:
True
In [12]:
S.round(5)
Out[12]:
array([197.38808,  27.43463,  23.26261,   0.     ])
In [13]:
pd.DataFrame(np.round(np.diag(S),3))
Out[13]:
0 1 2 3
0 197.388 0.000 0.000 0.0
1 0.000 27.435 0.000 0.0
2 0.000 0.000 23.263 0.0
3 0.000 0.000 0.000 0.0

Computing each component's contribution to the total variance:

In [14]:
pd.DataFrame(np.round(S**2 / X.shape[0], 3))
Out[14]:
0
0 389.621
1 7.527
2 5.411
3 0.000

Much of the variance is in the first dimension. This is likely because the area column takes on much larger values than the other columns. Let's examine the standardized version.

In [15]:
U, S, Vt = np.linalg.svd(Xstd, full_matrices = False)
In [16]:
pd.DataFrame(np.round(S**2 / X.shape[0], 2))
Out[16]:
0
0 2.89
1 1.03
2 0.09
3 0.00

Now we see that most of the variance is in the first two dimensions, which makes sense since rectangles are largely described by two numbers.
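A common follow-up is the cumulative variance: how much of the total variance the first $k$ PCs capture together. A small sketch, with synthetic data standing in for the standardized rectangle matrix:

```python
import numpy as np

# Synthetic stand-in for the standardized, centered data matrix.
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 4))
X = X - X.mean(axis=0)

_, S, _ = np.linalg.svd(X, full_matrices=False)
var_ratio = S**2 / np.sum(S**2)   # fraction of total variance per PC
cum_ratio = np.cumsum(var_ratio)  # fraction captured by the first k PCs

print(cum_ratio)  # nondecreasing, ends at 1.0
```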

Step 3: Computing Approximations to the Data¶

Let's try to approximate this data in two dimensions.

Using $Z = U * S$¶

In [17]:
Z = U[:, :2] @ np.diag(S[:2])
pd.DataFrame(Z).head()
Out[17]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623

Using $Z = X * V$¶

In [18]:
Z = Xstd.to_numpy() @ Vt.T[:,:2]
pd.DataFrame(Z).head()
Out[18]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623
In [19]:
px.scatter(x=Z[:, 0], y=Z[:, 1])

Comparing to scikit-learn

In [20]:
from sklearn.decomposition import PCA
pca = PCA(2)
pd.DataFrame(pca.fit_transform(X)).head(5)
Out[20]:
0 1
0 26.432217 0.162686
1 -17.045285 -2.181451
2 -23.245695 -3.538040
3 5.383546 5.025395
4 51.085217 -2.586948
In [21]:
pd.DataFrame(Z).head()
Out[21]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623
In [22]:
pd.DataFrame(pca.fit_transform(Xstd)).head(5)
Out[22]:
0 1
0 2.165116 0.250465
1 -1.659346 -0.502647
2 -2.462645 -0.416318
3 0.856182 1.483758
4 3.884287 -0.182623

Notice that scikit-learn's results match ours up to a sign flip; the sign of each principal component is arbitrary. Also notice that the covariance matrix of the transformed data is diagonal.

In [23]:
pd.DataFrame(np.cov(Z.T))
Out[23]:
0 1
0 2.915514e+00 -3.700743e-16
1 -3.700743e-16 1.037467e+00
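This diagonalization is not specific to the rectangle data: the columns of $Z = US$ are orthogonal, and since the data is centered, their covariance matrix is diagonal. A self-contained check on synthetic data:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
X = X - X.mean(axis=0)  # centering makes the PC scores mean-zero

U, S, Vt = np.linalg.svd(X, full_matrices=False)
Z = U @ np.diag(S)      # principal component scores

C = np.cov(Z.T)
off_diag = C - np.diag(np.diag(C))
print(np.allclose(off_diag, 0))  # prints True: PCs are uncorrelated
```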

Lower Rank Approximation of X¶

Let's now try to recover X from our approximation:

In [24]:
rectangle.head()
Out[24]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
In [25]:
k = 2
U, S, Vt = np.linalg.svd(Xstd, full_matrices = False)
scaling = np.diag(np.std(X, axis = 0))
# scaling = np.eye(X.shape[1])
Z = U[:,:k] @ np.diag(S[:k])

rectangle_hat = pd.DataFrame(
    (Z @ Vt[:k, :]) @ scaling + np.mean(rectangle, axis = 0).to_numpy(),
    columns = rectangle.columns)

display(rectangle_hat.head(3))

fig = px.scatter_3d(rectangle, x="width", y="height", z="area",
                    width=800, height=800)
fig.add_scatter3d(x=rectangle_hat["width"], y=rectangle_hat["height"], z=rectangle_hat["area"], 
                  mode="markers", name = "approximation")
width height area perimeter
0 8.111911 6.093164 45.859191 28.410150
1 2.103964 4.086549 6.011204 12.381026
2 1.285956 3.238055 -2.470229 9.048021
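The rank-2 reconstruction above is, in fact, the best possible rank-2 approximation in the least-squares sense (the Eckart–Young theorem): the squared Frobenius error equals the sum of the discarded squared singular values. A self-contained check on synthetic data:

```python
import numpy as np

rng = np.random.default_rng(3)
X = rng.normal(size=(100, 4))
X = X - X.mean(axis=0)

U, S, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
X_k = U[:, :k] @ np.diag(S[:k]) @ Vt[:k, :]  # rank-k approximation

err = np.linalg.norm(X - X_k, "fro") ** 2
print(np.isclose(err, np.sum(S[k:] ** 2)))  # prints True
```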




Congressional Vote Records¶

Let's examine how the House of Representatives (of the 116th Congress, 1st session) voted in the month of September 2019.

From the U.S. Senate website:

Roll call votes occur when a representative or senator votes "yea" or "nay," so that the names of members voting on each side are recorded. A voice vote is a vote in which those in favor or against a measure say "yea" or "nay," respectively, without the names or tallies of members voting on each side being recorded.

The data, compiled from ProPublica, is a "skinny" table where each record is a single vote by a member on a roll call in the 116th Congress, 1st session, as downloaded in February 2020. Each member of the House, whom we'll call a legislator, is identified by their bioguide alphanumeric ID from http://bioguide.congress.gov/.

In [26]:
# September 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('data/votes.csv')
votes = votes.astype({"roll call": str}) 
votes
Out[26]:
chamber session roll call member vote
0 House 1 555 A000374 Not Voting
1 House 1 555 A000370 Yes
2 House 1 555 A000055 No
3 House 1 555 A000371 Yes
4 House 1 555 A000372 No
... ... ... ... ... ...
17823 House 1 515 Y000062 Yes
17824 House 1 515 Y000065 No
17825 House 1 515 Y000033 Yes
17826 House 1 515 Z000017 Yes
17827 House 1 515 P000197 Speaker

17828 rows × 5 columns

Suppose we pivot this table to group each legislator and their voting pattern across every (roll call) vote in this month. We mark 1 if the legislator voted Yes (yea), and 0 otherwise (No/nay, no vote, speaker, etc.).

In [27]:
def was_yes(s):
    return 1 if s.iloc[0] == "Yes" else 0    
vote_pivot = votes.pivot_table(index='member', 
                                columns='roll call', 
                                values='vote', 
                                aggfunc=was_yes, 
                                fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()    
(441, 41)
Out[27]:
roll call 515 516 517 518 519 520 521 522 523 524 ... 546 547 548 549 550 551 552 553 554 555
member
A000055 1 0 0 0 1 1 0 1 1 1 ... 0 0 1 0 0 1 0 0 1 0
A000367 0 0 0 0 0 0 0 0 0 0 ... 0 1 1 1 1 0 1 1 0 1
A000369 1 1 0 0 1 1 0 1 1 1 ... 0 0 1 0 0 1 0 0 1 0
A000370 1 1 1 1 1 0 1 0 0 0 ... 1 1 1 1 1 0 1 1 1 1
A000371 1 1 1 1 1 0 1 0 0 0 ... 1 1 1 1 1 0 1 1 1 1

5 rows × 41 columns
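To make the pivot concrete, here is the same transformation applied to a tiny hypothetical dataset (two members, A and B, and two roll calls):

```python
import pandas as pd

# Hypothetical mini vote table in the same "skinny" format.
mini = pd.DataFrame({
    "member":    ["A", "A", "B", "B"],
    "roll call": ["515", "516", "515", "516"],
    "vote":      ["Yes", "No", "Not Voting", "Yes"],
})

pivot = mini.pivot_table(index="member", columns="roll call", values="vote",
                         aggfunc=lambda s: 1 if s.iloc[0] == "Yes" else 0,
                         fill_value=0)
print(pivot)  # row A: [1, 0]; row B: [0, 1]
```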

How do we analyze this data?

We could load information about each legislator, such as their party, and see how it relates to their voting pattern. But it turns out that PCA alone can do a lot to cluster legislators by how they vote.

PCA¶

In [28]:
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered
Out[28]:
roll call 515 516 517 518 519 520 521 522 523 524 ... 546 547 548 549 550 551 552 553 554 555
member
A000055 0.129252 -0.668934 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
A000367 -0.870748 -0.668934 -0.526077 -0.52381 -0.950113 -0.412698 -0.562358 -0.365079 -0.405896 -0.439909 ... -0.521542 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 -0.913832 0.496599
A000369 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
A000370 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
A000371 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
W000827 -0.870748 -0.668934 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 -0.954649 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Y000033 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Y000062 0.129252 0.331066 0.473923 0.47619 0.049887 -0.412698 0.437642 -0.365079 -0.405896 -0.439909 ... 0.478458 0.473923 0.045351 0.478458 0.480726 -0.45805 0.478458 0.464853 0.086168 0.496599
Y000065 -0.870748 -0.668934 -0.526077 -0.52381 -0.950113 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401
Z000017 0.129252 0.331066 -0.526077 -0.52381 0.049887 0.587302 -0.562358 0.634921 0.594104 0.560091 ... -0.521542 -0.526077 0.045351 -0.521542 -0.519274 0.54195 -0.521542 -0.535147 0.086168 -0.503401

441 rows × 41 columns
In [29]:
vote_pivot_centered.shape
Out[29]:
(441, 41)
In [30]:
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)

PCA plot¶

In [31]:
vote_2d = pd.DataFrame(index = vote_pivot_centered.index)
vote_2d[["z1", "z2", "z3"]] = (u * s)[:, :3]
px.scatter(vote_2d, x='z1', y='z2', title='Vote Data', width=800, height=600)

It would be interesting to see the political affiliation of each legislator.

Component Scores¶

If the first two singular values are large and all others are small, then two dimensions are enough to describe most of what distinguishes one observation from another. If not, then a PCA scatter plot is omitting lots of information.

An equivalent way to evaluate this is to determine the variance ratios, i.e., the fraction of the variance each PC contributes to total variance.
In [32]:
np.round(s**2 / sum(s**2), 2)
Out[32]:
array([0.8 , 0.05, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])
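The same ratios come straight out of scikit-learn's PCA as `explained_variance_ratio_`; a quick self-contained cross-check on synthetic data:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 6))
X = X - X.mean(axis=0)

_, s, _ = np.linalg.svd(X, full_matrices=False)
manual = s**2 / np.sum(s**2)        # variance ratios from the SVD

pca = PCA().fit(X)                  # keeps all components by default
print(np.allclose(manual, pca.explained_variance_ratio_))  # prints True
```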

Scree plot¶

A scree plot (and where its "elbow" is located) is a visual way of checking the distribution of variance.

In [33]:
fig = px.line(y=s**2 / sum(s**2), title='Variance Explained', width=700, height=600, markers=True)
fig.update_xaxes(title_text='Principal Component')
fig.update_yaxes(title_text='Proportion of Variance Explained')
In [34]:
fig = px.scatter_3d(vote_2d, x='z1', y='z2', z='z3', title='Vote Data', width=800, height=600)
fig.update_traces(marker=dict(size=5))

Based on the plot above, it looks like there are two clusters of data points. What do you think these correspond to?

Incorporating Member Information¶

Suppose we load in more member information, from https://github.com/unitedstates/congress-legislators. This includes each legislator's political party.

In [35]:
# You can get current information about legislators with this code. In our case, we'll use
# a static copy of the 2019 membership roster to properly match our voting data.

# base_url = 'https://raw.githubusercontent.com/unitedstates/congress-legislators/main/'
# legislators_path = 'legislators-current.yaml'
# f = fetch_and_cache(base_url + legislators_path, legislators_path)

# Use 2019 data copy
legislators_data = yaml.safe_load(open('data/legislators-2019.yaml'))

def to_date(s):
    return datetime.strptime(s, '%Y-%m-%d')

legs = pd.DataFrame(
    columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
    data=[[x['id']['bioguide'], 
           x['name']['first'],
           x['name']['last'],
           x['bio']['gender'],
           x['terms'][-1]['state'],
           x['terms'][-1]['type'],
           x['terms'][-1]['party'],
           to_date(x['bio']['birthday'])] for x in legislators_data])
legs['age'] = 2024 - legs['birthday'].dt.year
legs.sort_index()
Out[35]:
leg_id first last gender state chamber party birthday age
0 B000944 Sherrod Brown M OH sen Democrat 1952-11-09 72
1 C000127 Maria Cantwell F WA sen Democrat 1958-10-13 66
2 C000141 Benjamin Cardin M MD sen Democrat 1943-10-05 81
3 C000174 Thomas Carper M DE sen Democrat 1947-01-23 77
4 C001070 Robert Casey M PA sen Democrat 1960-04-13 64
... ... ... ... ... ... ... ... ... ...
534 M001197 Martha McSally F AZ sen Republican 1966-03-22 58
535 G000592 Jared Golden M ME rep Democrat 1982-07-25 42
536 K000395 Fred Keller M PA rep Republican 1965-10-23 59
537 B001311 Dan Bishop M NC rep Republican 1964-07-01 60
538 M001210 Gregory Murphy M NC rep Republican 1963-03-05 61

539 rows × 9 columns

We can combine the vote data projected onto the principal components with the biographic data.

In [36]:
vote_2d = vote_2d.join(legs.set_index('leg_id')).dropna()

Then we can visualize this data all at once.

In [37]:
px.scatter(vote_2d, x='z1', y='z2', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

There seems to be a bunch of overplotting, so let's jitter a bit.

In [38]:
np.random.seed(42)
vote_2d['z1_jittered'] = vote_2d['z1'] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d['z2_jittered'] = vote_2d['z2'] + np.random.normal(0, 0.1, len(vote_2d))
vote_2d['z3_jittered'] = vote_2d['z3'] + np.random.normal(0, 0.1, len(vote_2d))
In [39]:
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])
In [40]:
px.scatter_3d(vote_2d, x='z1_jittered', y='z2_jittered', z='z3_jittered', color='party', symbol="gender", size='age',
           title='Vote Data', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

Analysis: Regular Voters¶

Not everyone voted all the time. Let's examine the frequency of voting.

First, let's recompute the pivot table where we only consider Yes/No votes, and ignore records with "Not Voting" or other entries.

In [41]:
vote_2d["num votes"] = (
    votes[votes["vote"].isin(["Yes", "No"])]
        .groupby("member").size()
)
vote_2d.dropna(inplace=True)
vote_2d.head()
Out[41]:
z1 z2 z3 first last gender state chamber party birthday age z1_jittered z2_jittered z3_jittered num votes
member
A000055 3.061356 0.364191 0.194493 Robert Aderholt M AL rep Republican 1965-07-22 59.0 3.111028 0.358637 0.192851 40.0
A000367 0.188870 -2.433565 0.283280 Justin Amash M MI rep Independent 1980-04-18 44.0 0.175043 -2.395158 0.402119 41.0
A000369 2.844370 0.821619 -0.476126 Mark Amodei M NV rep Republican 1958-06-12 66.0 2.909139 0.818350 -0.223433 41.0
A000370 -2.607536 0.127977 0.031377 Alma Adams F NC rep Democrat 1946-05-27 78.0 -2.455233 -0.078767 -0.021710 41.0
A000371 -2.607536 0.127977 0.031377 Pete Aguilar M CA rep Democrat 1979-06-19 45.0 -2.630951 0.119065 -0.017567 41.0
In [42]:
# histogram of the number of Yes/No votes cast per member
px.histogram(vote_2d, x="num votes", log_x=True, width=800, height=600)
In [43]:
px.scatter(vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
           title='Vote Data (Size is Number of Votes)', width=800, height=600, size_max=10,
           opacity = 0.7,
           color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
           hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

Exploring the Principal Components¶

We can also look at Vt directly to try to understand what each component captures.

In [44]:
fig_eig = px.bar(x=vote_pivot_centered.columns, y=vt[0, :])
fig_eig

We have the party affiliation labels, so we can see whether this eigenvector aligns with one of the parties.

In [45]:
party_line_votes = (
    vote_pivot_centered.join(legs.set_index("leg_id")['party'])
                       .groupby("party").mean()
                       .T.reset_index()
                       .rename(columns={"index": "call"})
                       .melt("call")
)
fig = px.bar(
    party_line_votes,
    x="call", y="value", facet_row = "party", color="party",
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"})
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
In [46]:
fig_eig

Biplot¶

In [47]:
loadings = pd.DataFrame({"pc1": np.sqrt(s[0]) * vt[0, :],
                         "pc2": np.sqrt(s[1]) * vt[1, :]},
                        index=vote_pivot_centered.columns)
loadings.head()
Out[47]:
pc1 pc2
roll call
515 -0.215368 1.147391
516 -0.846835 0.930164
517 -1.375641 0.180715
518 -1.371606 0.197301
519 -0.039021 0.827081
In [48]:
fig = px.scatter(
    vote_2d, x='z1_jittered', y='z2_jittered', color='party', symbol="gender", size='num votes',
    title='Biplot', width=800, height=600, size_max=10,
    opacity = 0.7,
    color_discrete_map={'Democrat':'blue', 'Republican':'red', "Independent": "green"},
    hover_data=['first', 'last', 'state', 'party', 'gender', 'age'])

for (call, pc1, pc2) in loadings.head(20).itertuples():
    fig.add_scatter(x=[0,pc1], y=[0,pc2], name=call, 
                    mode='lines+markers', textposition='top right',
                    marker= dict(size=10,symbol= "arrow-bar-up", angleref="previous"))
fig

Each roll call from the 116th Congress - 1st Session: https://clerk.house.gov/evs/2019/ROLL_500.asp

  • 555: Raising a question of the privileges of the House (H.Res.590)
  • 553: https://www.congress.gov/bill/116th-congress/senate-joint-resolution/54/actions
  • 527: On Agreeing to the Amendment H.R.1146 - Arctic Cultural and Coastal Plain Protection Act