import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import yaml
from datetime import datetime
from ds100_utils import *
np.random.seed(23) #kallisti
plt.rcParams['figure.dpi'] = 150
df = pd.read_csv("data/2d.csv")
df
 | x | y |
---|---|---|
0 | 2.311043 | 5.436627 |
1 | 2.951447 | 6.093710 |
2 | 2.628517 | 6.776799 |
3 | 2.041157 | 5.335430 |
4 | 3.916969 | 8.948526 |
... | ... | ... |
95 | 3.639231 | 8.331902 |
96 | 2.765474 | 5.621709 |
97 | 2.745027 | 7.134981 |
98 | 3.945360 | 8.198725 |
99 | 2.743246 | 6.145312 |
100 rows × 2 columns
Let's visualize the dataset first.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Move left y-axis and bottom x-axis to centre, passing through (0,0)
ax.spines['left'].set_position('zero')
ax.spines['bottom'].set_position('zero')
# Eliminate upper and right axes
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# Show ticks in the left and lower axes only
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
sns.scatterplot(x="x", y="y", data = df);
plt.axis("square")
ax.set_ylabel("")
ax.set_xlabel("");
plt.xlim(-5, 5)
plt.ylim(-5, 10);
Now let's perform PCA on this 2D dataset to see what transformations are involved.
Step 1 of PCA is to center the data matrix.
centered_df = df - np.mean(df, axis = 0)
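As a quick sanity check (an added verification, not part of the original notebook), every column of the centered data should now have mean approximately zero:
# Column means of the centered data should be (numerically) zero
print(np.allclose(centered_df.mean(axis=0), 0))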
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Move left y-axis and bottom x-axis to centre, passing through (0,0)
ax.spines['left'].set_position('zero')
ax.spines['bottom'].set_position('zero')
# Eliminate upper and right axes
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# Show ticks in the left and lower axes only
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
sns.scatterplot(x="x", y="y", data = centered_df);
plt.axis("square")
ax.set_ylabel("")
ax.set_xlabel("");
plt.xlim(-5, 5)
plt.ylim(-5, 5);
Step 2 is to obtain the SVD of the centered data.
U, S, Vt = np.linalg.svd(centered_df, full_matrices = False)
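As another quick check (an added sketch, not part of the original), the reduced SVD factors should multiply back to the centered data exactly:
# Shapes of the reduced SVD for this 100 x 2 dataset: U is (100, 2), S is (2,), Vt is (2, 2)
print(U.shape, S.shape, Vt.shape)
# (U * S) scales each column of U by its singular value; multiplying by Vt reconstructs X
print(np.allclose((U * S) @ Vt, centered_df))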
We mentioned that $V$ transforms $X$ to get the principal components. What does that transformation look like?
PCs = centered_df @ Vt.T
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Move left y-axis and bottom x-axis to centre, passing through (0,0)
ax.spines['left'].set_position('zero')
ax.spines['bottom'].set_position('zero')
# Eliminate upper and right axes
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
# Show ticks in the left and lower axes only
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('left')
sns.scatterplot(x=0, y=1, data = PCs);
plt.axis("square")
ax.set_ylabel("")
ax.set_xlabel("");
plt.xlim(-5, 5)
plt.ylim(-5, 5);
It turns out that $V$ simply rotates the centered data matrix $X$ so that the direction with the most variation (i.e., the most spread-out direction) is aligned with the x-axis!
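We can verify this numerically (an added check, not in the original): the rows of $V^T$ are orthonormal, so multiplying by $V$ cannot stretch or shear the data, and the principal components $XV$ are exactly $US$ from the SVD.
# Vt has orthonormal rows, so V is a rotation (possibly combined with a reflection)
print(np.allclose(Vt @ Vt.T, np.eye(2)))
# The principal components can equivalently be computed as U * S, since XV = (U S V^T) V = U S
print(np.allclose(PCs, U * S))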
Let's examine how the House of Representatives (of the 116th Congress, 1st session) voted in the month of September 2019.
From the U.S. Senate website:
Roll call votes occur when a representative or senator votes "yea" or "nay," so that the names of members voting on each side are recorded. A voice vote is a vote in which those in favor or against a measure say "yea" or "nay," respectively, without the names or tallies of members voting on each side being recorded.
The data, compiled from ProPublica, is a "skinny" table where each record is a single vote by a member on a roll call in the 116th Congress, 1st session, as downloaded in February 2020. Each member of the House, whom we'll call a legislator, is identified by their alphanumeric bioguide ID from http://bioguide.congress.gov/.
# September 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('data/votes.csv')
votes = votes.astype({"roll call": str})
votes
 | chamber | session | roll call | member | vote |
---|---|---|---|---|---|
0 | House | 1 | 555 | A000374 | Not Voting |
1 | House | 1 | 555 | A000370 | Yes |
2 | House | 1 | 555 | A000055 | No |
3 | House | 1 | 555 | A000371 | Yes |
4 | House | 1 | 555 | A000372 | No |
... | ... | ... | ... | ... | ... |
17823 | House | 1 | 515 | Y000062 | Yes |
17824 | House | 1 | 515 | Y000065 | No |
17825 | House | 1 | 515 | Y000033 | Yes |
17826 | House | 1 | 515 | Z000017 | Yes |
17827 | House | 1 | 515 | P000197 | Speaker |
17828 rows × 5 columns
Suppose we pivot this table to group each legislator and their voting pattern across every (roll call) vote in this month. We mark 1 if the legislator voted Yes (yea), and 0 otherwise (No/nay, no vote, speaker, etc.).
def was_yes(s):
return 1 if s.iloc[0] == "Yes" else 0
vote_pivot = votes.pivot_table(index='member',
columns='roll call',
values='vote',
aggfunc=was_yes,
fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()
(441, 41)
roll call | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
member | |||||||||||||||||||||
A000055 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
A000367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
A000369 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
A000370 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
A000371 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
5 rows × 41 columns
How do we analyze this data?
While we could consider loading information about the legislator, such as their party, and see how this relates to their voting pattern, it turns out that we can do a lot with PCA to cluster legislators by how they vote.
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered.head(5)
roll call | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
member | |||||||||||||||||||||
A000055 | 0.129252 | -0.668934 | -0.526077 | -0.52381 | 0.049887 | 0.587302 | -0.562358 | 0.634921 | 0.594104 | 0.560091 | ... | -0.521542 | -0.526077 | 0.045351 | -0.521542 | -0.519274 | 0.54195 | -0.521542 | -0.535147 | 0.086168 | -0.503401 |
A000367 | -0.870748 | -0.668934 | -0.526077 | -0.52381 | -0.950113 | -0.412698 | -0.562358 | -0.365079 | -0.405896 | -0.439909 | ... | -0.521542 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | -0.913832 | 0.496599 |
A000369 | 0.129252 | 0.331066 | -0.526077 | -0.52381 | 0.049887 | 0.587302 | -0.562358 | 0.634921 | 0.594104 | 0.560091 | ... | -0.521542 | -0.526077 | 0.045351 | -0.521542 | -0.519274 | 0.54195 | -0.521542 | -0.535147 | 0.086168 | -0.503401 |
A000370 | 0.129252 | 0.331066 | 0.473923 | 0.47619 | 0.049887 | -0.412698 | 0.437642 | -0.365079 | -0.405896 | -0.439909 | ... | 0.478458 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | 0.086168 | 0.496599 |
A000371 | 0.129252 | 0.331066 | 0.473923 | 0.47619 | 0.049887 | -0.412698 | 0.437642 | -0.365079 | -0.405896 | -0.439909 | ... | 0.478458 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | 0.086168 | 0.496599 |
5 rows × 41 columns
vote_pivot_centered.shape
(441, 41)
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)
pcs = u * s  # the principal components; equivalent to vote_pivot_centered @ vt.T, since XV = US
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
plt.xlabel("PC1");
plt.ylabel("PC2");
If the first two singular values are large and all others are small, then two dimensions are enough to describe most of what distinguishes one observation from another. If not, then a PCA scatter plot is omitting lots of information.
An equivalent way to evaluate this is to determine the variance ratios, i.e., the fraction of the variance each PC contributes to total variance.
np.round(s**2 / sum(s**2), 2)
array([0.8 , 0.05, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
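Equivalently (an added check, not in the original), each ratio $\sigma_i^2 / \sum_j \sigma_j^2$ is exactly the fraction of the total variance captured by the $i$-th principal component column:
# Variance of each PC column (ddof=0), normalized, matches the singular-value ratios
pc_variances = np.var(pcs, axis=0)
print(np.allclose(pc_variances / pc_variances.sum(), s**2 / np.sum(s**2)))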
A scree plot (and where its "elbow" is located) is a visual way of checking the distribution of variance.
plt.plot(s**2 / sum(s**2), marker='.');
plt.xlabel("Principal Component $i$");
plt.ylabel("Variance Ratio");
Looks reasonable! Let's plot PC2 vs. PC1 again.
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
plt.xlabel("PC1");
plt.ylabel("PC2");
Based on the plot above, it looks like there are two clusters of data points. What do you think these correspond to?
Suppose we load in more member information, from https://github.com/unitedstates/congress-legislators. This includes each legislator's political party.
base_url = 'https://github.com/unitedstates/congress-legislators'
legislators_path = 'legislators-current.yaml'
f = fetch_and_cache(base_url + '/' + legislators_path, legislators_path)
legislators_data = yaml.safe_load(open('data/legislators-current.yaml'))
def to_date(s):
return datetime.strptime(s, '%Y-%m-%d')
legs = pd.DataFrame(
columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
data=[[x['id']['bioguide'],
x['name']['first'],
x['name']['last'],
x['bio']['gender'],
x['terms'][-1]['state'],
x['terms'][-1]['type'],
x['terms'][-1]['party'],
to_date(x['bio']['birthday'])] for x in legislators_data])
legs.sort_index()
Using cached version that was downloaded (UTC): Thu Aug 3 20:37:03 2023
 | leg_id | first | last | gender | state | chamber | party | birthday |
---|---|---|---|---|---|---|---|---|
0 | B000944 | Sherrod | Brown | M | OH | sen | Democrat | 1952-11-09 |
1 | C000127 | Maria | Cantwell | F | WA | sen | Democrat | 1958-10-13 |
2 | C000141 | Benjamin | Cardin | M | MD | sen | Democrat | 1943-10-05 |
3 | C000174 | Thomas | Carper | M | DE | sen | Democrat | 1947-01-23 |
4 | C001070 | Robert | Casey | M | PA | sen | Democrat | 1960-04-13 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
534 | M001197 | Martha | McSally | F | AZ | sen | Republican | 1966-03-22 |
535 | G000592 | Jared | Golden | M | ME | rep | Democrat | 1982-07-25 |
536 | K000395 | Fred | Keller | M | PA | rep | Republican | 1965-10-23 |
537 | B001311 | Dan | Bishop | M | NC | rep | Republican | 1964-07-01 |
538 | M001210 | Gregory | Murphy | M | NC | rep | Republican | 1963-03-05 |
539 rows × 8 columns
Let's check out how party affiliations relate to the PC1, PC2 transformation from earlier:
First, let's see which party has negative PC1 scores.
vote2d = pd.DataFrame({
'member': vote_pivot.index,
'pc1': pcs[:, 0],
'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d[vote2d['pc1'] < 0]['party'].value_counts()
party
Democrat    231
Name: count, dtype: int64
Hmm, let's keep exploring.
We didn't cover the query syntax that we use below, but if you're curious, check out the documentation.
# top-right region only (PC1 > 0, PC2 > -2)
vote2d.query('pc2 > -2 and pc1 > 0')['party'].value_counts()
party
Republican    194
Name: count, dtype: int64
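Equivalently (an added sketch, not in the original notebook), a single cross-tabulation of party against the sign of PC1 summarizes both checks at once:
# Cross-tabulate party vs. whether PC1 is negative (the True column counts members with PC1 < 0)
pd.crosstab(vote2d['party'], vote2d['pc1'] < 0)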
Interesting. Let's use these party labels to color our principal components:
cp = sns.color_palette()
party_cp = [cp[i] for i in [0, 3, 1]]
party_hue = ["Democrat", "Republican", "Independent"]
sns.scatterplot(x="pc1", y="pc2",
hue="party", palette=party_cp, hue_order=party_hue,
data = vote2d);
There seems to be a lot of overplotting, so let's add a small amount of random jitter to each point.
vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered",
hue="party", palette=party_cp, hue_order=party_hue,
data = vote2d);
vote2d[vote2d['pc2'] < -1]
 | member | pc1 | pc2 | leg_id | first | last | gender | state | chamber | party | birthday | pc1_jittered | pc2_jittered |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | A000367 | 0.188870 | -2.433565 | A000367 | Justin | Amash | M | MI | rep | Independent | 1980-04-18 | 0.191451 | -2.431195 |
6 | A000374 | 1.247134 | -3.533196 | A000374 | Ralph | Abraham | M | LA | rep | Republican | 1954-09-16 | 1.210379 | -3.521461 |
47 | B001311 | 1.695651 | -2.093912 | B001311 | Dan | Bishop | M | NC | rep | Republican | 1964-07-01 | 1.686247 | -1.982446 |
50 | C000537 | 0.699636 | -3.394179 | C000537 | James | Clyburn | M | SC | rep | Democrat | 1940-07-21 | 0.687526 | -3.212611 |
52 | C000984 | 0.531789 | -3.099044 | C000984 | Elijah | Cummings | M | MD | rep | Democrat | 1951-01-18 | 0.583003 | -3.131361 |
69 | C001087 | 2.755060 | -1.378193 | C001087 | Eric | Crawford | M | AR | rep | Republican | 1966-01-22 | 2.756083 | -1.270555 |
150 | G000582 | 2.262007 | -2.632452 | G000582 | Jenniffer | González-Colón | F | PR | rep | Republican | 1976-08-05 | 2.259351 | -2.656279 |
179 | H001077 | 2.509474 | -1.349023 | H001077 | Clay | Higgins | M | LA | rep | Republican | 1961-08-24 | 2.446616 | -1.346705 |
196 | J000299 | 2.908823 | -1.094618 | J000299 | Mike | Johnson | M | LA | rep | Republican | 1972-01-30 | 3.061062 | -1.116434 |
272 | M001200 | 1.247134 | -3.533196 | M001200 | A. | McEachin | M | VA | rep | Democrat | 1961-10-10 | 1.221263 | -3.559693 |
282 | M001210 | 1.695651 | -2.093912 | M001210 | Gregory | Murphy | M | NC | rep | Republican | 1963-03-05 | 1.577282 | -1.919511 |
285 | N000147 | 1.247134 | -3.533196 | N000147 | Eleanor | Norton | F | DC | rep | Democrat | 1937-06-13 | 1.385627 | -3.546085 |
298 | P000197 | 1.247134 | -3.533196 | P000197 | Nancy | Pelosi | F | CA | rep | Democrat | 1940-03-26 | 1.217323 | -3.454539 |
310 | P000610 | 1.247134 | -3.533196 | P000610 | Stacey | Plaskett | F | VI | rep | Democrat | 1966-05-13 | 1.347833 | -3.381807 |
323 | R000577 | -2.069671 | -1.344435 | R000577 | Tim | Ryan | M | OH | rep | Democrat | 1973-07-16 | -1.992928 | -1.423723 |
330 | R000600 | 1.247134 | -3.533196 | R000600 | Aumua | Amata | F | AS | rep | Republican | 1947-12-29 | 1.255358 | -3.499536 |
360 | S001177 | 1.247134 | -3.533196 | S001177 | Gregorio | Sablan | M | MP | rep | Democrat | 1955-01-19 | 1.214066 | -3.646465 |
374 | S001204 | 1.247134 | -3.533196 | S001204 | Michael | San Nicolas | M | GU | rep | Democrat | 1981-01-30 | 1.395281 | -3.476336 |
df = votes[votes['member'].isin(vote2d[vote2d['pc2'] < -1]['member'])]
df.groupby(['member', 'vote']).size()
member   vote
A000367  No            31
         Yes           10
A000374  Not Voting    41
B001311  No            17
         Yes            7
C000537  No             1
         Not Voting    37
         Yes            3
C000984  No             4
         Not Voting    32
         Yes            5
C001087  No             6
         Not Voting    23
         Yes           12
G000582  Not Voting     1
         Yes            6
H001077  No            14
         Not Voting    15
         Yes           12
J000299  No            20
         Not Voting     7
         Yes           14
M001200  Not Voting    41
M001210  No            16
         Not Voting     1
         Yes            7
N000147  No             6
         Not Voting     1
P000197  Speaker       41
P000610  No             7
R000577  No            11
         Not Voting     8
         Yes           22
R000600  Not Voting     7
S001177  No             7
S001204  No             6
         Not Voting     1
dtype: int64
We can check each of our legislators in the legs DataFrame, as below.
legs.query("leg_id == 'A000367'")
 | leg_id | first | last | gender | state | chamber | party | birthday |
---|---|---|---|---|---|---|---|---|
33 | A000367 | Justin | Amash | M | MI | rep | Independent | 1980-04-18 |
Let's now look at the regulars, whom we define as legislators who cast at least a threshold number of Yes/No votes.
First, let's count how many Yes/No votes each member cast, ignoring records marked "Not Voting", "Speaker", or other entries; we'll use this count to filter the pivot table.
bool_votes = votes["vote"].isin(["Yes", "No"])
num_yes_or_no_votes_per_member = votes[bool_votes].groupby("member").size()
num_yes_or_no_votes_per_member
member
A000055    40
A000367    41
A000369    41
A000370    41
A000371    41
           ..
W000827    33
Y000033    41
Y000062    41
Y000065    36
Z000017    41
Length: 437, dtype: int64
vote_pivot_with_yes_no_count = (
vote_pivot.merge(num_yes_or_no_votes_per_member.to_frame(),
left_index = True, right_index = True, how="outer")
.fillna(0)
.rename(columns = {0: 'yes_no_count'})
)
vote_pivot_with_yes_no_count.head(5)
515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | yes_no_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
member | |||||||||||||||||||||
A000055 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 40.0 |
A000367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 41.0 |
A000369 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 41.0 |
A000370 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 41.0 |
A000371 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 41.0 |
5 rows × 42 columns
thresh = 30
keep_legs = vote_pivot_with_yes_no_count["yes_no_count"] >= thresh
regulars = vote_pivot_with_yes_no_count[keep_legs]
regulars = regulars.drop(columns='yes_no_count')
regulars.shape
(425, 41)
Let's do some PCA on these regular voters. Remember: 1. Center data, then 2. Compute SVD.
regulars_centered = regulars - np.mean(regulars, axis = 0)
regulars_centered.head(5)
515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
member | |||||||||||||||||||||
A000055 | 0.101176 | -0.691765 | -0.543529 | -0.541176 | 0.023529 | 0.581176 | -0.581176 | 0.625882 | 0.588235 | 0.550588 | ... | -0.541176 | -0.545882 | 0.014118 | -0.538824 | -0.536471 | 0.529412 | -0.538824 | -0.555294 | 0.056471 | -0.522353 |
A000367 | -0.898824 | -0.691765 | -0.543529 | -0.541176 | -0.976471 | -0.418824 | -0.581176 | -0.374118 | -0.411765 | -0.449412 | ... | -0.541176 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | -0.943529 | 0.477647 |
A000369 | 0.101176 | 0.308235 | -0.543529 | -0.541176 | 0.023529 | 0.581176 | -0.581176 | 0.625882 | 0.588235 | 0.550588 | ... | -0.541176 | -0.545882 | 0.014118 | -0.538824 | -0.536471 | 0.529412 | -0.538824 | -0.555294 | 0.056471 | -0.522353 |
A000370 | 0.101176 | 0.308235 | 0.456471 | 0.458824 | 0.023529 | -0.418824 | 0.418824 | -0.374118 | -0.411765 | -0.449412 | ... | 0.458824 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | 0.056471 | 0.477647 |
A000371 | 0.101176 | 0.308235 | 0.456471 | 0.458824 | 0.023529 | -0.418824 | 0.418824 | -0.374118 | -0.411765 | -0.449412 | ... | 0.458824 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | 0.056471 | 0.477647 |
5 rows × 41 columns
u, s, vt = np.linalg.svd(regulars_centered, full_matrices = False)
Check the variance ratios to verify that using the first two PCs is okay:
np.round(s**2 / sum(s**2), 2)
array([0.84, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
Now, plot first two PCs.
pcs_reg = u * s
vote2d_reg = pd.DataFrame({
'member': regulars_centered.index,
'pc1': pcs_reg[:, 0],
'pc2': pcs_reg[:, 1],
}).merge(legs, left_on='member', right_on='leg_id')
sns.scatterplot(x="pc1", y="pc2", data=vote2d_reg);
vote2d_reg['pc1_jittered'] = vote2d_reg['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d_reg.shape[0])
vote2d_reg['pc2_jittered'] = vote2d_reg['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d_reg.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered",
hue="party", palette=party_cp, hue_order=party_hue,
data = vote2d_reg);
We can also look at $V^T$ directly to try to gain insight into what each principal component represents.
num_votes = vt.shape[1]
votes_cols = regulars.columns

def plot_pc(k):
    # Plot the loading of each roll call on the k-th principal direction (row k of vt)
    plt.bar(votes_cols, vt[k, :], alpha=0.7)
    plt.xticks(votes_cols, rotation=90);

with plt.rc_context({"figure.figsize": (12, 4)}):
    k = 1  # 0-indexed: k = 1 is the second principal direction
    plot_pc(k)
    plt.title(k)
Using the SVD for PCA can produce a $V^T$ matrix that is hard to interpret. There exist other methods for performing PCA, e.g. sparse PCA, that aim to produce a more interpretable version of $V^T$.
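As a rough sketch of what that might look like (assuming scikit-learn is available; this is not part of the original notebook), we could run sparse PCA on the centered vote matrix and see that many loadings are driven to exactly zero:
# Illustrative sketch only -- assumes scikit-learn is installed; not from the original notebook
from sklearn.decomposition import SparsePCA

sparse_pca = SparsePCA(n_components=2, alpha=1.0, random_state=23)
sparse_pcs = sparse_pca.fit_transform(regulars_centered)
# Unlike the dense SVD loadings, many entries of components_ are exactly zero
print((sparse_pca.components_ == 0).mean())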
vote2d_reg.shape
(424, 13)
import random
# roll_calls = sorted([517, 520, 526, 527, 555, 553]) # features to plot on biplot
roll_calls = [515, 516, 517, 520, 526, 527, 553]
plt.figure(figsize = (7, 7))
# first plot each datapoint in terms of the first two principal components
sns.scatterplot(x="pc1_jittered", y="pc2_jittered",
hue="party", palette=party_cp, hue_order=party_hue,
data = vote2d_reg);
# next, plot the loadings for PC1 and PC2
cp = sns.color_palette()[1:] # skip blue
directions_df = pd.DataFrame(data=vt[:2, :].T, index=regulars_centered.columns, columns=["dir1", "dir2"])
dir1, dir2 = directions_df["dir1"], directions_df["dir2"]
for i, feature in enumerate(roll_calls):
feature = str(feature)
plt.arrow(0, 0,
dir1.loc[feature], dir2.loc[feature],
head_width=0.2, head_length=0.2, color=cp[i], label=feature)
plt.legend(bbox_to_anchor=(1.1, 1.05));
Each roll call from the 116th Congress - 1st Session: https://clerk.house.gov/evs/2019/ROLL_500.asp