import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(23) #kallisti

plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
sns.set()


rectangle = pd.read_csv("rectangle_data.csv")
rectangle.tail(5)


u, s, vt = np.linalg.svd(rectangle, full_matrices = False)


usig = u * s


pd.DataFrame(usig @ vt).head(4)


np.set_printoptions(suppress=True)
usig

array([[-56.30926787,   4.08369641,  -0.76796869,   0.        ],
       [-13.92587137,  -5.61592446,   1.59106852,  -0.        ],
       [ -7.3883695 ,  -5.11089273,   1.51352951,   0.        ],
       [-36.84443159,  -4.80005945,  -3.80095908,  -0.        ],
       [-79.47260546,  13.00269827,   0.18659785,  -0.        ],
       [ -7.42135662,  -5.11810904,  -1.31469604,   0.        ],
       [-13.95885849,  -5.62314077,  -1.23715703,   0.        ],
       [-37.98955728,  -1.31360807,  -0.26071277,   0.        ],
       [-15.6692269 ,  -9.65347804,  -4.03555325,   0.        ],
       [-25.44680915,  -7.81311695,  -3.92620778,  -0.        ],
       [-32.68750933,  -2.52515864,   0.38769508,  -0.        ],
       [-53.89570114,   2.32104364,  -2.20593631,  -0.        ],
       [-40.87803851,  -1.86471027,  -2.34708823,  -0.        ],
       [ -5.34289549,  -3.98065864,   0.77963103,   0.        ],
       [-28.2033419 ,  -8.33535389,   5.30031897,   0.        ],
       [-64.00838899,   7.0615079 ,   1.43570386,  -0.        ],
       [-43.29160524,  -0.1020575 ,  -0.90912062,   0.        ],
       [-13.92587137,  -5.61592446,   1.59106852,  -0.        ],
       [-16.78136548,  -6.15981034,   2.33291861,   0.        ],
       [-18.43439279,  -4.99433025,  -0.47940371,   0.        ],
       [-17.73119447, -10.78732028,  -4.71576755,  -0.        ],
       [-48.59365319,   1.10949307,  -1.55752846,  -0.        ],
       [-79.47260546,  13.00269827,   0.18659785,  -0.        ],
       [-29.48041607,  -4.87776777,  -2.47233693,   0.        ],
       [-33.16242383,  -4.83891361,  -3.13664801,   0.        ],
       [-44.08513177,   0.48789886,   0.51294377,   0.        ],
       [-31.8939828 ,  -3.115115  ,  -1.03436931,   0.        ],
       [-40.87803851,  -1.86471027,  -2.34708823,  -0.        ],
       [-22.57482149,  -7.2656229 ,  -3.25394509,   0.        ],
       [-22.90992708,  -4.36551973,   0.27834961,  -0.        ],
       [ -9.48332419,  -6.25195129,  -1.99491034,   0.        ],
       [-70.94697069,   9.44214673,  -0.61091354,  -0.        ],
       [-13.95885849,  -5.62314077,  -1.23715703,   0.        ],
       [ -5.34289549,  -3.98065864,   0.77963103,   0.        ],
       [-17.73119447, -10.78732028,  -4.71576755,  -0.        ],
       [-27.40195494,  -3.74031736,  -0.37800985,  -0.        ],
       [-53.89570114,   2.32104364,  -2.20593631,  -0.        ],
       [-62.37185523,   5.89241965,   2.83391341,  -0.        ],
       [-29.41444183,  -4.86333514,   3.18411417,  -0.        ],
       [-29.48041607,  -4.87776777,  -2.47233693,   0.        ],
       [-57.1027944 ,   4.67365277,   0.6540957 ,  -0.        ],
       [-14.75238503,  -5.03318441,   0.18490736,   0.        ],
       [-32.68750933,  -2.52515864,   0.38769508,  -0.        ],
       [-29.48041607,  -4.87776777,  -2.47233693,   0.        ],
       [-25.44680915,  -7.81311695,  -3.92620778,  -0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,   0.        ],
       [-15.6692269 ,  -9.65347804,  -4.03555325,   0.        ],
       [ -5.34289549,  -3.98065864,   0.77963103,   0.        ],
       [-22.08341343,  -4.94825977,   1.68451077,   0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,   0.        ],
       [-37.97306372,  -1.30999991,   1.15340001,  -0.        ],
       [-19.79316204, -11.92116253,  -5.39598185,  -0.        ],
       [-25.79840831,  -4.91662193,  -1.80802586,   0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,   0.        ],
       [-56.27628075,   4.09091272,   2.06025686,   0.        ],
       [-19.79316204, -11.92116253,  -5.39598185,  -0.        ],
       [-25.34784779,  -7.79146801,   4.55846888,   0.        ],
       [-29.41444183,  -4.86333514,   3.18411417,  -0.        ],
       [-37.98955728,  -1.31360807,  -0.26071277,   0.        ],
       [-56.30926787,   4.08369641,  -0.76796869,  -0.        ],
       [-28.31879682,  -8.36061099,  -4.59847046,  -0.        ],
       [-36.74547023,  -4.77841051,   4.68371758,  -0.        ],
       [-16.78136548,  -6.15981034,   2.33291861,   0.        ],
       [ -9.43384352,  -6.24112682,   2.24742798,   0.        ],
       [-14.75238503,  -5.03318441,   0.18490736,   0.        ],
       [-28.2033419 ,  -8.33535389,   5.30031897,   0.        ],
       [-31.8939828 ,  -3.115115  ,  -1.03436931,   0.        ],
       [-16.83084616,  -6.17063481,  -1.90941971,  -0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,   0.        ],
       [-16.78136548,  -6.15981034,   2.33291861,   0.        ],
       [ -7.3883695 ,  -5.11089273,   1.51352951,   0.        ],
       [ -7.3883695 ,  -5.11089273,   1.51352951,   0.        ],
       [-50.18070626,   2.28940579,   1.28660032,   0.        ],
       [-56.30926787,   4.08369641,  -0.76796869,  -0.        ],
       [-22.49235369,  -7.24758212,   3.81661879,   0.        ],
       [-31.86099568,  -3.10789868,   1.79385624,  -0.        ],
       [ -3.29742148,  -2.85042454,   0.04573256,   0.        ],
       [-79.47260546,  13.00269827,   0.18659785,  -0.        ],
       [-32.68750933,  -2.52515864,   0.38769508,  -0.        ],
       [-36.74547023,  -4.77841051,   4.68371758,  -0.        ],
       [-25.34784779,  -7.79146801,   4.55846888,   0.        ],
       [ -8.21488316,  -4.52815268,   0.10736835,   0.        ],
       [-33.16242383,  -4.83891361,  -3.13664801,   0.        ],
       [-17.61573956, -10.76206319,   5.18302188,   0.        ],
       [-11.54529176,  -7.38579354,  -2.67512465,  -0.        ],
       [-43.25861812,  -0.09484119,   1.91910494,  -0.        ],
       [-53.89570114,   2.32104364,  -2.20593631,  -0.        ],
       [ -7.42135662,  -5.11810904,  -1.31469604,   0.        ],
       [-70.94697069,   9.44214673,  -0.61091354,  -0.        ],
       [-48.59365319,   1.10949307,  -1.55752846,  -0.        ],
       [-79.47260546,  13.00269827,   0.18659785,  -0.        ],
       [-27.38546138,  -3.73670921,   1.03610292,  -0.        ],
       [-27.38546138,  -3.73670921,   1.03610292,  -0.        ],
       [-11.47931753,  -7.37136091,   2.98132646,   0.        ],
       [-44.08513177,   0.48789886,   0.51294377,   0.        ],
       [-48.59365319,   1.10949307,  -1.55752846,  -0.        ],
       [-64.02488255,   7.05789975,   0.02159108,  -0.        ],
       [ -9.43384352,  -6.24112682,   2.24742798,   0.        ],
       [-13.52479154,  -8.501595  ,   3.71522493,  -0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,  -0.        ]])


pd.DataFrame(usig[:, 0:3] @ vt[0:3, ]).head(4)


pd.DataFrame(usig[:, 0:2] @ vt[0:2, ]).tail(4)


pd.DataFrame(usig[:, 0:1] @ vt[0:1, ]).tail(4)


# Downloads from https://www.gapminder.org/data/
cm_path = 'child_mortality_0_5_year_olds_dying_per_1000_born.csv'
fe_path = 'children_per_woman_total_fertility.csv'
cm = pd.read_csv(cm_path).set_index('country')['2017'].to_frame()/10
fe = pd.read_csv(fe_path).set_index('country')['2017'].to_frame()
child_data = cm.merge(fe, left_index=True, right_index=True).dropna()
child_data.columns = ['mortality', 'fertility']
child_data.head()


def scatter14(data):
    sns.scatterplot('mortality', 'fertility', data=data)
    plt.xlim([0, 14])
    plt.ylim([0, 14])
    plt.xticks(np.arange(0, 14, 2))
    plt.yticks(np.arange(0, 14, 2))    
    
scatter14(child_data)

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


sns.scatterplot('mortality', 'fertility', data=child_data)

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='mortality', ylabel='fertility'>


u, s, vt = np.linalg.svd(child_data, full_matrices = False)


child_data_reconstructed = pd.DataFrame(u @ np.diag(s) @ vt, columns = ["mortality", "fertility"], index=child_data.index)


child_data_reconstructed.head(5)


sns.scatterplot('mortality', 'fertility', data=child_data)

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='mortality', ylabel='fertility'>


#Rather than manually invoking linalg.svd over and over, let's just
#define a function that does the rank approximation in one function call
def compute_rank_k_approximation(data, k):
    u, s, vt = np.linalg.svd(data, full_matrices = False)
    return pd.DataFrame(u[:, 0:k] @ np.diag(s[0:k]) @ vt[0:k, :], columns = data.columns)


#child_data_rank_1_approximation = pd.DataFrame(u[:, :-1] @ np.diag(s[:-1]) @ vt[:-1, :], columns = ["mortality", "fertility"], index=child_data.index)

child_data_rank_1_approximation = compute_rank_k_approximation(child_data, 1)
child_data_rank_1_approximation.head(5)


sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='mortality', ylabel='fertility'>


sns.scatterplot('mortality', 'fertility', data=child_data)
sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

<AxesSubplot:xlabel='mortality', ylabel='fertility'>


#http://jse.amstat.org/datasets/fat.txt
body_data = pd.read_fwf("fat.dat.txt", colspecs = [(9, 13), (17, 21), (23, 29), (35, 37),
                                             (39, 45), (48, 53), (57, 61), (64, 69),
                                             (73, 77), (80, 85), (88, 93), (96, 101),
                                             (105, 109), (113, 117), (121, 125), (129, 133),
                                             (137, 141), (145, 149)], 
                  
                  
                  header=None, names = ["% brozek fat", "% siri fat", "density", "age", 
                                       "weight", "height", "adiposity", "fat free weight",
                                       "neck", "chest", "abdomen", "hip", "thigh",
                                       "knee", "ankle", "bicep", "forearm",
                                       "wrist"])
#body_data = body_data.drop(41) #drop the weird record
body_data.head()


density_and_abdomen = body_data[["density", "abdomen"]]
density_and_abdomen.head(5)


density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
density_and_abdomen_rank_1_approximation.head(5)


sns.scatterplot(x="density", y="abdomen", data=body_data)

<AxesSubplot:xlabel='density', ylabel='abdomen'>


density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
sns.scatterplot(x="density", y="abdomen", data=body_data)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_rank_1_approximation);


np.mean(density_and_abdomen, axis = 0)

density     1.055574
abdomen    92.555952
dtype: float64


density_and_abdomen_centered = density_and_abdomen - np.mean(density_and_abdomen, axis = 0)
density_and_abdomen_centered.head(5)


density_and_abdomen_centered_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen_centered, 1)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered_rank_1_approximation);


sns.scatterplot(data = child_data, x = "mortality", y= "fertility")
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));


np.mean(child_data, axis = 0)

mortality    3.034590
fertility    2.775956
dtype: float64


child_means = np.mean(child_data, axis = 0)
child_data_centered = child_data - child_means
sns.scatterplot(data = child_data_centered, x = "mortality", y= "fertility")


plt.xlim([-3, 11])
plt.ylim([-3, 11])
plt.xticks(np.arange(-3, 11, 2))
plt.yticks(np.arange(-3, 11, 2));
plt.gcf().savefig("mortality_fertility_centered.png", dpi=300, bbox_inches="tight")


rectangle_centered = rectangle - np.mean(rectangle, axis = 0)


np.var(rectangle_centered)

width          7.6891
height         5.3475
area         338.7316
perimeter     50.7904
dtype: float64


sum(np.var(rectangle_centered))

402.5586000000001


u, s, vt = np.linalg.svd(rectangle_centered, full_matrices = False)


u[0:5, :]

array([[-0.1339099 ,  0.00592996,  0.03473374, -0.29683623],
       [ 0.08635418, -0.07951453,  0.01494809,  0.71147825],
       [ 0.11776646, -0.12896256,  0.08577428, -0.06531837],
       [-0.02727392,  0.1831771 ,  0.01089507, -0.03105541],
       [-0.258806  , -0.09429499,  0.09027015, -0.03281766]])


np.diag(s)

array([[197.38807512,   0.        ,   0.        ,   0.        ],
       [  0.        ,  27.43462569,   0.        ,   0.        ],
       [  0.        ,   0.        ,  23.26261195,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ]])


s**2/rectangle_centered.shape[0]

array([389.62052198,   7.52658687,   5.41149115,   0.        ])


sum(s**2/rectangle_centered.shape[0])

402.5586000000005


body_data.head(5)


u, s, vt = np.linalg.svd(body_data, full_matrices = False)

s

array([5020.52705061,  262.08505563,  254.54343189,  141.81242773,
         59.87838101,   52.91069761,   41.65678998,   31.28233064,
         26.02717253,   24.49195578,   22.11282152,   20.52806948,
         18.66671232,   16.7184654 ,   12.42776311,    7.7927621 ,
          1.9706795 ,    0.27219341])


np.round(s**2 / sum(s**2), 2)

array([0.99, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


body_data_centered = body_data - np.mean(body_data, axis = 0)
body_data_centered.head(5)


u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)

s

array([586.34726844, 261.81908925, 167.11383507,  59.97080768,
        53.33024862,  42.24150036,  34.80110599,  29.28559968,
        25.54261691,  23.91816652,  20.83965308,  20.47360977,
        18.54040972,  16.27886966,  12.37794822,   7.72010337,
         1.95363897,   0.04487061])


np.round(s**2 / sum(s**2), 2)

array([0.76, 0.15, 0.06, 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


plt.plot(s**2);


body_data_rank_3_approximation = compute_rank_k_approximation(body_data_centered, 3) +  np.mean(body_data, axis = 0)
body_data_rank_3_approximation.head(5)


body_data.head(5)


u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);


np.argmax(pcs[:, 0])

38


body_data.iloc[38, :]

% brozek fat        33.8000
% siri fat          35.2000
density              1.0202
age                 46.0000
weight             363.1500
height              72.2500
adiposity           48.9000
fat free weight    240.5000
neck                51.2000
chest              136.2000
abdomen            148.1000
hip                147.7000
thigh               87.3000
knee                49.1000
ankle               29.6000
bicep               45.0000
forearm             29.0000
wrist               21.4000
Name: 38, dtype: float64


from ds100_utils import fetch_and_cache
import yaml
from datetime import datetime


#base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'
#legislators_path = 'legislators-current.yaml'
#f = fetch_and_cache(base_url + legislators_path, legislators_path)
legislators_data = yaml.safe_load(open('./data/legislators-2019.yaml'))

def to_date(s):
    return datetime.strptime(s, '%Y-%m-%d')

legs = pd.DataFrame(
    columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
    data=[[x['id']['bioguide'], 
           x['name']['first'],
           x['name']['last'],
           x['bio']['gender'],
           x['terms'][-1]['state'],
           x['terms'][-1]['type'],
           x['terms'][-1]['party'],
           to_date(x['bio']['birthday'])] for x in legislators_data])

legs.head(3)


# February 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('votes.csv')
votes = votes.astype({"roll call": str}) 
votes.head()


def was_yes(s):
    if s.iloc[0] == 'Yes':
        return 1
    else:
        return 0
    
vote_pivot = votes.pivot_table(index='member', 
                                columns='roll call', 
                                values='vote', 
                                aggfunc=was_yes, 
                                fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()

(441, 41)


vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered.head(5)


u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)


np.round(s**2 / sum(s**2), 2)

array([0.8 , 0.05, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


plt.plot(s**2)

[<matplotlib.lines.Line2D at 0x7f943c275400>]


pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);


vote2d = pd.DataFrame({
    'member': vote_pivot.index,
    'pc1': pcs[:, 0],
    'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')

vote2d[vote2d['pc1'] < 0]['party'].value_counts()

Democrat    231
Name: party, dtype: int64


#top right only
vote2d.query('pc2 > -2 and pc1 > 0')['party'].value_counts()

Republican    194
Name: party, dtype: int64


sns.scatterplot(x="pc1", y="pc2", hue="party", data = vote2d);


vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])


sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);


vote2d.head(5)


vote2d[vote2d['pc1'] > 0]['party'].value_counts()

Republican     199
Democrat         8
Independent      1
Name: party, dtype: int64


vote2d[vote2d['pc2'] < -1]


df = votes[votes['member'].isin(vote2d[vote2d['pc2'] < -1]['member'])]
df.groupby(['member', 'vote']).size()

member   vote      
A000367  No            31
         Yes           10
A000374  Not Voting    41
B001311  No            17
         Yes            7
C000537  No             1
         Not Voting    37
         Yes            3
C000984  No             4
         Not Voting    32
         Yes            5
C001087  No             6
         Not Voting    23
         Yes           12
G000582  Not Voting     1
         Yes            6
H001077  No            14
         Not Voting    15
         Yes           12
J000299  No            20
         Not Voting     7
         Yes           14
M001200  Not Voting    41
M001210  No            16
         Not Voting     1
         Yes            7
N000147  No             6
         Not Voting     1
P000197  Speaker       41
P000610  No             7
R000577  No            11
         Not Voting     8
         Yes           22
R000600  Not Voting     7
S001177  No             7
S001204  No             6
         Not Voting     1
dtype: int64


legs.query("leg_id == 'A000367'")


votes.head(5)


num_yes_or_no_votes_per_member = votes.query("vote == 'Yes' or vote == 'No'").groupby("member").size()
num_yes_or_no_votes_per_member.head(5)

member
A000055    40
A000367    41
A000369    41
A000370    41
A000371    41
dtype: int64


vote_pivot_with_yes_no_count = vote_pivot.merge(num_yes_or_no_votes_per_member.to_frame(), left_index = True, right_index = True, how="outer", ).fillna(0)
vote_pivot_with_yes_no_count = vote_pivot_with_yes_no_count.rename(columns = {0: 'yes_no_count'})
vote_pivot_with_yes_no_count.head(5)


regulars = vote_pivot_with_yes_no_count.query('yes_no_count >= 30')
regulars = regulars.drop('yes_no_count', 1)
regulars.shape

/tmp/ipykernel_190/3052268403.py:2: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.
  regulars = regulars.drop('yes_no_count', 1)

(425, 41)


regulars_centered = regulars - np.mean(regulars, axis = 0)
regulars_centered.head(5)


u, s, vt = np.linalg.svd(regulars_centered, full_matrices = False)


np.round(s**2 / sum(s**2), 2)

array([0.84, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])


pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);


vote2d = pd.DataFrame({
    'member': regulars_centered.index,
    'pc1': pcs[:, 0],
    'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')

vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])


sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);


num_votes = vt.shape[1]
votes = regulars.columns

def plot_pc(k):
    plt.bar(votes, vt[k, :], alpha=0.7)
    plt.xticks(votes, rotation=90);

with plt.rc_context({"figure.figsize": (12, 4)}):
    #plot_pc(0)
    plot_pc(1)


means = np.mean(child_data, axis = 0)
child_data_centered = child_data - np.mean(child_data, axis = 0)
child_data_rank_1_approximation = compute_rank_k_approximation(child_data_centered, 1) + means


sns.scatterplot('mortality', 'fertility', data=child_data)
sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)
plt.legend(['data', 'rank 1 approx'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


sns.scatterplot('mortality', 'fertility', data=child_data)
sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color='black')
plt.legend(['rank 1 approx', 'data']);
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


x, y = child_data['mortality'], child_data['fertility']
slope_x, intercept_x = np.polyfit(x, y, 1) # simple linear regression

scatter14(child_data)

plt.plot(x, slope_x * x + intercept_x, c = 'g')
for _, row in child_data.sample(20).iterrows():
    tx, ty = row['mortality'], row['fertility']
    plt.plot([tx, tx], [slope_x * tx + intercept_x, ty], c='red')
    
plt.legend(['predicted fertility']);

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


x, y = child_data['mortality'], child_data['fertility']
slope_y, intercept_y = np.polyfit(y, x, 1) # simple linear regression

scatter14(child_data)

plt.plot(slope_y * y + intercept_y, y, c = 'purple')
for _, row in child_data.sample(20).iterrows():
    tx, ty = row['mortality'], row['fertility']
    plt.plot([tx, slope_y * ty + intercept_y], [ty, ty], c='red')
    
plt.legend(['predicted mortality']);

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple');
sns.scatterplot('mortality', 'fertility', data=child_data)
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple');
sns.scatterplot('mortality', 'fertility', data=child_data)

for idx, tdata in child_data.reset_index().sample(20).iterrows():
    tx = tdata["mortality"]
    ty = tdata["fertility"]
    tx_projected = child_data_rank_1_approximation.iloc[idx, 0]
    ty_projected = child_data_rank_1_approximation.iloc[idx, 1]
    plt.plot([tx, tx_projected], [ty, ty_projected], c='red')
    
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality']);

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(


def plot_x_regression_y_regression_1d_approximation(data): 
    xname = data.columns[0]
    yname = data.columns[1]
    
    x, y = data[xname], data[yname]
    slope_x, intercept_x = np.polyfit(x, y, 1) # simple linear regression
    
    x, y = data[xname], data[yname]
    slope_y, intercept_y = np.polyfit(y, x, 1) # simple linear regression
    
    means = np.mean(data, axis = 0)
    rank_1_approximation = compute_rank_k_approximation(data - means, 1) + means
    
    sns.lineplot(x=xname, y=yname, data=rank_1_approximation, color="black")
    plt.plot(x, slope_x * x + intercept_x, c = 'g')
    plt.plot(slope_y * y + intercept_y, y, c = 'purple');
    sns.scatterplot(xname, yname, data=data)
    
   
    for idx, tdata in data.reset_index().sample(20).iterrows():
        tx = tdata[xname]
        ty = tdata[yname]
        
        tx_projected = rank_1_approximation.iloc[idx, 0]
        ty_projected = rank_1_approximation.iloc[idx, 1]
        plt.plot([tx, tx_projected], [ty, ty_projected], c='red') 
    
    plt.legend(['1D PCA Subspace', 'predicted ' + xname, 'predicted ' + yname])


plot_x_regression_y_regression_1d_approximation(body_data.drop(41)[["adiposity", "bicep"]])

/srv/conda/envs/notebook/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

	0	1	2	3
96	8.015221	6.984689	55.999828	29.999819
97	2.584341	2.406224	3.982129	9.981131
98	3.619075	3.365328	5.970458	13.968808
99	4.167581	3.819511	11.975551	15.974185

	0	1	2	3
96	9.375531	8.319533	51.861441	35.390129
97	1.381452	1.225854	7.641603	5.214612
98	1.980513	1.757441	10.955353	7.475908
99	2.875538	2.551656	15.906251	10.854389

	mortality	fertility
country
Afghanistan	6.820	4.48
Albania	1.330	1.71
Algeria	2.390	2.71
Angola	8.310	5.62
Antigua and Barbuda	0.816	2.04

	mortality	fertility
country
Afghanistan	6.820	4.48
Albania	1.330	1.71
Algeria	2.390	2.71
Angola	8.310	5.62
Antigua and Barbuda	0.816	2.04

	mortality	fertility
0	6.694067	4.660869
1	1.697627	1.182004
2	2.880467	2.005579
3	8.232160	5.731795
4	1.506198	1.048719

SVD Demo¶

Rank 1 Approximation of 2D Data, Data Centering¶

Interpreting Principal Components¶

Principal Components and Singular Values¶

Singular Values and Variance¶

Practical PCA¶

Congressional Vote Data¶

Let's look at only people who have voted more than 15 times lately¶

Exploring $V^T$¶

PCA vs. Linear Regression (Extra)¶

	% brozek fat	% siri fat	density	age	weight	height	adiposity	fat free weight	neck	chest	abdomen	hip	thigh	knee	ankle	bicep	forearm	wrist
0	12.6	12.3	1.0708	23	154.25	67.75	23.7	134.9	36.2	93.1	85.2	94.5	59.0	37.3	21.9	32.0	27.4	17.1
1	6.9	6.1	1.0853	22	173.25	72.25	23.4	161.3	38.5	93.6	83.0	98.7	58.7	37.3	23.4	30.5	28.9	18.2
2	24.6	25.3	1.0414	22	154.00	66.25	24.7	116.0	34.0	95.8	87.9	99.2	59.6	38.9	24.0	28.8	25.2	16.6
3	10.9	10.4	1.0751	26	184.75	72.25	24.9	164.7	37.4	101.8	86.4	101.2	60.1	37.3	22.8	32.4	29.4	18.2
4	27.8	28.7	1.0340	24	184.25	71.25	25.6	133.1	34.4	97.3	100.0	101.9	63.2	42.2	24.0	32.2	27.7	17.7

	density	abdomen
0	0.957134	85.201277
1	0.932425	83.001717
2	0.987458	87.900606
3	0.970613	86.401174
4	1.123369	99.998996

	density	abdomen
0	0.015226	-7.355952
1	0.029726	-9.555952
2	-0.014174	-4.655952
3	0.019526	-6.155952
4	-0.021574	7.444048

	% brozek fat	% siri fat	density	age	weight	height	adiposity	fat free weight	neck	chest	abdomen	hip	thigh	knee	ankle	bicep	forearm	wrist
0	-6.338492	-6.850794	0.015226	-21.884921	-24.674405	-2.39881	-1.736905	-8.813889	-1.792063	-7.724206	-7.355952	-5.404762	-0.405952	-1.290476	-1.202381	-0.273413	-1.263889	-1.129762
1	-12.038492	-13.050794	0.029726	-22.884921	-5.674405	2.10119	-2.036905	17.586111	0.507937	-7.224206	-9.555952	-1.204762	-0.705952	-1.290476	0.297619	-1.773413	0.236111	-0.029762
2	5.661508	6.149206	-0.014174	-22.884921	-24.924405	-3.89881	-0.736905	-27.713889	-3.992063	-5.024206	-4.655952	-0.704762	0.194048	0.309524	0.897619	-3.473413	-3.463889	-1.629762
3	-8.038492	-8.750794	0.019526	-18.884921	5.825595	2.10119	-0.536905	20.986111	-0.592063	0.975794	-6.155952	1.295238	0.694048	-1.290476	-0.302381	0.126587	0.736111	-0.029762
4	8.861508	9.549206	-0.021574	-20.884921	5.325595	1.10119	0.163095	-10.613889	-3.592063	-3.524206	7.444048	1.995238	3.794048	3.609524	0.897619	-0.073413	-0.963889	-0.529762

	% brozek fat	% siri fat	density	age	weight	height	adiposity	fat free weight	neck	chest	abdomen	hip	thigh	knee	ankle	bicep	forearm	wrist
0	13.732084	13.499443	1.067647	23.406671	155.524219	69.685017	22.382014	134.660313	35.750451	92.404761	81.663261	95.367676	57.832005	36.807149	22.471102	30.513755	27.871398	17.221299
1	6.949481	6.177696	1.084653	22.136786	173.262498	72.818426	22.848506	161.108343	37.281215	94.764794	81.952662	98.256188	59.629903	38.251464	23.494178	31.990484	28.799694	18.014750
2	24.388640	24.995400	1.041272	22.012988	155.515375	66.961498	24.013328	115.107332	35.327536	94.623409	87.487254	96.824325	59.191562	36.564650	22.049234	30.526265	27.727128	16.734867
3	10.535884	10.053321	1.076052	26.287526	185.315285	72.840641	24.486712	164.078362	38.187196	98.649630	87.304458	101.045858	61.263963	39.095345	23.826619	32.953935	29.261160	18.327856
4	27.324362	28.164091	1.034484	23.983222	184.352508	68.475540	26.973751	132.523943	37.446408	101.921954	96.240973	103.147363	63.286670	38.642499	23.123048	32.896046	28.991277	17.530191

	leg_id	first	last	gender	state	chamber	party	birthday
0	B000944	Sherrod	Brown	M	OH	sen	Democrat	1952-11-09
1	C000127	Maria	Cantwell	F	WA	sen	Democrat	1958-10-13
2	C000141	Benjamin	Cardin	M	MD	sen	Democrat	1943-10-05

	chamber	session	roll call	member	vote
0	House	1	555	A000374	Not Voting
1	House	1	555	A000370	Yes
2	House	1	555	A000055	No
3	House	1	555	A000371	Yes
4	House	1	555	A000372	No

roll call	515	516	517	518	519	520	521	522	523	524	...	546	547	548	549	550	551	552	553	554	555
member
A000055	1	0	0	0	1	1	0	1	1	1	...	0	0	1	0	0	1	0	0	1	0
A000367	0	0	0	0	0	0	0	0	0	0	...	0	1	1	1	1	0	1	1	0	1
A000369	1	1	0	0	1	1	0	1	1	1	...	0	0	1	0	0	1	0	0	1	0
A000370	1	1	1	1	1	0	1	0	0	0	...	1	1	1	1	1	0	1	1	1	1
A000371	1	1	1	1	1	0	1	0	0	0	...	1	1	1	1	1	0	1	1	1	1

roll call	515	516	517	518	519	520	521	522	523	524	...	546	547	548	549	550	551	552	553	554	555
member
A000055	0.129252	-0.668934	-0.526077	-0.52381	0.049887	0.587302	-0.562358	0.634921	0.594104	0.560091	...	-0.521542	-0.526077	0.045351	-0.521542	-0.519274	0.54195	-0.521542	-0.535147	0.086168	-0.503401
A000367	-0.870748	-0.668934	-0.526077	-0.52381	-0.950113	-0.412698	-0.562358	-0.365079	-0.405896	-0.439909	...	-0.521542	0.473923	0.045351	0.478458	0.480726	-0.45805	0.478458	0.464853	-0.913832	0.496599
A000369	0.129252	0.331066	-0.526077	-0.52381	0.049887	0.587302	-0.562358	0.634921	0.594104	0.560091	...	-0.521542	-0.526077	0.045351	-0.521542	-0.519274	0.54195	-0.521542	-0.535147	0.086168	-0.503401
A000370	0.129252	0.331066	0.473923	0.47619	0.049887	-0.412698	0.437642	-0.365079	-0.405896	-0.439909	...	0.478458	0.473923	0.045351	0.478458	0.480726	-0.45805	0.478458	0.464853	0.086168	0.496599
A000371	0.129252	0.331066	0.473923	0.47619	0.049887	-0.412698	0.437642	-0.365079	-0.405896	-0.439909	...	0.478458	0.473923	0.045351	0.478458	0.480726	-0.45805	0.478458	0.464853	0.086168	0.496599

	member	pc1	pc2	leg_id	first	last	gender	state	chamber	party	birthday	pc1_jittered	pc2_jittered
0	A000055	3.061356	0.364191	A000055	Robert	Aderholt	M	AL	rep	Republican	1965-07-22	3.128055	0.343226
1	A000367	0.188870	-2.433565	A000367	Justin	Amash	M	MI	rep	Independent	1980-04-18	0.191451	-2.431195
2	A000369	2.844370	0.821619	A000369	Mark	Amodei	M	NV	rep	Republican	1958-06-12	2.766608	0.806439
3	A000370	-2.607536	0.127977	A000370	Alma	Adams	F	NC	rep	Democrat	1946-05-27	-2.512672	0.074580
4	A000371	-2.607536	0.127977	A000371	Pete	Aguilar	M	CA	rep	Democrat	1979-06-19	-2.537368	0.068060

	0	1	2	3
0	8.0	6.0	48.0	28.0
1	2.0	4.0	8.0	12.0
2	1.0	3.0	3.0	8.0
3	9.0	3.0	27.0	24.0

	0	1	2	3
0	8.0	6.0	48.0	28.0
1	2.0	4.0	8.0	12.0
2	1.0	3.0	3.0	8.0
3	9.0	3.0	27.0	24.0

	member	pc1	pc2	leg_id	first	last	gender	state	chamber	party	birthday	pc1_jittered	pc2_jittered
1	A000367	0.188870	-2.433565	A000367	Justin	Amash	M	MI	rep	Independent	1980-04-18	0.191451	-2.431195
6	A000374	1.247134	-3.533196	A000374	Ralph	Abraham	M	LA	rep	Republican	1954-09-16	1.210379	-3.521461
47	B001311	1.695651	-2.093912	B001311	Dan	Bishop	M	NC	rep	Republican	1964-07-01	1.686247	-1.982446
50	C000537	0.699636	-3.394179	C000537	James	Clyburn	M	SC	rep	Democrat	1940-07-21	0.687526	-3.212611
52	C000984	0.531789	-3.099044	C000984	Elijah	Cummings	M	MD	rep	Democrat	1951-01-18	0.583003	-3.131361
69	C001087	2.755060	-1.378193	C001087	Eric	Crawford	M	AR	rep	Republican	1966-01-22	2.756083	-1.270555
150	G000582	2.262007	-2.632452	G000582	Jenniffer	González-Colón	F	PR	rep	Republican	1976-08-05	2.259351	-2.656279
179	H001077	2.509474	-1.349023	H001077	Clay	Higgins	M	LA	rep	Republican	1961-08-24	2.446616	-1.346705
196	J000299	2.908823	-1.094618	J000299	Mike	Johnson	M	LA	rep	Republican	1972-01-30	3.061062	-1.116434
272	M001200	1.247134	-3.533196	M001200	A.	McEachin	M	VA	rep	Democrat	1961-10-10	1.221263	-3.559693
282	M001210	1.695651	-2.093912	M001210	Gregory	Murphy	M	NC	rep	Republican	1963-03-05	1.577282	-1.919511
285	N000147	1.247134	-3.533196	N000147	Eleanor	Norton	F	DC	rep	Democrat	1937-06-13	1.385627	-3.546085
298	P000197	1.247134	-3.533196	P000197	Nancy	Pelosi	F	CA	rep	Democrat	1940-03-26	1.217323	-3.454539
310	P000610	1.247134	-3.533196	P000610	Stacey	Plaskett	F	VI	rep	Democrat	1966-05-13	1.347833	-3.381807
323	R000577	-2.069671	-1.344435	R000577	Tim	Ryan	M	OH	rep	Democrat	1973-07-16	-1.992928	-1.423723
330	R000600	1.247134	-3.533196	R000600	Aumua	Amata	F	AS	rep	Republican	1947-12-29	1.255358	-3.499536
360	S001177	1.247134	-3.533196	S001177	Gregorio	Sablan	M	MP	rep	Democrat	1955-01-19	1.214066	-3.646465
374	S001204	1.247134	-3.533196	S001204	Michael	San Nicolas	M	GU	rep	Democrat	1981-01-30	1.395281	-3.476336

	515	516	517	518	519	520	521	522	523	524	...	546	547	548	549	550	551	552	553	554	555
member
A000055	0.101176	-0.691765	-0.543529	-0.541176	0.023529	0.581176	-0.581176	0.625882	0.588235	0.550588	...	-0.541176	-0.545882	0.014118	-0.538824	-0.536471	0.529412	-0.538824	-0.555294	0.056471	-0.522353
A000367	-0.898824	-0.691765	-0.543529	-0.541176	-0.976471	-0.418824	-0.581176	-0.374118	-0.411765	-0.449412	...	-0.541176	0.454118	0.014118	0.461176	0.463529	-0.470588	0.461176	0.444706	-0.943529	0.477647
A000369	0.101176	0.308235	-0.543529	-0.541176	0.023529	0.581176	-0.581176	0.625882	0.588235	0.550588	...	-0.541176	-0.545882	0.014118	-0.538824	-0.536471	0.529412	-0.538824	-0.555294	0.056471	-0.522353
A000370	0.101176	0.308235	0.456471	0.458824	0.023529	-0.418824	0.418824	-0.374118	-0.411765	-0.449412	...	0.458824	0.454118	0.014118	0.461176	0.463529	-0.470588	0.461176	0.444706	0.056471	0.477647
A000371	0.101176	0.308235	0.456471	0.458824	0.023529	-0.418824	0.418824	-0.374118	-0.411765	-0.449412	...	0.458824	0.454118	0.014118	0.461176	0.463529	-0.470588	0.461176	0.444706	0.056471	0.477647

	515	516	517	518	519	520	521	522	523	524	...	547	548	549	550	551	552	553	554	555	yes_no_count
member
A000055	1	0	0	0	1	1	0	1	1	1	...	0	1	0	0	1	0	0	1	0	40.0
A000367	0	0	0	0	0	0	0	0	0	0	...	1	1	1	1	0	1	1	0	1	41.0
A000369	1	1	0	0	1	1	0	1	1	1	...	0	1	0	0	1	0	0	1	0	41.0
A000370	1	1	1	1	1	0	1	0	0	0	...	1	1	1	1	0	1	1	1	1	41.0
A000371	1	1	1	1	1	0	1	0	0	0	...	1	1	1	1	0	1	1	1	1	41.0