import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(23) #kallisti
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
sns.set()
rectangle = pd.read_csv("rectangle_data.csv")
rectangle.tail(5)
| | width | height | area | perimeter |
---|---|---|---|---|
95 | 8 | 5 | 40 | 26 |
96 | 8 | 7 | 56 | 30 |
97 | 1 | 4 | 4 | 10 |
98 | 1 | 6 | 6 | 14 |
99 | 2 | 6 | 12 | 16 |
Singular value decomposition (SVD) is a numerical technique that automatically decomposes a matrix into a product of other matrices. Given an input matrix $X$, SVD returns matrices $U$, $\Sigma$, and $V^T$ such that $X = U \Sigma V^T$.
u, s, vt = np.linalg.svd(rectangle, full_matrices = False)
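Since we passed full_matrices=False, we get the "economy" decomposition: for our 100-row, 4-column rectangle data, u is 100 by 4, s is a vector of 4 singular values, and vt is 4 by 4. A quick sanity check of the shapes (the printed sizes assume the 100x4 dataset above):

# With full_matrices=False, numpy returns the economy SVD:
# u is (100, 4), s is (4,), and vt is (4, 4) for our 100x4 data.
print(u.shape, s.shape, vt.shape)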
The SVD routine returns $U$, the diagonal entries of $\Sigma$ (as the vector s), and $V^T$ as three separate values. To compute $U \Sigma$ we simply write:
usig = u * s
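This works because broadcasting u * s scales the $i$-th column of u by s[i], which is exactly multiplication by the diagonal matrix $\Sigma$. We can verify the equivalence directly:

# Broadcasting multiplies column i of u by s[i], which matches
# explicit multiplication by the diagonal matrix np.diag(s).
np.allclose(u * s, u @ np.diag(s))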
The two key pieces of the decomposition are $U\Sigma$ and $V^T$, which we can think of for now as analogous to our 'data' and 'transformation operation' from our manual decomposition earlier.
As we did before with our manual decomposition, we can recover our original rectangle data by multiplying the left matrix $U\Sigma$ by the right matrix $V^T$.
pd.DataFrame(usig @ vt).head(4)
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 8.0 | 6.0 | 48.0 | 28.0 |
1 | 2.0 | 4.0 | 8.0 | 12.0 |
2 | 1.0 | 3.0 | 3.0 | 8.0 |
3 | 9.0 | 3.0 | 27.0 | 24.0 |
np.set_printoptions(suppress=True)
usig
array([[-56.30926787,   4.08369641,  -0.76796869,   0.        ],
       [-13.92587137,  -5.61592446,   1.59106852,  -0.        ],
       [ -7.3883695 ,  -5.11089273,   1.51352951,   0.        ],
       ...,
       [ -9.43384352,  -6.24112682,   2.24742798,   0.        ],
       [-13.52479154,  -8.501595  ,   3.71522493,  -0.        ],
       [-19.63685958,  -6.70369623,   3.0747687 ,  -0.        ]])
Naturally, we can instead use only the first 3 columns of usig and the first 3 rows of vt and get back exactly the same result. This is because the last column of usig is all zeros (the fourth singular value is 0).
pd.DataFrame(usig[:, 0:3] @ vt[0:3, :]).head(4)
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
0 | 8.0 | 6.0 | 48.0 | 28.0 |
1 | 2.0 | 4.0 | 8.0 | 12.0 |
2 | 1.0 | 3.0 | 3.0 | 8.0 |
3 | 9.0 | 3.0 | 27.0 | 24.0 |
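Rather than eyeballing the table, we can confirm that the rank 3 reconstruction is exact up to floating point error:

# The fourth singular value is 0, so dropping the last column of usig
# and the last row of vt loses nothing.
np.allclose(rectangle, usig[:, 0:3] @ vt[0:3, :])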
If we use only the first 2 columns of usig and the first 2 rows of vt, we end up with an imperfect reconstruction, but it's surprisingly not bad.
pd.DataFrame(usig[:, 0:2] @ vt[0:2, :]).tail(4)
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
96 | 8.015221 | 6.984689 | 55.999828 | 29.999819 |
97 | 2.584341 | 2.406224 | 3.982129 | 9.981131 |
98 | 3.619075 | 3.365328 | 5.970458 | 13.968808 |
99 | 4.167581 | 3.819511 | 11.975551 | 15.974185 |
Even the one-dimensional approximation is better than you might expect.
pd.DataFrame(usig[:, 0:1] @ vt[0:1, :]).tail(4)
| | 0 | 1 | 2 | 3 |
---|---|---|---|---|
96 | 9.375531 | 8.319533 | 51.861441 | 35.390129 |
97 | 1.381452 | 1.225854 | 7.641603 | 5.214612 |
98 | 1.980513 | 1.757441 | 10.955353 | 7.475908 |
99 | 2.875538 | 2.551656 | 15.906251 | 10.854389 |
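One way to quantify "better than you might expect" is to compute the Frobenius norm of the reconstruction error at each rank (a quick sketch using the matrices above):

# Frobenius norm of the reconstruction error for each rank k.
# The error shrinks as k grows and hits (essentially) zero at rank 3.
for k in range(1, 5):
    error = np.linalg.norm(rectangle - usig[:, 0:k] @ vt[0:k, :])
    print(f"rank {k} error: {error:.2f}")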
# Downloads from https://www.gapminder.org/data/
cm_path = 'child_mortality_0_5_year_olds_dying_per_1000_born.csv'
fe_path = 'children_per_woman_total_fertility.csv'
cm = pd.read_csv(cm_path).set_index('country')['2017'].to_frame()/10 # convert deaths per 1,000 born to deaths per 100 born
fe = pd.read_csv(fe_path).set_index('country')['2017'].to_frame()
child_data = cm.merge(fe, left_index=True, right_index=True).dropna()
child_data.columns = ['mortality', 'fertility']
child_data.head()
country | mortality | fertility |
---|---|---|
Afghanistan | 6.820 | 4.48 |
Albania | 1.330 | 1.71 |
Algeria | 2.390 | 2.71 |
Angola | 8.310 | 5.62 |
Antigua and Barbuda | 0.816 | 2.04 |
def scatter14(data):
    # Scatter mortality vs. fertility on fixed [0, 14] axes.
    sns.scatterplot(x='mortality', y='fertility', data=data)
    plt.xlim([0, 14])
    plt.ylim([0, 14])
    plt.xticks(np.arange(0, 14, 2))
    plt.yticks(np.arange(0, 14, 2))
scatter14(child_data)
sns.scatterplot(x='mortality', y='fertility', data=child_data);
u, s, vt = np.linalg.svd(child_data, full_matrices = False)
child_data_reconstructed = pd.DataFrame(u @ np.diag(s) @ vt, columns = ["mortality", "fertility"], index=child_data.index)
As we'd expect, the product of $U$, $\Sigma$, and $V^T$ recovers the original data perfectly.
child_data_reconstructed.head(5)
country | mortality | fertility |
---|---|---|
Afghanistan | 6.820 | 4.48 |
Albania | 1.330 | 1.71 |
Algeria | 2.390 | 2.71 |
Angola | 8.310 | 5.62 |
Antigua and Barbuda | 0.816 | 2.04 |
sns.scatterplot(x='mortality', y='fertility', data=child_data);
What happens if we throw away a column of $U$, a singular value from $\Sigma$, and a row from $V^T$? In this case we end up with the "rank 1 approximation" of the data.
Looking at the data, we see that it does a surprisingly good job.
#Rather than manually invoking linalg.svd over and over, let's just
#define a function that does the rank approximation in one function call
def compute_rank_k_approximation(data, k):
u, s, vt = np.linalg.svd(data, full_matrices = False)
return pd.DataFrame(u[:, 0:k] @ np.diag(s[0:k]) @ vt[0:k, :], columns = data.columns)
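Note that this function returns a DataFrame with a fresh integer index, which is why the country labels disappear in the output below. If you want to keep the original row labels, a small variant (a hypothetical helper, not used in the rest of this notebook) does the trick:

# Variant that preserves the original row labels (e.g. country names).
def compute_rank_k_approximation_keep_index(data, k):
    u, s, vt = np.linalg.svd(data, full_matrices = False)
    approx = u[:, 0:k] @ np.diag(s[0:k]) @ vt[0:k, :]
    return pd.DataFrame(approx, columns=data.columns, index=data.index)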
#child_data_rank_1_approximation = pd.DataFrame(u[:, :-1] @ np.diag(s[:-1]) @ vt[:-1, :], columns = ["mortality", "fertility"], index=child_data.index)
child_data_rank_1_approximation = compute_rank_k_approximation(child_data, 1)
child_data_rank_1_approximation.head(5)
| | mortality | fertility |
---|---|---|
0 | 6.694067 | 4.660869 |
1 | 1.697627 | 1.182004 |
2 | 2.880467 | 2.005579 |
3 | 8.232160 | 5.731795 |
4 | 1.506198 | 1.048719 |
By plotting the data in 2D, we can see what's going on: we're simply getting the original data projected onto some one-dimensional subspace.
sns.scatterplot(x='mortality', y='fertility', data=child_data_rank_1_approximation);
There's one significant issue with our projection, which we can see by plotting both the original data and our reconstruction on the same axes: the projection passes through the origin, but our data has a non-zero y-intercept.
sns.scatterplot(x='mortality', y='fertility', data=child_data)
sns.scatterplot(x='mortality', y='fertility', data=child_data_rank_1_approximation);
While this y-intercept misalignment isn't terrible here, it can be really bad. For example, consider the 2D dataset below.
#http://jse.amstat.org/datasets/fat.txt
body_data = pd.read_fwf("fat.dat.txt", colspecs = [(9, 13), (17, 21), (23, 29), (35, 37),
(39, 45), (48, 53), (57, 61), (64, 69),
(73, 77), (80, 85), (88, 93), (96, 101),
(105, 109), (113, 117), (121, 125), (129, 133),
(137, 141), (145, 149)],
header=None, names = ["% brozek fat", "% siri fat", "density", "age",
"weight", "height", "adiposity", "fat free weight",
"neck", "chest", "abdomen", "hip", "thigh",
"knee", "ankle", "bicep", "forearm",
"wrist"])
#body_data = body_data.drop(41) #drop the weird record
body_data.head()
| | % brozek fat | % siri fat | density | age | weight | height | adiposity | fat free weight | neck | chest | abdomen | hip | thigh | knee | ankle | bicep | forearm | wrist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12.6 | 12.3 | 1.0708 | 23 | 154.25 | 67.75 | 23.7 | 134.9 | 36.2 | 93.1 | 85.2 | 94.5 | 59.0 | 37.3 | 21.9 | 32.0 | 27.4 | 17.1 |
1 | 6.9 | 6.1 | 1.0853 | 22 | 173.25 | 72.25 | 23.4 | 161.3 | 38.5 | 93.6 | 83.0 | 98.7 | 58.7 | 37.3 | 23.4 | 30.5 | 28.9 | 18.2 |
2 | 24.6 | 25.3 | 1.0414 | 22 | 154.00 | 66.25 | 24.7 | 116.0 | 34.0 | 95.8 | 87.9 | 99.2 | 59.6 | 38.9 | 24.0 | 28.8 | 25.2 | 16.6 |
3 | 10.9 | 10.4 | 1.0751 | 26 | 184.75 | 72.25 | 24.9 | 164.7 | 37.4 | 101.8 | 86.4 | 101.2 | 60.1 | 37.3 | 22.8 | 32.4 | 29.4 | 18.2 |
4 | 27.8 | 28.7 | 1.0340 | 24 | 184.25 | 71.25 | 25.6 | 133.1 | 34.4 | 97.3 | 100.0 | 101.9 | 63.2 | 42.2 | 24.0 | 32.2 | 27.7 | 17.7 |
density_and_abdomen = body_data[["density", "abdomen"]]
density_and_abdomen.head(5)
| | density | abdomen |
---|---|---|
0 | 1.0708 | 85.2 |
1 | 1.0853 | 83.0 |
2 | 1.0414 | 87.9 |
3 | 1.0751 | 86.4 |
4 | 1.0340 | 100.0 |
If we look at the table, the rank 1 approximation looks at least vaguely sane.
density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
density_and_abdomen_rank_1_approximation.head(5)
| | density | abdomen |
---|---|---|
0 | 0.957134 | 85.201277 |
1 | 0.932425 | 83.001717 |
2 | 0.987458 | 87.900606 |
3 | 0.970613 | 86.401174 |
4 | 1.123369 | 99.998996 |
But if we plot on 2D axes, we'll see that things are very wrong.
sns.scatterplot(x="density", y="abdomen", data=body_data)
density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
sns.scatterplot(x="density", y="abdomen", data=body_data)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_rank_1_approximation);
Since the subspace that we're projecting on to is off and to the right, we end up with a bizarre result where our rank 1 approximation believes that density increases with abdomen size, even though the data shows the opposite.
To fix this issue, we should always start the SVD process by zero-centering our data. That is, for each column, we should subtract the mean of that column.
np.mean(density_and_abdomen, axis = 0)
density     1.055574
abdomen    92.555952
dtype: float64
density_and_abdomen_centered = density_and_abdomen - np.mean(density_and_abdomen, axis = 0)
density_and_abdomen_centered.head(5)
| | density | abdomen |
---|---|---|
0 | 0.015226 | -7.355952 |
1 | 0.029726 | -9.555952 |
2 | -0.014174 | -4.655952 |
3 | 0.019526 | -6.155952 |
4 | -0.021574 | 7.444048 |
Now when we do the approximation, things work much better.
density_and_abdomen_centered_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen_centered, 1)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered_rank_1_approximation);
Let's revisit our child mortality and maternal fertility data from before.
sns.scatterplot(data = child_data, x = "mortality", y= "fertility")
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
Since we're going to be doing SVD, let's make sure to center our data first.
np.mean(child_data, axis = 0)
mortality    3.034590
fertility    2.775956
dtype: float64
child_means = np.mean(child_data, axis = 0)
child_data_centered = child_data - child_means
sns.scatterplot(data = child_data_centered, x = "mortality", y= "fertility")
plt.xlim([-3, 11])
plt.ylim([-3, 11])
plt.xticks(np.arange(-3, 11, 2))
plt.yticks(np.arange(-3, 11, 2));
plt.gcf().savefig("mortality_fertility_centered.png", dpi=300, bbox_inches="tight")
Tie in with the manual computation slides.
rectangle_centered = rectangle - np.mean(rectangle, axis = 0)
np.var(rectangle_centered)
width          7.6891
height         5.3475
area         338.7316
perimeter     50.7904
dtype: float64
sum(np.var(rectangle_centered))
402.5586000000001
u, s, vt = np.linalg.svd(rectangle_centered, full_matrices = False)
u[0:5, :]
array([[-0.1339099 ,  0.00592996,  0.03473374, -0.29683623],
       [ 0.08635418, -0.07951453,  0.01494809,  0.71147825],
       [ 0.11776646, -0.12896256,  0.08577428, -0.06531837],
       [-0.02727392,  0.1831771 ,  0.01089507, -0.03105541],
       [-0.258806  , -0.09429499,  0.09027015, -0.03281766]])
np.diag(s)
array([[197.38807512,   0.        ,   0.        ,   0.        ],
       [  0.        ,  27.43462569,   0.        ,   0.        ],
       [  0.        ,   0.        ,  23.26261195,   0.        ],
       [  0.        ,   0.        ,   0.        ,   0.        ]])
s**2/rectangle_centered.shape[0]
array([389.62052198, 7.52658687, 5.41149115, 0. ])
sum(s**2/rectangle_centered.shape[0])
402.5586000000005
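This is no coincidence: for zero-centered data, the total variance is exactly the sum of the squared singular values divided by the number of rows. We can check the identity directly:

# For centered data: sum of column variances == sum(s_i^2) / n.
n = rectangle_centered.shape[0]
np.allclose(sum(np.var(rectangle_centered)), sum(s**2) / n)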
Let's now step back and try to use PCA on our body measurement and congressional voting datasets.
body_data.head(5)
| | % brozek fat | % siri fat | density | age | weight | height | adiposity | fat free weight | neck | chest | abdomen | hip | thigh | knee | ankle | bicep | forearm | wrist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12.6 | 12.3 | 1.0708 | 23 | 154.25 | 67.75 | 23.7 | 134.9 | 36.2 | 93.1 | 85.2 | 94.5 | 59.0 | 37.3 | 21.9 | 32.0 | 27.4 | 17.1 |
1 | 6.9 | 6.1 | 1.0853 | 22 | 173.25 | 72.25 | 23.4 | 161.3 | 38.5 | 93.6 | 83.0 | 98.7 | 58.7 | 37.3 | 23.4 | 30.5 | 28.9 | 18.2 |
2 | 24.6 | 25.3 | 1.0414 | 22 | 154.00 | 66.25 | 24.7 | 116.0 | 34.0 | 95.8 | 87.9 | 99.2 | 59.6 | 38.9 | 24.0 | 28.8 | 25.2 | 16.6 |
3 | 10.9 | 10.4 | 1.0751 | 26 | 184.75 | 72.25 | 24.9 | 164.7 | 37.4 | 101.8 | 86.4 | 101.2 | 60.1 | 37.3 | 22.8 | 32.4 | 29.4 | 18.2 |
4 | 27.8 | 28.7 | 1.0340 | 24 | 184.25 | 71.25 | 25.6 | 133.1 | 34.4 | 97.3 | 100.0 | 101.9 | 63.2 | 42.2 | 24.0 | 32.2 | 27.7 | 17.7 |
u, s, vt = np.linalg.svd(body_data, full_matrices = False)
We see that some of our singular values capture more variance than others.
s
array([5020.52705061, 262.08505563, 254.54343189, 141.81242773, 59.87838101, 52.91069761, 41.65678998, 31.28233064, 26.02717253, 24.49195578, 22.11282152, 20.52806948, 18.66671232, 16.7184654 , 12.42776311, 7.7927621 , 1.9706795 , 0.27219341])
Or we can compute the fraction of the variance captured by each principal component. The result seems shocking at first, as our data appears to be effectively rank 1.
np.round(s**2 / sum(s**2), 2)
array([0.99, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
This seems absurd, as there are clearly several variables that we expect to show significant variation independent of each other, e.g. weight, height, and age. If this happens to you, it's probably because you forgot to center your data!
body_data_centered = body_data - np.mean(body_data, axis = 0)
body_data_centered.head(5)
| | % brozek fat | % siri fat | density | age | weight | height | adiposity | fat free weight | neck | chest | abdomen | hip | thigh | knee | ankle | bicep | forearm | wrist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -6.338492 | -6.850794 | 0.015226 | -21.884921 | -24.674405 | -2.39881 | -1.736905 | -8.813889 | -1.792063 | -7.724206 | -7.355952 | -5.404762 | -0.405952 | -1.290476 | -1.202381 | -0.273413 | -1.263889 | -1.129762 |
1 | -12.038492 | -13.050794 | 0.029726 | -22.884921 | -5.674405 | 2.10119 | -2.036905 | 17.586111 | 0.507937 | -7.224206 | -9.555952 | -1.204762 | -0.705952 | -1.290476 | 0.297619 | -1.773413 | 0.236111 | -0.029762 |
2 | 5.661508 | 6.149206 | -0.014174 | -22.884921 | -24.924405 | -3.89881 | -0.736905 | -27.713889 | -3.992063 | -5.024206 | -4.655952 | -0.704762 | 0.194048 | 0.309524 | 0.897619 | -3.473413 | -3.463889 | -1.629762 |
3 | -8.038492 | -8.750794 | 0.019526 | -18.884921 | 5.825595 | 2.10119 | -0.536905 | 20.986111 | -0.592063 | 0.975794 | -6.155952 | 1.295238 | 0.694048 | -1.290476 | -0.302381 | 0.126587 | 0.736111 | -0.029762 |
4 | 8.861508 | 9.549206 | -0.021574 | -20.884921 | 5.325595 | 1.10119 | 0.163095 | -10.613889 | -3.592063 | -3.524206 | 7.444048 | 1.995238 | 3.794048 | 3.609524 | 0.897619 | -0.073413 | -0.963889 | -0.529762 |
u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)
This time, we see that the top singular value is no longer as dominant.
s
array([586.34726844, 261.81908925, 167.11383507, 59.97080768, 53.33024862, 42.24150036, 34.80110599, 29.28559968, 25.54261691, 23.91816652, 20.83965308, 20.47360977, 18.54040972, 16.27886966, 12.37794822, 7.72010337, 1.95363897, 0.04487061])
Looking now at the fraction of the variance captured by each principal component, we see that the top 2 or 3 principal components capture quite a lot of the variance.
np.round(s**2 / sum(s**2), 2)
array([0.76, 0.15, 0.06, 0.01, 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
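The cumulative sum makes the "top 2 or 3" claim precise:

# Cumulative fraction of variance captured by the top k principal components.
# The first three components capture roughly 97% of the total variance.
np.round(np.cumsum(s**2) / sum(s**2), 2)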
We can also show this in the form of what is usually called a "scree plot".
plt.plot(s**2);
Thus, we expect that if we were to do a rank 3 approximation, we should get back data that's pretty close to where we started, as just those 3 dimensions capture 97% of the variance.
body_data_rank_3_approximation = compute_rank_k_approximation(body_data_centered, 3) + np.mean(body_data, axis = 0)
body_data_rank_3_approximation.head(5)
| | % brozek fat | % siri fat | density | age | weight | height | adiposity | fat free weight | neck | chest | abdomen | hip | thigh | knee | ankle | bicep | forearm | wrist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 13.732084 | 13.499443 | 1.067647 | 23.406671 | 155.524219 | 69.685017 | 22.382014 | 134.660313 | 35.750451 | 92.404761 | 81.663261 | 95.367676 | 57.832005 | 36.807149 | 22.471102 | 30.513755 | 27.871398 | 17.221299 |
1 | 6.949481 | 6.177696 | 1.084653 | 22.136786 | 173.262498 | 72.818426 | 22.848506 | 161.108343 | 37.281215 | 94.764794 | 81.952662 | 98.256188 | 59.629903 | 38.251464 | 23.494178 | 31.990484 | 28.799694 | 18.014750 |
2 | 24.388640 | 24.995400 | 1.041272 | 22.012988 | 155.515375 | 66.961498 | 24.013328 | 115.107332 | 35.327536 | 94.623409 | 87.487254 | 96.824325 | 59.191562 | 36.564650 | 22.049234 | 30.526265 | 27.727128 | 16.734867 |
3 | 10.535884 | 10.053321 | 1.076052 | 26.287526 | 185.315285 | 72.840641 | 24.486712 | 164.078362 | 38.187196 | 98.649630 | 87.304458 | 101.045858 | 61.263963 | 39.095345 | 23.826619 | 32.953935 | 29.261160 | 18.327856 |
4 | 27.324362 | 28.164091 | 1.034484 | 23.983222 | 184.352508 | 68.475540 | 26.973751 | 132.523943 | 37.446408 | 101.921954 | 96.240973 | 103.147363 | 63.286670 | 38.642499 | 23.123048 | 32.896046 | 28.991277 | 17.530191 |
body_data.head(5)
| | % brozek fat | % siri fat | density | age | weight | height | adiposity | fat free weight | neck | chest | abdomen | hip | thigh | knee | ankle | bicep | forearm | wrist |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 12.6 | 12.3 | 1.0708 | 23 | 154.25 | 67.75 | 23.7 | 134.9 | 36.2 | 93.1 | 85.2 | 94.5 | 59.0 | 37.3 | 21.9 | 32.0 | 27.4 | 17.1 |
1 | 6.9 | 6.1 | 1.0853 | 22 | 173.25 | 72.25 | 23.4 | 161.3 | 38.5 | 93.6 | 83.0 | 98.7 | 58.7 | 37.3 | 23.4 | 30.5 | 28.9 | 18.2 |
2 | 24.6 | 25.3 | 1.0414 | 22 | 154.00 | 66.25 | 24.7 | 116.0 | 34.0 | 95.8 | 87.9 | 99.2 | 59.6 | 38.9 | 24.0 | 28.8 | 25.2 | 16.6 |
3 | 10.9 | 10.4 | 1.0751 | 26 | 184.75 | 72.25 | 24.9 | 164.7 | 37.4 | 101.8 | 86.4 | 101.2 | 60.1 | 37.3 | 22.8 | 32.4 | 29.4 | 18.2 |
4 | 27.8 | 28.7 | 1.0340 | 24 | 184.25 | 71.25 | 25.6 | 133.1 | 34.4 | 97.3 | 100.0 | 101.9 | 63.2 | 42.2 | 24.0 | 32.2 | 27.7 | 17.7 |
One very interesting thing we can do is try to plot the principal components themselves. In this case, let's plot only the first two.
u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
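As a sanity check on the interpretation of pcs, note that $X = U \Sigma V^T$ implies $XV = U\Sigma$: the principal components are just the centered data projected onto the rows of $V^T$.

# X V = U Sigma: the principal components equal the centered data
# projected onto the principal component directions (rows of vt).
np.allclose(pcs, body_data_centered @ vt.T)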
np.argmax(pcs[:, 0])
38
body_data.iloc[38, :]
% brozek fat        33.8000
% siri fat          35.2000
density              1.0202
age                 46.0000
weight             363.1500
height              72.2500
adiposity           48.9000
fat free weight    240.5000
neck                51.2000
chest              136.2000
abdomen            148.1000
hip                147.7000
thigh               87.3000
knee                49.1000
ankle               29.6000
bicep               45.0000
forearm             29.0000
wrist               21.4000
Name: 38, dtype: float64
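Row 38 appears to be the heaviest individual in the dataset, which makes sense if the first principal component is dominated by overall body size. We can look at the other extreme as well:

# The opposite extreme along the first principal component should be
# one of the smallest individuals (an assumption worth checking).
body_data.iloc[np.argmin(pcs[:, 0]), :]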
from ds100_utils import fetch_and_cache
import yaml
from datetime import datetime
#base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'
#legislators_path = 'legislators-current.yaml'
#f = fetch_and_cache(base_url + legislators_path, legislators_path)
legislators_data = yaml.safe_load(open('./data/legislators-2019.yaml'))
def to_date(s):
return datetime.strptime(s, '%Y-%m-%d')
legs = pd.DataFrame(
columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
data=[[x['id']['bioguide'],
x['name']['first'],
x['name']['last'],
x['bio']['gender'],
x['terms'][-1]['state'],
x['terms'][-1]['type'],
x['terms'][-1]['party'],
to_date(x['bio']['birthday'])] for x in legislators_data])
legs.head(3)
| | leg_id | first | last | gender | state | chamber | party | birthday |
---|---|---|---|---|---|---|---|---|
0 | B000944 | Sherrod | Brown | M | OH | sen | Democrat | 1952-11-09 |
1 | C000127 | Maria | Cantwell | F | WA | sen | Democrat | 1958-10-13 |
2 | C000141 | Benjamin | Cardin | M | MD | sen | Democrat | 1943-10-05 |
# February 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('votes.csv')
votes = votes.astype({"roll call": str})
votes.head()
| | chamber | session | roll call | member | vote |
---|---|---|---|---|---|
0 | House | 1 | 555 | A000374 | Not Voting |
1 | House | 1 | 555 | A000370 | Yes |
2 | House | 1 | 555 | A000055 | No |
3 | House | 1 | 555 | A000371 | Yes |
4 | House | 1 | 555 | A000372 | No |
def was_yes(s):
    # Each member casts one vote per roll call, so the group passed in
    # has a single entry; return 1 if it was 'Yes' and 0 otherwise.
    if s.iloc[0] == 'Yes':
        return 1
    else:
        return 0
vote_pivot = votes.pivot_table(index='member',
columns='roll call',
values='vote',
aggfunc=was_yes,
fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()
(441, 41)
member | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A000055 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
A000367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 |
A000369 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
A000370 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
A000371 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 |
5 rows × 41 columns
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered.head(5)
member | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A000055 | 0.129252 | -0.668934 | -0.526077 | -0.52381 | 0.049887 | 0.587302 | -0.562358 | 0.634921 | 0.594104 | 0.560091 | ... | -0.521542 | -0.526077 | 0.045351 | -0.521542 | -0.519274 | 0.54195 | -0.521542 | -0.535147 | 0.086168 | -0.503401 |
A000367 | -0.870748 | -0.668934 | -0.526077 | -0.52381 | -0.950113 | -0.412698 | -0.562358 | -0.365079 | -0.405896 | -0.439909 | ... | -0.521542 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | -0.913832 | 0.496599 |
A000369 | 0.129252 | 0.331066 | -0.526077 | -0.52381 | 0.049887 | 0.587302 | -0.562358 | 0.634921 | 0.594104 | 0.560091 | ... | -0.521542 | -0.526077 | 0.045351 | -0.521542 | -0.519274 | 0.54195 | -0.521542 | -0.535147 | 0.086168 | -0.503401 |
A000370 | 0.129252 | 0.331066 | 0.473923 | 0.47619 | 0.049887 | -0.412698 | 0.437642 | -0.365079 | -0.405896 | -0.439909 | ... | 0.478458 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | 0.086168 | 0.496599 |
A000371 | 0.129252 | 0.331066 | 0.473923 | 0.47619 | 0.049887 | -0.412698 | 0.437642 | -0.365079 | -0.405896 | -0.439909 | ... | 0.478458 | 0.473923 | 0.045351 | 0.478458 | 0.480726 | -0.45805 | 0.478458 | 0.464853 | 0.086168 | 0.496599 |
5 rows × 41 columns
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)
np.round(s**2 / sum(s**2), 2)
array([0.8 , 0.05, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
plt.plot(s**2);
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
vote2d = pd.DataFrame({
'member': vote_pivot.index,
'pc1': pcs[:, 0],
'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d[vote2d['pc1'] < 0]['party'].value_counts()
Democrat    231
Name: party, dtype: int64
#top right only
vote2d.query('pc2 > -2 and pc1 > 0')['party'].value_counts()
Republican    194
Name: party, dtype: int64
sns.scatterplot(x="pc1", y="pc2", hue="party", data = vote2d);
vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);
vote2d.head(5)
| | member | pc1 | pc2 | leg_id | first | last | gender | state | chamber | party | birthday | pc1_jittered | pc2_jittered |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A000055 | 3.061356 | 0.364191 | A000055 | Robert | Aderholt | M | AL | rep | Republican | 1965-07-22 | 3.128055 | 0.343226 |
1 | A000367 | 0.188870 | -2.433565 | A000367 | Justin | Amash | M | MI | rep | Independent | 1980-04-18 | 0.191451 | -2.431195 |
2 | A000369 | 2.844370 | 0.821619 | A000369 | Mark | Amodei | M | NV | rep | Republican | 1958-06-12 | 2.766608 | 0.806439 |
3 | A000370 | -2.607536 | 0.127977 | A000370 | Alma | Adams | F | NC | rep | Democrat | 1946-05-27 | -2.512672 | 0.074580 |
4 | A000371 | -2.607536 | 0.127977 | A000371 | Pete | Aguilar | M | CA | rep | Democrat | 1979-06-19 | -2.537368 | 0.068060 |
vote2d[vote2d['pc1'] > 0]['party'].value_counts()
Republican     199
Democrat         8
Independent      1
Name: party, dtype: int64
vote2d[vote2d['pc2'] < -1]
| | member | pc1 | pc2 | leg_id | first | last | gender | state | chamber | party | birthday | pc1_jittered | pc2_jittered |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | A000367 | 0.188870 | -2.433565 | A000367 | Justin | Amash | M | MI | rep | Independent | 1980-04-18 | 0.191451 | -2.431195 |
6 | A000374 | 1.247134 | -3.533196 | A000374 | Ralph | Abraham | M | LA | rep | Republican | 1954-09-16 | 1.210379 | -3.521461 |
47 | B001311 | 1.695651 | -2.093912 | B001311 | Dan | Bishop | M | NC | rep | Republican | 1964-07-01 | 1.686247 | -1.982446 |
50 | C000537 | 0.699636 | -3.394179 | C000537 | James | Clyburn | M | SC | rep | Democrat | 1940-07-21 | 0.687526 | -3.212611 |
52 | C000984 | 0.531789 | -3.099044 | C000984 | Elijah | Cummings | M | MD | rep | Democrat | 1951-01-18 | 0.583003 | -3.131361 |
69 | C001087 | 2.755060 | -1.378193 | C001087 | Eric | Crawford | M | AR | rep | Republican | 1966-01-22 | 2.756083 | -1.270555 |
150 | G000582 | 2.262007 | -2.632452 | G000582 | Jenniffer | González-Colón | F | PR | rep | Republican | 1976-08-05 | 2.259351 | -2.656279 |
179 | H001077 | 2.509474 | -1.349023 | H001077 | Clay | Higgins | M | LA | rep | Republican | 1961-08-24 | 2.446616 | -1.346705 |
196 | J000299 | 2.908823 | -1.094618 | J000299 | Mike | Johnson | M | LA | rep | Republican | 1972-01-30 | 3.061062 | -1.116434 |
272 | M001200 | 1.247134 | -3.533196 | M001200 | A. | McEachin | M | VA | rep | Democrat | 1961-10-10 | 1.221263 | -3.559693 |
282 | M001210 | 1.695651 | -2.093912 | M001210 | Gregory | Murphy | M | NC | rep | Republican | 1963-03-05 | 1.577282 | -1.919511 |
285 | N000147 | 1.247134 | -3.533196 | N000147 | Eleanor | Norton | F | DC | rep | Democrat | 1937-06-13 | 1.385627 | -3.546085 |
298 | P000197 | 1.247134 | -3.533196 | P000197 | Nancy | Pelosi | F | CA | rep | Democrat | 1940-03-26 | 1.217323 | -3.454539 |
310 | P000610 | 1.247134 | -3.533196 | P000610 | Stacey | Plaskett | F | VI | rep | Democrat | 1966-05-13 | 1.347833 | -3.381807 |
323 | R000577 | -2.069671 | -1.344435 | R000577 | Tim | Ryan | M | OH | rep | Democrat | 1973-07-16 | -1.992928 | -1.423723 |
330 | R000600 | 1.247134 | -3.533196 | R000600 | Aumua | Amata | F | AS | rep | Republican | 1947-12-29 | 1.255358 | -3.499536 |
360 | S001177 | 1.247134 | -3.533196 | S001177 | Gregorio | Sablan | M | MP | rep | Democrat | 1955-01-19 | 1.214066 | -3.646465 |
374 | S001204 | 1.247134 | -3.533196 | S001204 | Michael | San Nicolas | M | GU | rep | Democrat | 1981-01-30 | 1.395281 | -3.476336 |
df = votes[votes['member'].isin(vote2d[vote2d['pc2'] < -1]['member'])]
df.groupby(['member', 'vote']).size()
member   vote      
A000367  No            31
         Yes           10
A000374  Not Voting    41
B001311  No            17
         Yes            7
C000537  No             1
         Not Voting    37
         Yes            3
C000984  No             4
         Not Voting    32
         Yes            5
C001087  No             6
         Not Voting    23
         Yes           12
G000582  Not Voting     1
         Yes            6
H001077  No            14
         Not Voting    15
         Yes           12
J000299  No            20
         Not Voting     7
         Yes           14
M001200  Not Voting    41
M001210  No            16
         Not Voting     1
         Yes            7
N000147  No             6
         Not Voting     1
P000197  Speaker       41
P000610  No             7
R000577  No            11
         Not Voting     8
         Yes           22
R000600  Not Voting     7
S001177  No             7
S001204  No             6
         Not Voting     1
dtype: int64
legs.query("leg_id == 'A000367'")
| | leg_id | first | last | gender | state | chamber | party | birthday |
---|---|---|---|---|---|---|---|---|
33 | A000367 | Justin | Amash | M | MI | rep | Independent | 1980-04-18 |
votes.head(5)
| | chamber | session | roll call | member | vote |
---|---|---|---|---|---|
0 | House | 1 | 555 | A000374 | Not Voting |
1 | House | 1 | 555 | A000370 | Yes |
2 | House | 1 | 555 | A000055 | No |
3 | House | 1 | 555 | A000371 | Yes |
4 | House | 1 | 555 | A000372 | No |
num_yes_or_no_votes_per_member = votes.query("vote == 'Yes' or vote == 'No'").groupby("member").size()
num_yes_or_no_votes_per_member.head(5)
member
A000055    40
A000367    41
A000369    41
A000370    41
A000371    41
dtype: int64
vote_pivot_with_yes_no_count = vote_pivot.merge(num_yes_or_no_votes_per_member.to_frame(),
                                                left_index=True, right_index=True, how="outer").fillna(0)
vote_pivot_with_yes_no_count = vote_pivot_with_yes_no_count.rename(columns = {0: 'yes_no_count'})
vote_pivot_with_yes_no_count.head(5)
member | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | yes_no_count |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A000055 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 40.0 |
A000367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 41.0 |
A000369 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 41.0 |
A000370 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 41.0 |
A000371 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | ... | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 41.0 |
5 rows × 42 columns
regulars = vote_pivot_with_yes_no_count.query('yes_no_count >= 30')
regulars = regulars.drop(columns='yes_no_count')
regulars.shape
(425, 41)
regulars_centered = regulars - np.mean(regulars, axis = 0)
regulars_centered.head(5)
member | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | ... | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A000055 | 0.101176 | -0.691765 | -0.543529 | -0.541176 | 0.023529 | 0.581176 | -0.581176 | 0.625882 | 0.588235 | 0.550588 | ... | -0.541176 | -0.545882 | 0.014118 | -0.538824 | -0.536471 | 0.529412 | -0.538824 | -0.555294 | 0.056471 | -0.522353 |
A000367 | -0.898824 | -0.691765 | -0.543529 | -0.541176 | -0.976471 | -0.418824 | -0.581176 | -0.374118 | -0.411765 | -0.449412 | ... | -0.541176 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | -0.943529 | 0.477647 |
A000369 | 0.101176 | 0.308235 | -0.543529 | -0.541176 | 0.023529 | 0.581176 | -0.581176 | 0.625882 | 0.588235 | 0.550588 | ... | -0.541176 | -0.545882 | 0.014118 | -0.538824 | -0.536471 | 0.529412 | -0.538824 | -0.555294 | 0.056471 | -0.522353 |
A000370 | 0.101176 | 0.308235 | 0.456471 | 0.458824 | 0.023529 | -0.418824 | 0.418824 | -0.374118 | -0.411765 | -0.449412 | ... | 0.458824 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | 0.056471 | 0.477647 |
A000371 | 0.101176 | 0.308235 | 0.456471 | 0.458824 | 0.023529 | -0.418824 | 0.418824 | -0.374118 | -0.411765 | -0.449412 | ... | 0.458824 | 0.454118 | 0.014118 | 0.461176 | 0.463529 | -0.470588 | 0.461176 | 0.444706 | 0.056471 | 0.477647 |
5 rows × 41 columns
u, s, vt = np.linalg.svd(regulars_centered, full_matrices = False)
np.round(s**2 / sum(s**2), 2)
array([0.84, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
vote2d = pd.DataFrame({
'member': regulars_centered.index,
'pc1': pcs[:, 0],
'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);
We can also look at $V^T$ directly to try to gain insight into why each principal component is the way it is.
num_votes = vt.shape[1]
vote_names = regulars.columns # renamed so we don't clobber the `votes` DataFrame
def plot_pc(k):
    # Bar chart of principal component k's loadings across the roll call votes.
    plt.bar(vote_names, vt[k, :], alpha=0.7)
    plt.xticks(vote_names, rotation=90)
with plt.rc_context({"figure.figsize": (12, 4)}):
#plot_pc(0)
plot_pc(1)
Using SVD for PCA can lead to a hard-to-interpret $V^T$ matrix, for example because each principal component is generally a linear combination of all of the original features.
There exist other methods for doing PCA, e.g. sparse PCA, that try to produce a more interpretable version of $V^T$.
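For example, scikit-learn ships a SparsePCA implementation. A minimal sketch, assuming scikit-learn is available in your environment (this is not part of the original analysis):

# Sparse PCA drives many loadings to exactly zero, so each component
# depends on only a handful of roll call votes.
from sklearn.decomposition import SparsePCA

sparse_pca = SparsePCA(n_components=2, random_state=23)
sparse_pca.fit(regulars_centered)
sparse_pca.components_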
Let's compare PCA to Linear Regression
Returning to our child mortality data from before: if we zero-center the child data, we get back a better result. Note that we have to add the mean of each column back in to get things into the right units.
means = np.mean(child_data, axis = 0)
child_data_centered = child_data - np.mean(child_data, axis = 0)
child_data_rank_1_approximation = compute_rank_k_approximation(child_data_centered, 1) + means
sns.scatterplot(x='mortality', y='fertility', data=child_data)
sns.scatterplot(x='mortality', y='fertility', data=child_data_rank_1_approximation)
plt.legend(['data', 'rank 1 approx'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
We can also give our rank 1 approximation as a line, showing the 1D subspace (in black) that our data is being projected onto.
sns.scatterplot(x='mortality', y='fertility', data=child_data)
sns.lineplot(x='mortality', y='fertility', data=child_data_rank_1_approximation, color='black')
plt.legend(['rank 1 approx', 'data']);
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
This plot probably brings to mind linear regression from Data 8. But PCA is NOT the same thing as linear regression. Let's plot the regression lines for this data for comparison. Recall that the regression line gives us, e.g., the best possible linear prediction of fertility given mortality.
x, y = child_data['mortality'], child_data['fertility']
slope_x, intercept_x = np.polyfit(x, y, 1) # simple linear regression of fertility on mortality
scatter14(child_data)
plt.plot(x, slope_x * x + intercept_x, c = 'g')
for _, row in child_data.sample(20).iterrows():
tx, ty = row['mortality'], row['fertility']
plt.plot([tx, tx], [slope_x * tx + intercept_x, ty], c='red')
plt.legend(['predicted fertility']);
In the plot above, the green regression line minimizes the sum of the squared errors, shown as red vertical lines.
We could also do the opposite thing, and try to predict mortality from fertility.
x, y = child_data['mortality'], child_data['fertility']
slope_y, intercept_y = np.polyfit(y, x, 1) # simple linear regression of mortality on fertility
scatter14(child_data)
plt.plot(slope_y * y + intercept_y, y, c = 'purple')
for _, row in child_data.sample(20).iterrows():
tx, ty = row['mortality'], row['fertility']
plt.plot([tx, slope_y * ty + intercept_y], [ty, ty], c='red')
plt.legend(['predicted mortality']);
In the plot above, the purple regression line minimizes the sum of the squared errors, shown as red horizontal lines.
Plotting the two regression lines and the 1D subspace chosen by PCA, we see that all three are distinct!
sns.lineplot(x='mortality', y='fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple')
sns.scatterplot(x='mortality', y='fertility', data=child_data)
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
Given that the green line minimizes the "vertical error" and the purple line minimizes the "horizontal error," you might wonder what the black line minimizes. It turns out that it minimizes the "diagonal" error, i.e. the error in the direction perpendicular to the line itself.
sns.lineplot(x='mortality', y='fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple')
sns.scatterplot(x='mortality', y='fertility', data=child_data)
for idx, tdata in child_data.reset_index().sample(20).iterrows():
tx = tdata["mortality"]
ty = tdata["fertility"]
tx_projected = child_data_rank_1_approximation.iloc[idx, 0]
ty_projected = child_data_rank_1_approximation.iloc[idx, 1]
plt.plot([tx, tx_projected], [ty, ty_projected], c='red')
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality']);
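By the Eckart-Young theorem, the total squared perpendicular error of the rank 1 approximation equals the sum of the squared discarded singular values, which for this 2D dataset is just the second one. A quick numerical check on the centered data from earlier:

# Total squared perpendicular (projection) error of the rank 1 approximation
# equals the squared second singular value for 2D data.
u, s, vt = np.linalg.svd(child_data_centered, full_matrices = False)
rank_1 = u[:, 0:1] @ np.diag(s[0:1]) @ vt[0:1, :]
squared_error = np.sum((child_data_centered.to_numpy() - rank_1)**2)
np.allclose(squared_error, s[1]**2)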
The function in the following cell makes it easy to make similar plots for whatever dataset you might be interested in.
def plot_x_regression_y_regression_1d_approximation(data):
    # Plot both regression lines and the 1D PCA subspace for a 2-column DataFrame.
    xname = data.columns[0]
    yname = data.columns[1]
    x, y = data[xname], data[yname]
    slope_x, intercept_x = np.polyfit(x, y, 1) # regression of y on x
    slope_y, intercept_y = np.polyfit(y, x, 1) # regression of x on y
    means = np.mean(data, axis = 0)
    rank_1_approximation = compute_rank_k_approximation(data - means, 1) + means
    sns.lineplot(x=xname, y=yname, data=rank_1_approximation, color="black")
    plt.plot(x, slope_x * x + intercept_x, c = 'g')
    plt.plot(slope_y * y + intercept_y, y, c = 'purple')
    sns.scatterplot(x=xname, y=yname, data=data)
    for idx, tdata in data.reset_index().sample(20).iterrows():
        tx = tdata[xname]
        ty = tdata[yname]
        tx_projected = rank_1_approximation.iloc[idx, 0]
        ty_projected = rank_1_approximation.iloc[idx, 1]
        plt.plot([tx, tx_projected], [ty, ty_projected], c='red')
    # Legend labels follow plot order: the green line predicts yname, the purple line predicts xname.
    plt.legend(['1D PCA Subspace', 'predicted ' + yname, 'predicted ' + xname])
plot_x_regression_y_regression_1d_approximation(body_data.drop(41)[["adiposity", "bicep"]])