by Josh Hug (Fall 2019)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(23) #kallisti
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
sns.set()
def compute_rank_k_approximation(data, k):
    """Return the best rank k approximation of `data` as a DataFrame, computed via the SVD."""
    u, s, vt = np.linalg.svd(data, full_matrices=False)
    return pd.DataFrame(u[:, 0:k] @ np.diag(s[0:k]) @ vt[0:k, :],
                        columns=data.columns, index=data.index)
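As a quick sanity check of this helper (on a small synthetic DataFrame; the names here are illustrative, not one of the lecture datasets), a rank k approximation has the same shape as its input, and using the full rank reproduces the input exactly.
demo = pd.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c'])  # synthetic example data
print(compute_rank_k_approximation(demo, 1).shape)                   # same shape as demo
print(np.allclose(compute_rank_k_approximation(demo, 3), demo))      # True: full rank recovers the data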
# Downloads from https://www.gapminder.org/data/
cm_path = 'child_mortality_0_5_year_olds_dying_per_1000_born.csv'
fe_path = 'children_per_woman_total_fertility.csv'
cm = pd.read_csv(cm_path).set_index('country')['2017'].to_frame()/10  # rescale from deaths per 1,000 born to deaths per 100 born
fe = pd.read_csv(fe_path).set_index('country')['2017'].to_frame()
child_data = cm.merge(fe, left_index=True, right_index=True).dropna()
child_data.columns = ['mortality', 'fertility']
child_data.head()
def scatter14(data):
    """Scatter mortality against fertility with both axes fixed to the range [0, 14]."""
    sns.scatterplot('mortality', 'fertility', data=data)
    plt.xlim([0, 14])
    plt.ylim([0, 14])
    plt.xticks(np.arange(0, 14, 2))
    plt.yticks(np.arange(0, 14, 2))
scatter14(child_data)
sns.scatterplot('mortality', 'fertility', data=child_data)
u, s, vt = np.linalg.svd(child_data, full_matrices = False)
child_data_reconstructed = pd.DataFrame(u @ np.diag(s) @ vt, columns = ["mortality", "fertility"], index=child_data.index)
As we'd expect, the product of $U$, $\Sigma$, and $V^T$ recovers the original data perfectly.
child_data_reconstructed.head(5)
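We can confirm this numerically:
np.allclose(child_data, child_data_reconstructed)  # expected: True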
sns.scatterplot('mortality', 'fertility', data=child_data_reconstructed)
What happens if we throw away a column of $U$, a singular value from $\Sigma$, and a row from $V^T$? In this case we end up with the "rank 1 approximation" of the data.
Looking at the data, we see that it does a surprisingly good job.
#child_data_rank_1_approximation = pd.DataFrame(u[:, :-1] @ np.diag(s[:-1]) @ vt[:-1, :], columns = ["mortality", "fertility"], index=child_data.index)
child_data_rank_1_approximation = compute_rank_k_approximation(child_data, 1)
child_data_rank_1_approximation.head(5)
By plotting the data in 2D, we can see what's going on: we're simply getting the original data projected onto some 1-dimensional subspace.
sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)
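In fact, this rank 1 approximation is exactly the original data projected onto the first right singular vector. A quick check, using the u, s, and vt computed above:
v1 = vt[0, :]                                              # first right singular vector
projection = child_data.to_numpy() @ np.outer(v1, v1)      # project each row onto span{v1}
np.allclose(projection, child_data_rank_1_approximation)   # expected: True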
There's one significant issue with our projection, which we can see by plotting both the original data and our reconstruction on the same axes: the projection passes through the origin, but our data has a non-zero y-intercept.
sns.scatterplot('mortality', 'fertility', data=child_data)
sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)
While this y-intercept misalignment isn't terrible here, it can be much worse. For example, consider the 2D dataset below, taken from the body measurements dataset from the previous lecture.
#http://jse.amstat.org/datasets/fat.txt
body_data = pd.read_fwf("fat.dat.txt", colspecs = [(9, 13), (17, 21), (23, 29), (35, 37),
(39, 45), (48, 53), (57, 61), (64, 69),
(73, 77), (80, 85), (88, 93), (96, 101),
(105, 109), (113, 117), (121, 125), (129, 133),
(137, 141), (145, 149)],
header=None, names = ["% brozek fat", "% siri fat", "density", "age",
"weight", "height", "adiposity", "fat free weight",
"neck", "chest", "abdomen", "hip", "thigh",
"knee", "ankle", "bicep", "forearm",
"wrist"])
#body_data = body_data.drop(41) #drop the weird record
body_data.head()
density_and_abdomen = body_data[["density", "abdomen"]]
density_and_abdomen.head(5)
Looking at the table, the rank 1 approximation seems at least vaguely sane.
density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
density_and_abdomen_rank_1_approximation.head(5)
But if we plot on 2D axes, we'll see that things are very wrong.
sns.scatterplot(x="density", y="abdomen", data=body_data)
density_and_abdomen_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen, 1)
sns.scatterplot(x="density", y="abdomen", data=body_data)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_rank_1_approximation);
Since the subspace that we're projecting onto is off and to the right, we end up with a bizarre result: our rank 1 approximation believes that density increases with abdomen size, even though the data shows the opposite.
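We can quantify the mismatch directly (a quick check using the variables above): the correlation between density and abdomen in the raw data is negative, while the 1D subspace used by the uncentered rank 1 approximation has a positive slope.
print(density_and_abdomen.corr().loc['density', 'abdomen'])       # correlation in the raw data
u_da, s_da, vt_da = np.linalg.svd(density_and_abdomen, full_matrices=False)
print(vt_da[0, 1] / vt_da[0, 0])                                  # slope of the uncentered 1D subspace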
To fix this issue, we should always start the SVD process by zero-centering our data. That is, for each column, we should subtract the mean of that column.
np.mean(density_and_abdomen, axis = 0)
density_and_abdomen_centered = density_and_abdomen - np.mean(density_and_abdomen, axis = 0)
density_and_abdomen_centered.head(5)
Now when we do the approximation, things work much better.
density_and_abdomen_centered_rank_1_approximation = compute_rank_k_approximation(density_and_abdomen_centered, 1)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered)
sns.scatterplot(x="density", y="abdomen", data=density_and_abdomen_centered_rank_1_approximation);
Let's revisit our child mortality and maternal fertility data from before.
sns.scatterplot(data = child_data, x = "mortality", y= "fertility")
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
Since we're going to be doing SVD, let's make sure to center our data first.
np.mean(child_data, axis = 0)
child_means = np.mean(child_data, axis = 0)
child_data_centered = child_data - child_means
sns.scatterplot(data = child_data_centered, x = "mortality", y= "fertility")
plt.xlim([-3, 11])
plt.ylim([-3, 11])
plt.xticks(np.arange(-3, 11, 2))
plt.yticks(np.arange(-3, 11, 2));
# plt.gcf().savefig("mortality_fertility_centered.png", dpi=300, bbox_inches="tight")
This ties in with the manual computation slides.
Now we'll finally turn to trying to understand what the principal components and the low-rank approximations actually mean!
Returning to our child mortality data from before: if we zero-center the child data, we get back a better result. Note that we have to add the mean of each column back in to get things into the right units.
means = np.mean(child_data, axis = 0)
child_data_centered = child_data - np.mean(child_data, axis = 0)
child_data_rank_1_approximation = compute_rank_k_approximation(child_data_centered, 1) + means
sns.scatterplot('mortality', 'fertility', data=child_data)
sns.scatterplot('mortality', 'fertility', data=child_data_rank_1_approximation)
plt.legend(['data', 'rank 1 approx'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
We can also draw our rank 1 approximation as a line, showing the 1D subspace (in black) that our data is being projected onto.
sns.scatterplot('mortality', 'fertility', data=child_data)
sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color='black')
plt.legend(['rank 1 approx', 'data']);
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
This plot probably brings to mind linear regression from Data 8. But PCA is NOT the same thing as linear regression. Let's plot the regression lines for this data for comparison. Recall that the regression line gives us, for example, the best possible linear prediction of fertility given mortality.
x, y = child_data['mortality'], child_data['fertility']
slope_x, intercept_x = np.polyfit(x, y, 1) # simple linear regression
scatter14(child_data)
plt.plot(x, slope_x * x + intercept_x, c = 'g')
for _, row in child_data.sample(20).iterrows():
tx, ty = row['mortality'], row['fertility']
plt.plot([tx, tx], [slope_x * tx + intercept_x, ty], c='red')
plt.legend(['predicted fertility']);
In the plot above, the green regression line minimizes the sum of the squared errors, shown as red vertical lines.
We could also do the opposite thing, and try to predict mortality from fertility.
x, y = child_data['mortality'], child_data['fertility']
slope_y, intercept_y = np.polyfit(y, x, 1) # simple linear regression
scatter14(child_data)
plt.plot(slope_y * y + intercept_y, y, c = 'purple')
for _, row in child_data.sample(20).iterrows():
tx, ty = row['mortality'], row['fertility']
plt.plot([tx, slope_y * ty + intercept_y], [ty, ty], c='red')
plt.legend(['predicted mortality']);
In the plot above, the purple regression line minimizes the sum of the squared errors, shown as red horizontal lines.
Plotting the two regression lines and the 1D subspace chosen by PCA, we see that all 3 are distinct!
sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple');
sns.scatterplot('mortality', 'fertility', data=child_data)
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality'])
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
Given that the green line minimizes the "vertical" error and the purple line minimizes the "horizontal" error, you might wonder what the black line minimizes. It turns out that it minimizes the "diagonal" error, i.e. the error in the direction perpendicular to the line itself.
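Here's a small numerical check of that claim, using the quantities defined above: the total squared perpendicular error left over by the rank 1 approximation equals the square of the second (discarded) singular value of the centered data.
_, s_centered, _ = np.linalg.svd(child_data_centered, full_matrices=False)
residual = child_data.to_numpy() - child_data_rank_1_approximation.to_numpy()
print(np.sum(residual**2), s_centered[1]**2)  # these two numbers should match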
sns.lineplot('mortality', 'fertility', data=child_data_rank_1_approximation, color="black")
plt.plot(x, slope_x * x + intercept_x, c = 'g')
plt.plot(slope_y * y + intercept_y, y, c = 'purple');
sns.scatterplot('mortality', 'fertility', data=child_data)
for idx, tdata in child_data.reset_index().sample(20).iterrows():
tx = tdata["mortality"]
ty = tdata["fertility"]
tx_projected = child_data_rank_1_approximation.iloc[idx, 0]
ty_projected = child_data_rank_1_approximation.iloc[idx, 1]
plt.plot([tx, tx_projected], [ty, ty_projected], c='red')
plt.xlim([0, 14])
plt.ylim([0, 14])
plt.xticks(np.arange(0, 14, 2))
plt.yticks(np.arange(0, 14, 2));
plt.legend(['rank 1 approx', 'predicted fertility', 'predicted mortality']);
The function in the following cell makes it easy to make similar plots for whatever dataset you might be interested in.
def plot_x_regression_y_regression_1d_approximation(data):
    xname = data.columns[0]
    yname = data.columns[1]
    x, y = data[xname], data[yname]
    slope_x, intercept_x = np.polyfit(x, y, 1)  # simple linear regression of y on x
    slope_y, intercept_y = np.polyfit(y, x, 1)  # simple linear regression of x on y
    means = np.mean(data, axis=0)
    rank_1_approximation = compute_rank_k_approximation(data - means, 1) + means
    sns.lineplot(x=xname, y=yname, data=rank_1_approximation, color="black")
    plt.plot(x, slope_x * x + intercept_x, c='g')
    plt.plot(slope_y * y + intercept_y, y, c='purple')
    sns.scatterplot(xname, yname, data=data)
    for idx, tdata in data.reset_index().sample(20).iterrows():
        tx = tdata[xname]
        ty = tdata[yname]
        tx_projected = rank_1_approximation.iloc[idx, 0]
        ty_projected = rank_1_approximation.iloc[idx, 1]
        plt.plot([tx, tx_projected], [ty, ty_projected], c='red')
    plt.legend(['1D PCA Subspace', 'predicted ' + yname, 'predicted ' + xname])
plot_x_regression_y_regression_1d_approximation(body_data.drop(41)[["adiposity", "bicep"]])
rectangle = pd.read_csv('rectangle_data.csv')
rectangle_centered = rectangle - np.mean(rectangle, axis = 0)
np.var(rectangle_centered)
sum(np.var(rectangle_centered))
u, s, vt = np.linalg.svd(rectangle_centered, full_matrices = False)
u[0:5, :]
np.diag(s)
s**2/rectangle_centered.shape[0]
sum(s**2/rectangle_centered.shape[0])
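These cells illustrate a useful identity: for zero-centered data, the total variance (the sum of the column variances) equals the sum of the squared singular values divided by the number of rows. A quick check using the quantities above:
np.allclose(sum(np.var(rectangle_centered)), sum(s**2 / rectangle_centered.shape[0]))  # expected: True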
Let's now step back and try to use PCA on our body measurement and congressional voting datasets.
body_data.head(5)
u, s, vt = np.linalg.svd(body_data, full_matrices = False)
We see that some of our singular values capture more variance than others.
s
Or we can compute the fraction of the variance captured by each principal component. The result seems shocking at first, as our data appears to be effectively rank 1.
np.round(s**2 / sum(s**2), 2)
This seems absurd, as there are clearly several variables that we expect to show significant variation independently of each other, e.g. weight, height, and age. If this happens to you, it's probably because you forgot to center your data!
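To see why the uncentered data looks rank 1, note that every column of the body data has a large positive mean, so the first right singular vector ends up pointing almost exactly in the direction of the column means. A quick check, using the uncentered vt from above:
mean_vector = np.mean(body_data, axis=0)
np.dot(vt[0, :], mean_vector / np.linalg.norm(mean_vector))  # should be very close to +1 or -1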
body_data_centered = body_data - np.mean(body_data, axis = 0)
body_data_centered.head(5)
u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)
This time, we see that the top singular value is no longer as dominant.
s
Looking now at the fraction of the variance captured by each principal component, we see that the top 2 or 3 principal components capture quite a lot of the variance.
np.round(s**2 / sum(s**2), 2)
We can also show this in the form of what is usually called a "scree plot".
plt.plot(s**2);
Thus, we expect that if we were to do a rank 3 approximation, we should get back data that's pretty close to where we started, as just those 3 dimensions capture 97% of the variance.
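We can make the 97% figure explicit by looking at the cumulative fraction of variance captured by the top principal components:
np.round(np.cumsum(s**2) / np.sum(s**2), 2)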
body_data_rank_3_approximation = compute_rank_k_approximation(body_data_centered, 3) + np.mean(body_data, axis = 0)
body_data_rank_3_approximation.head(5)
body_data.head(5)
One very interesting thing we can do is try to plot the principal components themselves. In this case, let's plot only the first two.
u, s, vt = np.linalg.svd(body_data_centered, full_matrices = False)
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
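As a sanity check on what `pcs` contains: the principal component scores $U \Sigma$ are the same thing as the centered data projected onto the right singular vectors (the rows of $V^T$).
np.allclose(pcs, body_data_centered.to_numpy() @ vt.T)  # expected: True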
np.argmax(pcs[:, 0])  # position of the most extreme observation along the first principal component
body_data.iloc[38, :]
from ds100_utils import fetch_and_cache
import yaml
from datetime import datetime
base_url = 'https://github.com/unitedstates/congress-legislators/raw/master/'
legislators_path = 'legislators-current.yaml'
f = fetch_and_cache(base_url + legislators_path, legislators_path)
legislators_data = yaml.safe_load(open(f))
def to_date(s):
return datetime.strptime(s, '%Y-%m-%d')
legs = pd.DataFrame(
columns=['leg_id', 'first', 'last', 'gender', 'state', 'chamber', 'party', 'birthday'],
data=[[x['id']['bioguide'],
x['name']['first'],
x['name']['last'],
x['bio']['gender'],
x['terms'][-1]['state'],
x['terms'][-1]['type'],
x['terms'][-1]['party'],
to_date(x['bio']['birthday'])] for x in legislators_data])
legs.head(3)
# February 2019 House of Representatives roll call votes
# Downloaded using https://github.com/eyeseast/propublica-congress
votes = pd.read_csv('votes.csv')
votes = votes.astype({"roll call": str})
votes.head()
def was_yes(s):
    # 1 if the member's (first recorded) vote on this roll call was 'Yes', 0 otherwise.
    if s.iloc[0] == 'Yes':
        return 1
    else:
        return 0
vote_pivot = votes.pivot_table(index='member',
columns='roll call',
values='vote',
aggfunc=was_yes,
fill_value=0)
print(vote_pivot.shape)
vote_pivot.head()
vote_pivot_centered = vote_pivot - np.mean(vote_pivot, axis = 0)
vote_pivot_centered.head(5)
u, s, vt = np.linalg.svd(vote_pivot_centered, full_matrices = False)
np.round(s**2 / sum(s**2), 2)
plt.plot(s**2)
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
vote2d = pd.DataFrame({
'member': vote_pivot.index,
'pc1': pcs[:, 0],
'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d[vote2d['pc1'] < 0]['party'].value_counts()
#top right only
vote2d.query('pc2 > -2 and pc1 > 0')['party'].value_counts()
sns.scatterplot(x="pc1", y="pc2", hue="party", data = vote2d);
# Add a small amount of random jitter so that overlapping members remain visible.
vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);
vote2d.head(5)
vote2d[vote2d['pc1'] > 0]['party'].value_counts()
vote2d[vote2d['pc2'] < -1]
df = votes[votes['member'].isin(vote2d[vote2d['pc2'] < -1]['member'])]
df.groupby(['member', 'vote']).size()
legs.query("leg_id == 'A000367'")
votes.head(5)
num_yes_or_no_votes_per_member = votes.query("vote == 'Yes' or vote == 'No'").groupby("member").size()
num_yes_or_no_votes_per_member.head(5)
vote_pivot_with_yes_no_count = vote_pivot.merge(num_yes_or_no_votes_per_member.to_frame(), left_index=True, right_index=True, how="outer").fillna(0)
vote_pivot_with_yes_no_count = vote_pivot_with_yes_no_count.rename(columns={0: 'yes_no_count'})
vote_pivot_with_yes_no_count.head(5)
regulars = vote_pivot_with_yes_no_count.query('yes_no_count >= 30')
regulars = regulars.drop(columns='yes_no_count')
regulars.shape
regulars_centered = regulars - np.mean(regulars, axis = 0)
regulars_centered.head(5)
u, s, vt = np.linalg.svd(regulars_centered, full_matrices = False)
np.round(s**2 / sum(s**2), 2)
pcs = u * s
sns.scatterplot(x=pcs[:, 0], y=pcs[:, 1]);
vote2d = pd.DataFrame({
'member': regulars_centered.index,
'pc1': pcs[:, 0],
'pc2': pcs[:, 1]
}).merge(legs, left_on='member', right_on='leg_id')
vote2d['pc1_jittered'] = vote2d['pc1'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
vote2d['pc2_jittered'] = vote2d['pc2'] + np.random.normal(loc = 0, scale = 0.1, size = vote2d.shape[0])
sns.scatterplot(x="pc1_jittered", y="pc2_jittered", hue="party", data = vote2d);
We can also look at $V^T$ directly to try to gain insight into why each component is the way it is.
num_votes = vt.shape[1]
vote_names = regulars.columns  # use a new name so we don't overwrite the `votes` DataFrame from earlier
def plot_pc(k):
    plt.bar(vote_names, vt[k, :], alpha=0.7)
    plt.xticks(vote_names, rotation=90)
with plt.rc_context({"figure.figsize": (12, 4)}):
plot_pc(0)
# plot_pc(1)
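For example, we can list the roll call votes whose loadings on the first principal component are largest in magnitude (a small sketch using `vt` and the columns of `regulars` from above):
pc1_loadings = pd.Series(vt[0, :], index=regulars.columns)
pc1_loadings.loc[pc1_loadings.abs().sort_values(ascending=False).index].head(10)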
Using the SVD for PCA can lead to a hard-to-interpret $V^T$ matrix for two reasons:
There exist other methods for doing PCA, e.g. Sparse PCA, that try to produce a more interpretable version of $V^T$.
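As a hedged sketch of what that might look like (assuming scikit-learn is available; the parameter choices here are illustrative, not from the lecture), Sparse PCA penalizes the loadings so that many of them are driven toward exactly zero:
from sklearn.decomposition import SparsePCA

sparse_pca = SparsePCA(n_components=2, random_state=23)
sparse_scores = sparse_pca.fit_transform(regulars_centered)
sparse_pca.components_  # plays the role of V^T, but with many entries pushed to zero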