import pandas as pd
import numpy as np
import plotly.express as px

# Read the 2-D sample points and preview the first three rows.
df = pd.read_csv("data/2d.csv")
df.head(n=3)

# Both axes get the same look: a fixed [-10, 10] window with a bold black
# line through zero, so the origin is easy to spot.
axis_style = dict(range=[-10, 10], zeroline=True, zerolinewidth=2, zerolinecolor='black')

# Scatter of the raw x/y values on a square 700x700 canvas.
fig = px.scatter(data_frame=df, x='x', y='y', title='2D Data', width=700, height=700)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Center the data: subtract each column's mean from that column so every
# column of centered_df averages to zero.
centered_df = df.sub(df.mean(axis=0), axis='columns')

# Same scatter as before, now of the centered points. The axis updates are
# chained (each returns the figure), keeping the window and zero-line styling
# identical on both axes.
fig = px.scatter(data_frame=centered_df, x='x', y='y', title='2D Data', width=700, height=700)
fig.update_xaxes(
    range=[-10, 10], zeroline=True, zerolinewidth=2, zerolinecolor='black'
).update_yaxes(
    range=[-10, 10], zeroline=True, zerolinewidth=2, zerolinecolor='black'
)

# PCA via the singular value decomposition of the centered data.
#
# Decompose only the original feature columns. The projection below appends
# z1/z2 to centered_df, so running the SVD on the whole frame again (e.g. when
# this cell is re-executed) would factor four columns, return a 4x4 Vt, and
# make the 2-column projection fail with a shape mismatch.
centered_xy = centered_df[['x', 'y']]
U, S, Vt = np.linalg.svd(centered_xy, full_matrices=False)

# Project the centered points onto the right singular vectors (the rows of Vt)
# to get their coordinates along the principal directions.
# Because centered_xy = U @ diag(S) @ Vt, the product centered_xy @ Vt.T is
# the same matrix as U @ np.diag(S).
centered_df[["z1", "z2"]] = centered_xy @ Vt.T
centered_df.head(3)

# Plot the data in principal-component coordinates (z1 vs. z2), using the
# same canvas size, [-10, 10] window, and bold black zero lines as the
# earlier scatters.
pc_axis_style = {
    "range": [-10, 10],
    "zeroline": True,
    "zerolinewidth": 2,
    "zerolinecolor": 'black',
}
fig = px.scatter(data_frame=centered_df, x='z1', y='z2', title='2D Data', width=700, height=700)
fig.update_xaxes(**pc_axis_style)
fig.update_yaxes(**pc_axis_style)

	x	y
0	2.311043	5.436627
1	2.951447	6.093710
2	2.628517	6.776799

	x	y	z1	z2
0	-0.782371	-1.708284	-1.878825	0.018793
1	-0.141967	-1.051201	-1.017886	-0.298473
2	-0.464897	-0.368111	-0.525540	0.274668

Lecture 25 – Data 100, Spring 2024

PCA is a Linear Transformation