Data 100, Summer 2023
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(23) #kallisti
plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
sns.set()
rectangle = pd.read_csv("data/rectangle_data.csv")
rectangle.head(5)
| | width | height | area | perimeter |
|---|---|---|---|---|
0 | 8 | 6 | 48 | 28 |
1 | 2 | 4 | 8 | 12 |
2 | 1 | 3 | 3 | 8 |
3 | 9 | 3 | 27 | 24 |
4 | 9 | 8 | 72 | 34 |
Singular value decomposition (SVD) is a numerical technique for automatically decomposing a matrix into three matrices. Given an input matrix $X$, SVD returns $U$, $\Sigma$, and $V^{\top}$ such that $X = U \Sigma V^{\top}$.
Check the documentation of np.linalg.svd in the NumPy reference. There are multiple versions of SVD; to get the version that we will follow, we need to set the full_matrices parameter to False.
# Compact SVD: U is 100x4, S is a length-4 array, Vt is 4x4
U, S, Vt = np.linalg.svd(rectangle, full_matrices=False)
pd.DataFrame(U).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -0.155151 | 0.064830 | -0.029935 | 0.833096 |
1 | -0.038370 | -0.089155 | 0.062019 | -0.454464 |
2 | -0.020357 | -0.081138 | 0.058997 | 0.004259 |
3 | -0.101519 | -0.076203 | -0.148160 | -0.019153 |
4 | -0.218973 | 0.206423 | 0.007274 | -0.062668 |
U.shape
(100, 4)
$\Sigma$ is a little different in NumPy. Since the only useful values in the diagonal matrix $\Sigma$ are the singular values on its diagonal, only those values are returned, stored in a 1-D array.
Our rectangle_data has a rank of $3$, so we should have 3 non-zero singular values, sorted from largest to smallest.
S
array([3.62932568e+02, 6.29904732e+01, 2.56544651e+01, 7.67147618e-15])
Hmm, it looks like all four diagonal entries are non-zero. What happened?
It turns out there were some numerical rounding errors, but the last value is so small (on the order of $10^{-15}$) that it's practically $0$.
np.isclose(S[3], 0)
True
np.round(S)
array([363., 63., 26., 0.])
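If we ever need $\Sigma$ as an actual diagonal matrix, np.diag rebuilds it from the 1-D array:
# Rebuild the full 4x4 diagonal matrix Sigma from the array of singular values
np.diag(S)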
Finally, $V^{\top}$:
pd.DataFrame(Vt)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -0.146436 | -0.129942 | -0.810020 | -0.552756 |
1 | -0.192736 | -0.189128 | 0.586348 | -0.763727 |
2 | -0.704957 | 0.709155 | 0.007952 | 0.008396 |
3 | -0.666667 | -0.666667 | 0.000000 | 0.333333 |
Vt.shape
(4, 4)
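As a quick sanity check (our own addition), the columns of $U$ and the rows of $V^{\top}$ should be orthonormal:
# U has orthonormal columns, so U^T U = I
print(np.allclose(U.T @ U, np.eye(4)))
# Vt has orthonormal rows, so Vt Vt^T = I
print(np.allclose(Vt @ Vt.T, np.eye(4)))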
To check that this SVD is a valid decomposition, we can reverse it and see if we get our original data back.
# U * S broadcasts the singular values across the columns of U,
# which is equivalent to U @ np.diag(S)
pd.DataFrame(U * S @ Vt).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 8.0 | 6.0 | 48.0 | 28.0 |
1 | 2.0 | 4.0 | 8.0 | 12.0 |
2 | 1.0 | 3.0 | 3.0 | 8.0 |
3 | 9.0 | 3.0 | 27.0 | 24.0 |
4 | 9.0 | 8.0 | 72.0 | 34.0 |
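Rather than comparing tables by eye, we can confirm the match numerically:
# The reconstruction should equal the original data up to floating-point error
np.allclose(U * S @ Vt, rectangle)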
Next, let's center the data by subtracting each column's mean; centered data is what we need to compute principal components.
centered_df = rectangle - np.mean(rectangle, axis=0)
centered_df.head(5)
| | width | height | area | perimeter |
|---|---|---|---|---|
0 | 2.97 | 1.35 | 24.78 | 8.64 |
1 | -3.03 | -0.65 | -15.22 | -7.36 |
2 | -4.03 | -1.65 | -20.22 | -11.36 |
3 | 3.97 | -1.65 | 3.78 | 4.64 |
4 | 3.97 | 3.35 | 48.78 | 14.64 |
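Sanity check (our addition): every column of the centered data should now have mean $0$.
# Column means should all be (numerically) zero after centering
np.allclose(centered_df.mean(axis=0), 0)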
Now take the SVD of the centered data:
U, S, Vt = np.linalg.svd(centered_df, full_matrices=False)
$U$:
U.shape
(100, 4)
pd.DataFrame(U).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -0.133910 | 0.005930 | 0.034734 | -0.296836 |
1 | 0.086354 | -0.079515 | 0.014948 | 0.711478 |
2 | 0.117766 | -0.128963 | 0.085774 | -0.065318 |
3 | -0.027274 | 0.183177 | 0.010895 | -0.031055 |
4 | -0.258806 | -0.094295 | 0.090270 | -0.032818 |
$\Sigma$:
pd.DataFrame(np.diag(np.round(S, 1)))
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 197.4 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 27.4 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 23.3 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 |
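The singular values of the centered data also tell us how much variance each principal component captures: component $i$ accounts for $\sigma_i^2 / n$ of the total variance. A quick check (our own addition):
n = centered_df.shape[0]
# Variance captured by each principal component
component_variances = S**2 / n
print(component_variances)
# Sanity check: the component variances sum to the total variance of the data
print(np.allclose(component_variances.sum(), centered_df.var(ddof=0).sum()))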
$V^{\top}$:
pd.DataFrame(Vt)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -0.098631 | -0.072956 | -0.931226 | -0.343173 |
1 | 0.668460 | -0.374186 | -0.258375 | 0.588548 |
2 | 0.314625 | -0.640483 | 0.257023 | -0.651715 |
3 | 0.666667 | 0.666667 | 0.000000 | -0.333333 |
The principal components of $X$ can be computed either as $U\Sigma$ or as $XV$; let's verify that both give the same result. Try $U\Sigma$ first:
pd.DataFrame(U @ np.diag(S)).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -26.432217 | 0.162686 | 0.807998 | -2.738093e-15 |
1 | 17.045285 | -2.181451 | 0.347732 | 6.562857e-15 |
2 | 23.245695 | -3.538040 | 1.995334 | -6.025133e-16 |
3 | -5.383546 | 5.025395 | 0.253448 | -2.864630e-16 |
4 | -51.085217 | -2.586948 | 2.099919 | -3.027184e-16 |
Then $XV$:
# V is the transpose of Vt, so XV = centered_df @ Vt.T
pd.DataFrame(centered_df @ Vt.T).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | -26.432217 | 0.162686 | 0.807998 | -2.978358e-15 |
1 | 17.045285 | -2.181451 | 0.347732 | 1.462534e-15 |
2 | 23.245695 | -3.538040 | 1.995334 | 2.350712e-15 |
3 | -5.383546 | 5.025395 | 0.253448 | -1.868135e-15 |
4 | -51.085217 | -2.586948 | 2.099919 | -4.088581e-15 |
We can see these are the same; the tiny values in the last column are just floating-point noise.
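We can also confirm this programmatically:
# U @ diag(S) and X @ V agree up to floating-point error
np.allclose(U @ np.diag(S), centered_df @ Vt.T)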
Let's get the first two principal components of $X$:
two_PCs = (U @ np.diag(S))[:, :2]
pd.DataFrame(two_PCs)
| | 0 | 1 |
|---|---|---|
0 | -26.432217 | 0.162686 |
1 | 17.045285 | -2.181451 |
2 | 23.245695 | -3.538040 |
3 | -5.383546 | 5.025395 |
4 | -51.085217 | -2.586948 |
... | ... | ... |
95 | -18.223108 | 1.426779 |
96 | -34.641325 | -1.101407 |
97 | 21.555166 | -2.993505 |
98 | 18.174109 | -1.904436 |
99 | 11.801777 | -1.609133 |
100 rows × 2 columns
Using $XV$:
(centered_df @ Vt.T).iloc[:, :2]
| | 0 | 1 |
|---|---|---|
0 | -26.432217 | 0.162686 |
1 | 17.045285 | -2.181451 |
2 | 23.245695 | -3.538040 |
3 | -5.383546 | 5.025395 |
4 | -51.085217 | -2.586948 |
... | ... | ... |
95 | -18.223108 | 1.426779 |
96 | -34.641325 | -1.101407 |
97 | 21.555166 | -2.993505 |
98 | 18.174109 | -1.904436 |
99 | 11.801777 | -1.609133 |
100 rows × 2 columns
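As a quick visual (our own addition, using the matplotlib imported at the top), we can plot the data in this two-dimensional principal-component space:
# Scatter plot of the data projected onto the first two principal components
plt.scatter(two_PCs[:, 0], two_PCs[:, 1])
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.title('Rectangle data in principal-component space')
plt.show()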
Remember we said in the first part of the demo that we can recover $X$ by multiplying $U$, $\Sigma$, and $V^{\top}$? The same works here, except now we recover the centered version of $X$:
pd.DataFrame(U * S @ Vt).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 2.97 | 1.35 | 24.78 | 8.64 |
1 | -3.03 | -0.65 | -15.22 | -7.36 |
2 | -4.03 | -1.65 | -20.22 | -11.36 |
3 | 3.97 | -1.65 | 3.78 | 4.64 |
4 | 3.97 | 3.35 | 48.78 | 14.64 |
Compared to the centered $X$:
centered_df.head(5)
| | width | height | area | perimeter |
|---|---|---|---|---|
0 | 2.97 | 1.35 | 24.78 | 8.64 |
1 | -3.03 | -0.65 | -15.22 | -7.36 |
2 | -4.03 | -1.65 | -20.22 | -11.36 |
3 | 3.97 | -1.65 | 3.78 | 4.64 |
4 | 3.97 | 3.35 | 48.78 | 14.64 |
In fact, since the matrix is only rank $3$, we can recover $X$ using only the first three columns of $U$, the first three entries of $\Sigma$, and the first three rows of $V^{\top}$.
pd.DataFrame(U[:, 0:3] * S[0:3] @ Vt[0:3, :]).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 2.97 | 1.35 | 24.78 | 8.64 |
1 | -3.03 | -0.65 | -15.22 | -7.36 |
2 | -4.03 | -1.65 | -20.22 | -11.36 |
3 | 3.97 | -1.65 | 3.78 | 4.64 |
4 | 3.97 | 3.35 | 48.78 | 14.64 |
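Numerically, this rank-3 reconstruction is exact up to floating-point error:
# Three components fully reconstruct the centered data
np.allclose(U[:, 0:3] * S[0:3] @ Vt[0:3, :], centered_df)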
This shows the last column of $U$ is redundant. This is also why in some other formulations of SVD, $U$ is $n \times r$ instead of $n \times d$.
If we use even fewer columns, we will get a lower-rank approximation of $X$.
Below is a rank-2 approximation of $X$. We can see it's pretty close to the centered $X$:
pd.DataFrame(U[:, 0:2] * S[0:2] @ Vt[0:2, :]).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 2.715784 | 1.867508 | 24.572326 | 9.166584 |
1 | -3.139405 | -0.427284 | -15.309375 | -7.133378 |
2 | -4.657782 | -0.372023 | -20.732847 | -10.059611 |
3 | 3.890259 | -1.487671 | 3.714858 | 4.805176 |
4 | 3.309313 | 4.694962 | 48.240272 | 16.008549 |
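How close? By the Eckart–Young theorem, the Frobenius-norm error of the best rank-2 approximation equals $\sqrt{\sigma_3^2 + \sigma_4^2}$, which here is essentially $\sigma_3$ since $\sigma_4 \approx 0$. A quick check (variable names are our own):
rank_2 = U[:, 0:2] * S[0:2] @ Vt[0:2, :]
# Frobenius-norm error of the rank-2 approximation...
error = np.linalg.norm(centered_df.to_numpy() - rank_2)
# ...should match the largest singular value we dropped
print(error, S[2])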
We can even get a rank 1 approximation:
pd.DataFrame(U[:, 0:1] * S[0:1] @ Vt[0:1, :]).head(5)
| | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
0 | 2.607034 | 1.928383 | 24.614360 | 9.070835 |
1 | -1.681193 | -1.243552 | -15.873008 | -5.849490 |
2 | -2.292745 | -1.695908 | -21.646989 | -7.977306 |
3 | 0.530984 | 0.392761 | 5.013297 | 1.847490 |
4 | 5.038583 | 3.726962 | 47.571869 | 17.531091 |
This matrix is only rank $1$, but it's still fairly close to our centered matrix $X$.
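To wrap up, here is a small helper (our own sketch, not part of the original demo) that builds the rank-$k$ approximation for any $k$ and reports its relative reconstruction error:
def rank_k_approx(U, S, Vt, k):
    """Best rank-k approximation from a precomputed SVD."""
    return U[:, :k] * S[:k] @ Vt[:k, :]

X = centered_df.to_numpy()
for k in range(1, 5):
    approx = rank_k_approx(U, S, Vt, k)
    # Relative Frobenius-norm error of the rank-k approximation
    rel_error = np.linalg.norm(X - approx) / np.linalg.norm(X)
    print(f"rank {k}: relative error = {rel_error:.6f}")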