Lecture 24 – Data 100, Spring 2024¶


In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import plotly.express as px
import seaborn as sns

Working with High Dimensional Data¶

In the following cells, we will use visualization tools to push as far as we can toward visualizing the MPG dataset in its high-dimensional space:

In [2]:
mpg = sns.load_dataset("mpg").dropna()
mpg.head()
Out[2]:
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
In [3]:
px.histogram(mpg, x="displacement")
In [4]:
px.scatter(mpg, x="displacement", y="horsepower")
In [5]:
fig = px.scatter_3d(mpg, x="displacement", y="horsepower", z="weight",
                    width=800, height=800)
fig.update_traces(marker=dict(size=3))
In [6]:
fig = px.scatter_3d(mpg, x="displacement", 
                    y="horsepower", 
                    z="weight", 
                    color="model_year",
                    width=800, height=800, 
                    opacity=.7)
fig.update_traces(marker=dict(size=5))
In [7]:
fig = px.scatter_3d(mpg, x="displacement", 
                    y="horsepower", 
                    z="weight", 
                    color="model_year",
                    size="mpg",
                    symbol="origin",
                    width=900, height=800, 
                    opacity=.7)
# hide color scale legend on the plotly fig
fig.update_layout(coloraxis_showscale=False)
In [8]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)

components = pca.fit_transform(mpg[["displacement", "horsepower", "weight", "model_year"]])
mpg[["z1", "z2"]] = components
mpg.head()
Out[8]:
mpg cylinders displacement horsepower weight acceleration model_year origin name z1 z2
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu 536.436716 50.754633
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320 730.333026 79.063105
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite 470.971839 75.338465
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst 466.393030 62.448249
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino 481.657494 55.643203
In [9]:
px.scatter(mpg, x="z1", y="z2", color="model_year", hover_data=["displacement", "horsepower", "weight", "name"])



Return to lecture.




Singular Value Decomposition¶

Singular value decomposition is a numerical technique to decompose a matrix into the product of three matrices. Given an input matrix $X$, SVD will return $U$, $S$, and $V^T$ such that $X = U S V^T$.

Check the documentation of np.linalg.svd. There are multiple versions of SVD; to get the version that we will follow, we need to set the full_matrices parameter to False.
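To see what the full_matrices flag changes, here is a minimal sketch on a small random matrix (not part of the lecture data): with full_matrices=True the $U$ and $V^T$ factors are square, while full_matrices=False returns the compact version whose shapes we use below.

A = np.random.randn(5, 3)  # small example matrix: 5 rows, 3 columns
U_full, S_full, Vt_full = np.linalg.svd(A, full_matrices=True)
U_thin, S_thin, Vt_thin = np.linalg.svd(A, full_matrices=False)
print(U_full.shape, S_full.shape, Vt_full.shape)   # (5, 5) (3,) (3, 3)
print(U_thin.shape, S_thin.shape, Vt_thin.shape)   # (5, 3) (3,) (3, 3)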

For PCA we will typically work with data that is already centered.

In [10]:
rectangle = pd.read_csv("data/rectangle_data.csv")
rectangle
Out[10]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
... ... ... ... ...
95 8 5 40 26
96 8 7 56 30
97 1 4 4 10
98 1 6 6 14
99 2 6 12 16

100 rows × 4 columns

In [11]:
px.scatter_3d(rectangle, x="width", y="height", z="area", 
              width=800, height=800)
In [12]:
X = rectangle - np.mean(rectangle, axis = 0)
X
Out[12]:
width height area perimeter
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
... ... ... ... ...
95 2.97 0.35 16.78 6.64
96 2.97 2.35 32.78 10.64
97 -4.03 -0.65 -19.22 -9.36
98 -4.03 1.35 -17.22 -5.36
99 -3.03 1.35 -11.22 -3.36

100 rows × 4 columns

In [13]:
U, S, Vt = np.linalg.svd(X, full_matrices = False)
In [14]:
print("Shape of U", U.shape)
print("Shape of S", S.shape)
print("Shape of Vt", Vt.shape)
Shape of U (100, 4)
Shape of S (4,)
Shape of Vt (4, 4)

$\Sigma$ is handled a little differently in NumPy. Since the only useful values in the diagonal matrix $\Sigma$ are the singular values on the diagonal, only those values are returned, stored in a 1-D array.

Our rectangle_data has a rank of $3$, since the perimeter column is a linear combination of the width and height columns (perimeter $= 2 \cdot \text{width} + 2 \cdot \text{height}$). So we should have 3 non-zero singular values, sorted from largest to smallest.
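As a quick sanity check (a small sketch; np.linalg.matrix_rank is not otherwise used in this lecture), we can confirm the rank of the centered data directly:

# The centered rectangle data should have rank 3, since perimeter is a
# linear combination of width and height.
print(np.linalg.matrix_rank(X.to_numpy()))  # expected: 3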

In [15]:
S
Out[15]:
array([1.97388075e+02, 2.74346257e+01, 2.32626119e+01, 8.53898299e-15])

Hmm, it looks like all four diagonal entries are non-zero. What happened?

It turns out there were some numerical rounding errors, but the last value is so small ($10^{-15}$) that it's practically $0$.

In [16]:
np.isclose(S[3], 0)
Out[16]:
True
In [17]:
S.round(5)
Out[17]:
array([197.38808,  27.43463,  23.26261,   0.     ])

If we want the diagonal elements:

In [18]:
Sm = np.diag(S)
Sm
Out[18]:
array([[1.97388075e+02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.74346257e+01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 2.32626119e+01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.53898299e-15]])

Examining U:

In [19]:
pd.DataFrame(U).head(5)
Out[19]:
0 1 2 3
0 -0.133910 0.005930 0.034734 0.168825
1 0.086354 -0.079515 0.014948 0.775872
2 0.117766 -0.128963 0.085774 -0.061736
3 -0.027274 0.183177 0.010895 -0.053764
4 -0.258806 -0.094295 0.090270 -0.026449

Finally, $V^{\top}$:

In [20]:
pd.DataFrame(Vt)
Out[20]:
0 1 2 3
0 -0.098631 -0.072956 -0.931226 -0.343173
1 0.668460 -0.374186 -0.258375 0.588548
2 0.314625 -0.640483 0.257023 -0.651715
3 0.666667 0.666667 0.000000 -0.333333
In [21]:
Vt.shape
Out[21]:
(4, 4)

To check that this SVD is a valid decomposition, we can reverse it.

In [22]:
display(pd.DataFrame(U @ Sm @ Vt).head(5))
display(pd.DataFrame(X).head(5))
0 1 2 3
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
width height area perimeter
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
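Rather than eyeballing the two tables, we can also verify the reconstruction numerically (a minimal sketch using np.allclose):

# U @ Sm @ Vt should reproduce the centered data X up to floating point error.
print(np.allclose(U @ Sm @ Vt, X))  # expected: True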

PCA with SVD¶

Step 1: Center the Data Matrix $X$¶

In [23]:
X = rectangle - np.mean(rectangle, axis = 0)
X.head(10)
Out[23]:
width height area perimeter
0 2.97 1.35 24.78 8.64
1 -3.03 -0.65 -15.22 -7.36
2 -4.03 -1.65 -20.22 -11.36
3 3.97 -1.65 3.78 4.64
4 3.97 3.35 48.78 14.64
5 -2.03 -3.65 -20.22 -11.36
6 -1.03 -2.65 -15.22 -7.36
7 0.97 0.35 6.78 2.64
8 1.97 -3.65 -16.22 -3.36
9 2.97 -2.65 -7.22 0.64

In some situations, where the columns are measured in different units or on very different scales, it is useful to normalize the data before performing SVD. This can be done by dividing each column by its standard deviation.

In [24]:
Xstd = X / np.std(X, axis = 0)
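Equivalently, scikit-learn's StandardScaler performs the same centering and scaling in one step. A small sketch of the comparison (it should match to floating point precision, since StandardScaler also divides by the population standard deviation):

from sklearn.preprocessing import StandardScaler

# Center each column and divide by its standard deviation, as done manually above.
scaled = StandardScaler().fit_transform(rectangle)
print(np.allclose(scaled, Xstd))  # expected: True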

Step 2: Get the SVD of centered $X$¶

In [25]:
U, S, Vt = np.linalg.svd(X, full_matrices = False)

Examining the singular values:

In [26]:
pd.DataFrame(np.diag(S))
Out[26]:
0 1 2 3
0 197.388075 0.000000 0.000000 0.000000e+00
1 0.000000 27.434626 0.000000 0.000000e+00
2 0.000000 0.000000 23.262612 0.000000e+00
3 0.000000 0.000000 0.000000 8.538983e-15

Computing the contribution to the total variance:

In [27]:
pd.DataFrame(np.round(S**2 / X.shape[0], 2))
Out[27]:
0
0 389.62
1 7.53
2 5.41
3 0.00

Much of the variance is in the first dimension. This is likely because the area column takes on much larger values than the other columns and therefore dominates the variance. Let's examine the standardized version.

In [28]:
U, S, Vt = np.linalg.svd(Xstd, full_matrices = False)
In [29]:
pd.DataFrame(np.round(S**2 / X.shape[0], 2))
Out[29]:
0
0 2.89
1 1.03
2 0.09
3 0.00

Now we see that most of the variance is in the first two dimensions, which makes sense since rectangles are largely described by two numbers: their width and height.
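We can make this precise by looking at the fraction of the total variance captured by each component (a small sketch; this is the same quantity scikit-learn reports as explained_variance_ratio_):

# Fraction of total variance captured by each principal component of the standardized data.
variance_ratio = S**2 / np.sum(S**2)
print(np.round(variance_ratio, 3))
print(np.round(variance_ratio[:2].sum(), 3))  # share captured by the first two components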

Step 3: Computing Approximations to the Data¶

Let's try to approximate this data in two dimensions.

Using $Z = U S$¶

In [30]:
Z = U[:, :2] @ np.diag(S[:2])
pd.DataFrame(Z).head()
Out[30]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623

Using $Z = X V$ (here with the standardized $X$)¶

In [31]:
Z = Xstd.to_numpy() @ Vt.T[:,:2]
pd.DataFrame(Z).head()
Out[31]:
0 1
0 -2.165116 0.250465
1 1.659346 -0.502647
2 2.462645 -0.416318
3 -0.856182 1.483758
4 -3.884287 -0.182623
In [32]:
px.scatter(x=Z[:, 0], y=Z[:, 1])
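Both constructions yield the same principal components, since $X V = U S V^T V = U S$. A minimal check:

# The two formulas Z = U S and Z = X V agree (up to floating point error).
Z_from_U = U[:, :2] @ np.diag(S[:2])
Z_from_X = Xstd.to_numpy() @ Vt.T[:, :2]
print(np.allclose(Z_from_U, Z_from_X))  # expected: True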

Comparing to scikit-learn:

In [33]:
pca = PCA(2)
pd.DataFrame(pca.fit_transform(rectangle)).head(5)
Out[33]:
0 1
0 26.432217 0.162686
1 -17.045285 -2.181451
2 -23.245695 -3.538040
3 5.383546 5.025395
4 51.085217 -2.586948
In [34]:
pd.DataFrame(pca.fit_transform(X)).head(5)
Out[34]:
0 1
0 26.432217 0.162686
1 -17.045285 -2.181451
2 -23.245695 -3.538040
3 5.383546 5.025395
4 51.085217 -2.586948
In [35]:
pd.DataFrame(pca.fit_transform(Xstd)).head(5)
Out[35]:
0 1
0 2.165116 0.250465
1 -1.659346 -0.502647
2 -2.462645 -0.416318
3 0.856182 1.483758
4 3.884287 -0.182623
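Notice that scikit-learn's first component is the negation of ours; principal component directions are only determined up to sign. A small sketch checking that the two transforms agree in magnitude:

# Compare magnitudes, since each component is only determined up to a sign flip.
Z_sklearn = pca.fit_transform(Xstd)
print(np.allclose(np.abs(Z_sklearn), np.abs(Z)))  # expected: True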

Also notice that the covariance matrix of the transformed data is (approximately) diagonal.

In [36]:
pd.DataFrame(np.cov(Z.T))
Out[36]:
0 1
0 2.915514e+00 -3.700743e-16
1 -3.700743e-16 1.037467e+00

Lower Rank Approximation of X¶

Let's now try to recover X from our approximation:

In [37]:
rectangle.head()
Out[37]:
width height area perimeter
0 8 6 48 28
1 2 4 8 12
2 1 3 3 8
3 9 3 27 24
4 9 8 72 34
In [38]:
k = 2  # rank of the approximation
U, S, Vt = np.linalg.svd(Xstd, full_matrices = False)
# Undo the standardization when reconstructing; swap in the identity
# (commented line below) to skip the rescaling.
scaling = np.diag(np.std(X, axis = 0))
# scaling = np.eye(X.shape[1])
Z = U[:,:k] @ np.diag(S[:k])  # project onto the first k principal components

# Reconstruct: map back to feature space, rescale, and re-add the column means.
rectangle_hat = pd.DataFrame(
    (Z @ Vt[:k, :]) @ scaling + np.mean(rectangle, axis = 0).to_numpy(),
    columns = rectangle.columns)

display(rectangle_hat.head(3))

fig = px.scatter_3d(rectangle, x="width", y="height", z="area",
                    width=800, height=800)
fig.add_scatter3d(x=rectangle_hat["width"], y=rectangle_hat["height"], z=rectangle_hat["area"], 
                  mode="markers", name = "approximation")
width height area perimeter
0 8.111911 6.093164 45.859191 28.410150
1 2.103964 4.086549 6.011204 12.381026
2 1.285956 3.238055 -2.470229 9.048021
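One way to quantify how good the rank-2 approximation is (a small sketch) is the per-column root mean squared reconstruction error:

# Root mean squared error between the original data and its rank-2 reconstruction, per column.
rmse = np.sqrt(((rectangle - rectangle_hat) ** 2).mean())
print(rmse.round(3))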