Raguvir Kunani and Isaac Schmidt, Summer 2021
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
Let's import our data and see what we have.
grades = pd.read_csv('grades.csv')
grades.head(5)
| | Lab Total | Discussion Total | Homework Total | Midterm | Final |
|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 0.987646 | 0.896607 | 0.631746 |
| 1 | 1.0 | 1.0 | 0.817488 | 0.868057 | 0.770000 |
| 2 | 1.0 | 1.0 | 0.899066 | 0.678242 | 0.505476 |
| 3 | 1.0 | 1.0 | 1.000000 | 0.888889 | 0.964444 |
| 4 | 1.0 | 1.0 | 0.976408 | 0.912037 | 0.722381 |
grades.shape
(787, 5)
The first thing we need to do is center our data. We could standardize as well, but as each column is on roughly the same scale, we will not do so here.
means = np.mean(grades, axis = 0)
means
Lab Total           0.988739
Discussion Total    0.965216
Homework Total      0.907690
Midterm             0.735748
Final               0.603616
dtype: float64
grades_centered = grades - means
grades_centered.head()
| | Lab Total | Discussion Total | Homework Total | Midterm | Final |
|---|---|---|---|---|---|
| 0 | 0.011261 | 0.034784 | 0.079956 | 0.160860 | 0.028130 |
| 1 | 0.011261 | 0.034784 | -0.090202 | 0.132310 | 0.166384 |
| 2 | 0.011261 | 0.034784 | -0.008624 | -0.057506 | -0.098140 |
| 3 | 0.011261 | 0.034784 | 0.092310 | 0.153141 | 0.360829 |
| 4 | 0.011261 | 0.034784 | 0.068718 | 0.176289 | 0.118765 |
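As an aside, standardizing (mentioned above) would just mean also dividing each centered column by its standard deviation. Here is a minimal sketch of what that would look like; the name grades_standardized is just for illustration and is not used in the rest of this notebook.
# Hypothetical standardization (not used below): give every column unit variance
grades_standardized = grades_centered / np.std(grades, axis = 0)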
X = grades_centered[['Midterm', 'Final']].to_numpy()
X
array([[ 0.16085973,  0.02813011],
       [ 0.13230973,  0.16638408],
       [-0.05750601, -0.09813973],
       ...,
       [ 0.18709399,  0.25368567],
       [ 0.05977917,  0.212019  ],
       [ 0.05283473,  0.03813011]])
Let's plot our data.
plt.figure(figsize = (4.5, 4), dpi = 100)
ax = sns.scatterplot(x = X[:, 0], y = X[:, 1], s = 5, alpha = 1)
ax.set_aspect('equal')
ax.set_xlabel('Midterm Exam')
ax.set_ylabel('Final Exam')
ax.set_ylim(-.55, .55)
ax.set_xlim(-.55, .55);
Let's calculate the covariance matrix for these two columns. Notice how $\frac{1}{n} X^T X$ returns the same matrix as np.cov (with ddof = 0).
(X.T @ X) / len(X)
array([[0.02159435, 0.02052681],
       [0.02052681, 0.03665306]])
cov = np.cov(X, rowvar = False, ddof = 0)
cov
array([[0.02159435, 0.02052681],
       [0.02052681, 0.03665306]])
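As a quick sanity check, the two computations agree to within floating-point error:
# (1/n) X^T X matches np.cov with ddof = 0
np.allclose((X.T @ X) / len(X), cov)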
Now, let's determine the eigenvalues and eigenvectors of this matrix. We'll use np.linalg.eigh, which is a faster implementation than np.linalg.eig for symmetric matrices (which covariance matrices always are).
eigenvalues, eigenvectors = np.linalg.eigh(cov)
eigenvalues
array([0.00725956, 0.05098785])
eigenvectors
array([[-0.81986889,  0.57255131],
       [ 0.57255131,  0.81986889]])
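For comparison, np.linalg.eig would give the same eigenvalue/eigenvector pairs for this symmetric matrix, but it does not guarantee any particular ordering, whereas np.linalg.eigh returns eigenvalues in ascending order. A quick check (the names eig_vals and eig_vecs are just for this sketch):
# np.linalg.eig also works here, but may order its results differently
eig_vals, eig_vecs = np.linalg.eig(cov)
np.allclose(np.sort(eig_vals), eigenvalues)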
Now, we can plot the eigenvectors, each scaled by its corresponding eigenvalue. Note that we've also scaled up both eigenvectors by the same constant, so they are more readable on the plot.
scale_eigenvalues = 7.5
plt.figure(figsize = (4.5, 4), dpi = 100)
ax = sns.scatterplot(x = X[:, 0], y = X[:, 1], s = 5, alpha = 1)
# np.linalg.eigh returns the eigenvectors as the columns of its second output,
# so the eigenvector paired with eigenvalues[i] is eigenvectors[:, i]
ax.arrow(0, 0, scale_eigenvalues*eigenvalues[1] * eigenvectors[0, 1], scale_eigenvalues*eigenvalues[1] * eigenvectors[1, 1], head_width = .02, lw = 2, color = 'black')
ax.arrow(0, 0, scale_eigenvalues*eigenvalues[0] * eigenvectors[0, 0], scale_eigenvalues*eigenvalues[0] * eigenvectors[1, 0], head_width = .02, lw = 2, color = 'black')
ax.set_aspect('equal')
ax.set_xlabel('Midterm Exam')
ax.set_ylabel('Final Exam')
ax.set_ylim(-.55, .55)
ax.set_xlim(-.55, .55);
import plotly.express as px

alphas = np.arange(0, 360)
dfs = []
# For each angle alpha (in degrees), project the centered exam scores onto the
# unit vector pointing in that direction, and record the resulting 1D "scores".
for alpha in alphas:
    proj_vec = np.array([np.cos(alpha * np.pi / 180), np.sin(alpha * np.pi / 180)])
    proj_vals = X @ proj_vec  # dividing by np.linalg.norm(proj_vec) would be redundant, since proj_vec is a unit vector
    dfs.append(pd.DataFrame(data={
        'Midterm': grades_centered['Midterm'],
        'Final': grades_centered['Final'],
        'Alpha': [alpha] * len(grades),
        'Score': proj_vals
    }))
scatter_df = pd.concat(dfs)
px.histogram(scatter_df,
             x = 'Score',
             animation_frame = 'Alpha',
             histnorm = 'probability density',
             range_x = [-0.75, 0.75],
             range_y = [0, 5], nbins = 30,
             title = 'Distribution of Scores for Different Linear Combinations')
We'll use np.linalg.svd to calculate the SVD of our centered matrix $X$.
u, s, vt = np.linalg.svd(X, full_matrices = False)
u.shape, s.shape, vt.shape
((787, 2), (2,), (2, 2))
u
array([[-0.01818   ,  0.04843769],
       [-0.03349327,  0.00552797],
       [ 0.01789954,  0.00378314],
       ...,
       [-0.04974407,  0.00340738],
       [-0.03284401, -0.03028169],
       [-0.00971049,  0.00898908]])
s
array([6.33462233, 2.39024517])
vt
array([[-0.57255131, -0.81986889],
       [ 0.81986889, -0.57255131]])
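As a quick check, multiplying the three factors back together recovers our centered matrix:
# With full_matrices = False, the factors reconstruct X (up to floating-point error)
np.allclose(u @ np.diag(s) @ vt, X)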
Looks like s is an array, not a diagonal matrix. We can correct that with the following trick:
s = np.diag(s)
s
array([[6.33462233, 0.        ],
       [0.        , 2.39024517]])
We said in lecture that the singular values are the square roots of the eigenvalues of $X^T X$. Let's verify that. (Note that np.linalg.eigh returns eigenvalues in ascending order, while np.linalg.svd returns singular values in descending order, so the two arrays below are reversed relative to one another.)
XTX_eigenvalues = np.linalg.eigh(X.T @ X)[0]
np.sqrt(XTX_eigenvalues)
array([2.39024517, 6.33462233])
s
array([[6.33462233, 0.        ],
       [0.        , 2.39024517]])
Also notice that the eigenvalues of $X^T X$ are equal to the eigenvalues of the covariance matrix, multiplied by $n$:
eigenvalues, eigenvectors = np.linalg.eigh(cov)
eigenvalues * len(X), XTX_eigenvalues
(array([ 5.71327195, 40.12744009]), array([ 5.71327195, 40.12744009]))
Let's reduce the midterm and final exam results down to a single score, as we did in lecture. To do this, we multiply our centered $X$ by the first row of $V^T$ (equivalently, the first column of $V$):
scores = X @ vt[0, :]
ax = sns.rugplot(scores, height = 1, linewidth = .2, alpha = .5)
ax.set_xlim(-2/3, 2/3);
If we were instead to use the second row of $V^T$, our scores would not be as variable.
ax = sns.rugplot(X @ vt[1, :], height = 1, linewidth = .2, alpha = .5)
ax.set_xlim(-2/3, 2/3);
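To quantify that difference, we can compute the variance of each set of scores; each one equals the corresponding eigenvalue of the covariance matrix (about 0.051 for the first direction versus about 0.007 for the second).
# The variance of each projection equals the corresponding eigenvalue of the
# covariance matrix (equivalently, the squared singular value divided by n)
np.var(X @ vt[0, :]), np.var(X @ vt[1, :])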
We can look at the full $XV$ matrix:
XV = (X @ vt.T)
XV
array([[-0.11516345,  0.11577796],
       [-0.21216724,  0.01321321],
       [ 0.11338685,  0.00904264],
       ...,
       [-0.31510989,  0.00814448],
       [-0.20805443, -0.07238067],
       [-0.06151229,  0.02148611]])
XV.shape, X.shape
((787, 2), (787, 2))
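Since $X = U \Sigma V^T$ and $V$ is orthogonal, $XV = U\Sigma$. We can confirm this with the matrices we already have (recall that s is now the diagonal matrix of singular values):
# XV equals U times the diagonal matrix of singular values
np.allclose(XV, u @ s)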
Let's perform PCA on our full grades dataset.
grades_centered.head()
| | Lab Total | Discussion Total | Homework Total | Midterm | Final |
|---|---|---|---|---|---|
| 0 | 0.011261 | 0.034784 | 0.079956 | 0.160860 | 0.028130 |
| 1 | 0.011261 | 0.034784 | -0.090202 | 0.132310 | 0.166384 |
| 2 | 0.011261 | 0.034784 | -0.008624 | -0.057506 | -0.098140 |
| 3 | 0.011261 | 0.034784 | 0.092310 | 0.153141 | 0.360829 |
| 4 | 0.011261 | 0.034784 | 0.068718 | 0.176289 | 0.118765 |
X = grades_centered.to_numpy()
u, s, vt = np.linalg.svd(X, full_matrices = False)
u
array([[-0.02063223, -0.00716831,  0.03907254, -0.02698313, -0.00444346],
       [-0.02180753,  0.03789669, -0.01851768, -0.04351571, -0.0247481 ],
       [ 0.01360192, -0.01948476, -0.00219371, -0.00769354, -0.00562161],
       ...,
       [-0.04458103,  0.02318513, -0.00314228, -0.01317884, -0.00446547],
       [-0.02976218,  0.0143348 , -0.03020208,  0.00787507, -0.00040914],
       [-0.0079528 ,  0.00442414, -0.00198902, -0.02148821,  0.00346208]])
np.diag(s)
array([[6.99268458, 0.        , 0.        , 0.        , 0.        ],
       [0.        , 3.59503593, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 2.46936078, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 2.21126502, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.56338675]])
vt
array([[-0.07528496, -0.19150879, -0.44440908, -0.50503701, -0.71070018],
       [-0.17658667, -0.56652566, -0.65001223,  0.22462175,  0.41820448],
       [-0.01929628, -0.47439113,  0.30708355,  0.6417103 , -0.51815856],
       [ 0.15124888, -0.64545141,  0.48593351, -0.5229386 ,  0.22565483],
       [-0.96948479,  0.02680681,  0.22260498, -0.09605103,  0.02453289]])
If we actually wanted to build a grading scheme based on the first row of $V^T$, we could normalize the row—this is not too far off from our class's normal grading scheme!
np.abs(vt[0]) / np.sum(np.abs(vt[0]))
array([0.03906969, 0.09938492, 0.23062943, 0.26209275, 0.3688232 ])
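As a rough illustration of that idea, here is how such a scheme could be applied to the original (uncentered) scores; the names weights and overall_scores are hypothetical and only used in this sketch.
# Sketch: treat the normalized PC1 loadings as grading weights, then compute
# one overall score per student from the original (uncentered) grades
weights = np.abs(vt[0]) / np.sum(np.abs(vt[0]))
overall_scores = grades.to_numpy() @ weights
overall_scores[:5]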
Let's calculate the full $XV$ matrix to determine all 5 principal components. We'll call this matrix pc.
pc = X @ vt.T
pc
array([[-0.14427467, -0.02577032,  0.09648421, -0.05966685, -0.00694685],
       [-0.1524932 ,  0.13623996, -0.04572683, -0.09622477, -0.03869085],
       [ 0.09511397, -0.07004841, -0.00541707, -0.01701247, -0.00878876],
       ...,
       [-0.31174105,  0.08335138, -0.00775941, -0.02914191, -0.00698126],
       [-0.20811756,  0.05153414, -0.07457982,  0.01741386, -0.00063964],
       [-0.05561143,  0.01590494, -0.00491161, -0.04751613,  0.00541256]])
We can create a scree plot to determine how many principal components to use.
ax = sns.lineplot(x = np.arange(1, 6), y = s ** 2 / sum(s ** 2))
ax.set_xticks(np.arange(1, 6))
ax.set_xlabel('$i^{th}$ Principal Component')
ax.set_ylabel('Proportion of Variance');
The first principal component alone captures over 60 percent of the overall variance. Based on the plot, selecting either 2 or 3 principal components would be adequate.
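We can also check the cumulative proportions directly:
# Cumulative proportion of variance captured by the first k principal components
np.cumsum(s ** 2) / np.sum(s ** 2)  # roughly 0.65, 0.82, 0.90, 0.97, 1.00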
Here is a plot displaying the first two principal components for each student, colored by the letter grade received.
You don't have access to the letter grades, but if you did, you could run code like this to generate a similar-looking plot:
ax = sns.scatterplot(x = pc[:, 0], y = pc[:, 1], hue = grades['Grade'])
# The student with the most negative first principal component (all of the PC1
# loadings in vt[0] are negative, so this is the strongest overall performer)
grades_centered.iloc[np.argmin(pc[:, 0])]
Lab Total           0.011261
Discussion Total    0.034784
Homework Total      0.092310
Midterm             0.197897
Final               0.392575
Name: 37, dtype: float64
# The student with the largest second principal component: very low discussion
# and homework totals relative to their exam performance
grades_centered.iloc[np.argmax(pc[:, 1])]
Lab Total          -0.010961
Discussion Total   -0.854105
Homework Total     -0.650369
Midterm            -0.204883
Final               0.095606
Name: 371, dtype: float64