import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import networkx as nx


# Parse the fixed-width results file: one row per game, holding the date,
# the winning team and its points, and the losing team and its points.
scores = pd.read_fwf(
    'ncaa_scores19.txt',
    colspecs = [(0, 10), (11, 36), (36, 39), (40, 65), (65, 68)],
    names = ['Date', 'Winner', 'WinnerPts', 'Loser', 'LoserPts'],
)
# Strip the '@' marker from team names so each team has one spelling.
# regex = False states that '@' is a literal, so the result does not depend on
# the pandas version's default for `regex` (it changed from True to False).
scores['Winner'] = scores['Winner'].str.replace('@', '', regex = False)
scores['Loser'] = scores['Loser'].str.replace('@', '', regex = False)
# Winning margin in points.
scores['Margin'] = scores['WinnerPts'] - scores['LoserPts']
scores.head()


# Number of rows (games) in the scores table.
scores.shape[0]

5603


# Give every game an order-independent key: TeamA is the alphabetically
# smaller name and TeamB the larger one, regardless of who won.
# Series.where keeps the Loser where the condition holds and falls back to the
# Winner everywhere else.
scores['TeamA'] = scores['Loser'].where(scores['Winner'] >= scores['Loser'], scores['Winner'])
scores['TeamB'] = scores['Loser'].where(scores['Winner'] <= scores['Loser'], scores['Winner'])
scores.head()


# Count how often California met each opponent: keep the games where either
# side of the (TeamA, TeamB) pair is California, then tally the pairs.
scores.loc[scores[['TeamA', 'TeamB']].eq('California').any(axis = 1), ['TeamA', 'TeamB']].value_counts()

TeamA       TeamB        
Arizona     California       2
California  Washington St    2
            Colorado         2
            Washington       2
            USC              2
            UCLA             2
            Stanford         2
Arizona St  California       2
California  St John's        1
            Utah             1
            Temple           1
            St Mary's CA     1
            Santa Clara      1
            Seattle          1
            San Jose St      1
            San Francisco    1
            San Diego St     1
            Oregon St        1
            Oregon           1
            Hampton          1
            Fresno St        1
Cal Poly    California       1
California  Yale             1
dtype: int64


# Number of games played between each unordered pair of teams, indexed by
# (TeamA, TeamB) and sorted from most to fewest meetings.
matchups = scores.value_counts(subset = ['TeamA', 'TeamB'])
matchups

TeamA          TeamB        
Missouri KC    Utah Valley      3
CS Fullerton   UC Irvine        3
Robert Morris  St Francis NY    3
Seton Hall     Villanova        3
Long Beach St  UC Irvine        3
                               ..
F Dickinson    Gonzaga          1
               Holy Cross       1
               Lafayette        1
               Massachusetts    1
Gonzaga        Texas Tech       1
Length: 3987, dtype: int64


# Load the team -> conference lookup; the file has no header row, so column
# names are supplied here and the team name becomes the index.
teams = pd.read_csv('conferences.csv', names = ['Team', 'Conference'], index_col = 'Team')
teams.head()


# One row per conference, in alphabetical order: the number of member teams
# ('count') and a consecutive integer id ('index').
conferences = (
    teams['Conference']
    .value_counts()
    .sort_index()
    .to_frame('count')
)
conferences['index'] = np.arange(conferences.shape[0])
conferences


# Attach each team's conference id ('index'), restore alphabetical team order,
# and discard the conference-size column that came along with the merge.
teams = (
    teams
    .merge(conferences, left_on = 'Conference', right_index = True)
    .sort_values('Team')
    .drop(columns = 'count')
)


# Build an undirected graph with one node per team and one edge per pair of
# teams that played each other; the edge weight is the number of meetings.
G = nx.Graph()
# Add every team first so the node order follows teams.index.
G.add_nodes_from(teams.index)
# Series.iteritems() was removed in pandas 2.0. Series.items() yields the same
# ((TeamA, TeamB), count) pairs and exists in every pandas version.
for matchup, count in matchups.items():
    G.add_edge(matchup[0], matchup[1], weight = count)
# Edge weights in G.edges() order.
weights = [G[u][v]['weight'] for u, v in G.edges()]
len(G.nodes), len(G.edges)

(353, 3987)


# Teams used to draw a small, readable subgraph of the full schedule graph.
ca_schools = [
    'Cal Poly', 'CS Bakersfield', 'CS Fullerton', 'CS Northridge', 'CS Sacramento',
    'Long Beach St', 'Cal Baptist', 'Fresno St', 'Loy Marymount', 'Pacific',
    'Pepperdine', "St Mary's CA", 'San Diego St', 'San Diego', 'San Francisco',
    'San Jose St', 'Santa Clara', 'USC', 'Stanford', 'California',
    'UC Davis', 'UC Irvine', 'UC Riverside', 'UC Santa Barbara', 'UCLA',
]
ca_graph = G.subgraph(ca_schools)

# Colour nodes by conference: look up each node's conference id and remap the
# ids 6, 9, 18, 21, 30, 31 onto 0-5, the vmin/vmax range passed to the plot.
ca_confs = list(
    teams.loc[list(ca_graph.nodes), 'index'].replace(
        {6: 0, 9: 1, 18: 2, 21: 3, 30: 4, 31: 5}
    )
)
# Re-read the school names in the subgraph's node order.
ca_schools = list(teams.loc[list(ca_graph.nodes)].index)
nx.draw_kamada_kawai(
    ca_graph,
    with_labels = True,
    node_color = ca_confs,
    cmap = 'Set1',
    vmin = 0,
    vmax = 5,
)


# Dense adjacency matrix of G, with rows and columns in G.nodes() order.
A = nx.adjacency_matrix(G).toarray()
A

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])


# Degree matrix: the column sums of A placed on the diagonal.
D = np.diag(A.sum(axis = 0))
D

array([[30,  0,  0, ...,  0,  0,  0],
       [ 0, 31,  0, ...,  0,  0,  0],
       [ 0,  0, 31, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ..., 35,  0,  0],
       [ 0,  0,  0, ...,  0, 29,  0],
       [ 0,  0,  0, ...,  0,  0, 30]])


# Graph Laplacian computed by hand: degree matrix minus adjacency matrix.
L = D - A
L

array([[30,  0,  0, ...,  0,  0,  0],
       [ 0, 31,  0, ...,  0,  0,  0],
       [ 0,  0, 31, ...,  0,  0, -1],
       ...,
       [ 0,  0,  0, ..., 35,  0,  0],
       [ 0,  0,  0, ...,  0, 29,  0],
       [ 0,  0, -1, ...,  0,  0, 30]])


# Same Laplacian, this time obtained directly from networkx.
L = nx.laplacian_matrix(G).toarray()
L

array([[30,  0,  0, ...,  0,  0,  0],
       [ 0, 31,  0, ...,  0,  0,  0],
       [ 0,  0, 31, ...,  0,  0, -1],
       ...,
       [ 0,  0,  0, ..., 35,  0,  0],
       [ 0,  0,  0, ...,  0, 29,  0],
       [ 0,  0, -1, ...,  0,  0, 30]])


# Eigendecomposition of the symmetric Laplacian. eigh returns eigenvalues in
# ascending order, so flip both outputs to put the largest eigenvalue first,
# keeping column i of `eigenvectors` paired with eigenvalues[i].
eigenvalues, eigenvectors = np.linalg.eigh(L)
eigenvalues = np.flip(eigenvalues)
eigenvectors = np.flip(eigenvectors, axis = 1)


# Show the Laplacian eigenvalues, ordered from largest to smallest.
eigenvalues

array([ 4.46219726e+01,  4.41826302e+01,  4.37293307e+01,  4.35184974e+01,
        4.32449866e+01,  4.30987686e+01,  4.27793711e+01,  4.26518290e+01,
        4.26147783e+01,  4.24977309e+01,  4.22325073e+01,  4.21338632e+01,
        4.19441610e+01,  4.17610321e+01,  4.16921148e+01,  4.14594695e+01,
        4.13478239e+01,  4.11521964e+01,  4.10999830e+01,  4.10229767e+01,
        4.08521014e+01,  4.08335141e+01,  4.06825749e+01,  4.05608720e+01,
        4.04735786e+01,  4.03586636e+01,  4.02871820e+01,  4.01981439e+01,
        4.01609994e+01,  4.00423316e+01,  3.99512138e+01,  3.99408082e+01,
        3.97759610e+01,  3.97123016e+01,  3.96632047e+01,  3.95809648e+01,
        3.95491604e+01,  3.94110766e+01,  3.93480383e+01,  3.93088599e+01,
        3.91837212e+01,  3.91612140e+01,  3.90999591e+01,  3.90540997e+01,
        3.89563183e+01,  3.88285748e+01,  3.87520273e+01,  3.87350934e+01,
        3.87086481e+01,  3.86235854e+01,  3.85680432e+01,  3.85353777e+01,
        3.85299270e+01,  3.83600769e+01,  3.83011332e+01,  3.82581998e+01,
        3.81895555e+01,  3.81680898e+01,  3.81167678e+01,  3.80632805e+01,
        3.80227538e+01,  3.79360727e+01,  3.78753357e+01,  3.78381460e+01,
        3.78280858e+01,  3.77547929e+01,  3.76879733e+01,  3.76414837e+01,
        3.76022326e+01,  3.75306921e+01,  3.75020318e+01,  3.74282638e+01,
        3.73651601e+01,  3.73330516e+01,  3.73127371e+01,  3.72539419e+01,
        3.72304249e+01,  3.71723270e+01,  3.71141233e+01,  3.70519040e+01,
        3.70300655e+01,  3.69863303e+01,  3.69380015e+01,  3.68803133e+01,
        3.68310429e+01,  3.67883669e+01,  3.67607491e+01,  3.66771905e+01,
        3.66143805e+01,  3.65929017e+01,  3.65819922e+01,  3.65510455e+01,
        3.65236818e+01,  3.64760100e+01,  3.64468045e+01,  3.62940059e+01,
        3.62715294e+01,  3.62459213e+01,  3.61885025e+01,  3.61357765e+01,
        3.61070385e+01,  3.60569856e+01,  3.60000270e+01,  3.59651269e+01,
        3.59618355e+01,  3.58410157e+01,  3.58296809e+01,  3.57640440e+01,
        3.57395703e+01,  3.56960159e+01,  3.56112412e+01,  3.55634022e+01,
        3.55173256e+01,  3.55002963e+01,  3.54599869e+01,  3.54430585e+01,
        3.54313215e+01,  3.52992407e+01,  3.52850803e+01,  3.52597169e+01,
        3.52429084e+01,  3.51686842e+01,  3.51390927e+01,  3.51047057e+01,
        3.50741635e+01,  3.50169266e+01,  3.50024398e+01,  3.49709744e+01,
        3.49506351e+01,  3.49065948e+01,  3.48311747e+01,  3.48041296e+01,
        3.47798713e+01,  3.47138072e+01,  3.46650601e+01,  3.46223340e+01,
        3.45946641e+01,  3.45708551e+01,  3.44672875e+01,  3.44426752e+01,
        3.44194472e+01,  3.43552937e+01,  3.43450639e+01,  3.42850739e+01,
        3.42250789e+01,  3.42221595e+01,  3.41093967e+01,  3.40618251e+01,
        3.40252315e+01,  3.39661225e+01,  3.39529046e+01,  3.39140579e+01,
        3.38845749e+01,  3.38608139e+01,  3.38147319e+01,  3.37933535e+01,
        3.37024450e+01,  3.36507006e+01,  3.36144169e+01,  3.36076497e+01,
        3.36006416e+01,  3.35702168e+01,  3.35272284e+01,  3.34659537e+01,
        3.34453380e+01,  3.34068933e+01,  3.33528295e+01,  3.33352076e+01,
        3.33202591e+01,  3.32633712e+01,  3.31778334e+01,  3.31503201e+01,
        3.30960942e+01,  3.30589978e+01,  3.30126992e+01,  3.29769271e+01,
        3.29671956e+01,  3.29387754e+01,  3.28623820e+01,  3.28455602e+01,
        3.28127478e+01,  3.27610244e+01,  3.27080726e+01,  3.26884679e+01,
        3.26617874e+01,  3.26292001e+01,  3.25993261e+01,  3.25405510e+01,
        3.24829954e+01,  3.24290589e+01,  3.24098899e+01,  3.23575288e+01,
        3.23183431e+01,  3.22563686e+01,  3.22292097e+01,  3.21535504e+01,
        3.21264164e+01,  3.21119737e+01,  3.20398075e+01,  3.20044746e+01,
        3.19956088e+01,  3.19536411e+01,  3.18790562e+01,  3.18241169e+01,
        3.17938506e+01,  3.17859987e+01,  3.17253462e+01,  3.16788676e+01,
        3.16465937e+01,  3.16106029e+01,  3.15561297e+01,  3.15448878e+01,
        3.14900722e+01,  3.14277336e+01,  3.14031121e+01,  3.13394790e+01,
        3.13064947e+01,  3.12408530e+01,  3.12099754e+01,  3.11892927e+01,
        3.10794302e+01,  3.10595304e+01,  3.10117934e+01,  3.09811097e+01,
        3.09531918e+01,  3.09075048e+01,  3.08763219e+01,  3.08156077e+01,
        3.07397864e+01,  3.07361608e+01,  3.06718677e+01,  3.06229832e+01,
        3.05824254e+01,  3.05818014e+01,  3.05623909e+01,  3.04953221e+01,
        3.04492785e+01,  3.04021431e+01,  3.03836927e+01,  3.03103860e+01,
        3.02400177e+01,  3.01980763e+01,  3.01696468e+01,  3.01370760e+01,
        3.01128927e+01,  3.00763949e+01,  3.00241463e+01,  2.99900642e+01,
        2.99532547e+01,  2.98650883e+01,  2.97728239e+01,  2.97397412e+01,
        2.96973679e+01,  2.96654740e+01,  2.96139641e+01,  2.95877792e+01,
        2.95640548e+01,  2.95252611e+01,  2.94564349e+01,  2.93955987e+01,
        2.93315943e+01,  2.93201943e+01,  2.92892596e+01,  2.91789787e+01,
        2.91573230e+01,  2.91080838e+01,  2.90312449e+01,  2.89703284e+01,
        2.88419645e+01,  2.88089138e+01,  2.87897016e+01,  2.87515815e+01,
        2.87321860e+01,  2.86399373e+01,  2.85741479e+01,  2.85649715e+01,
        2.84982141e+01,  2.84758043e+01,  2.83993009e+01,  2.83224454e+01,
        2.82975054e+01,  2.82336080e+01,  2.81669889e+01,  2.80890167e+01,
        2.80356133e+01,  2.79708735e+01,  2.78862644e+01,  2.78538750e+01,
        2.77851575e+01,  2.77230258e+01,  2.76840327e+01,  2.76212120e+01,
        2.76090866e+01,  2.75224765e+01,  2.74537834e+01,  2.74132929e+01,
        2.73596632e+01,  2.73287591e+01,  2.72322341e+01,  2.71460106e+01,
        2.70928792e+01,  2.70136044e+01,  2.69556066e+01,  2.68999577e+01,
        2.68492257e+01,  2.66908485e+01,  2.66265984e+01,  2.65747468e+01,
        2.64171603e+01,  2.62678635e+01,  2.62153669e+01,  2.61996585e+01,
        2.59140698e+01,  2.58241277e+01,  2.56717562e+01,  2.54319005e+01,
        2.52226345e+01,  2.50502452e+01,  2.49489645e+01,  2.48974578e+01,
        2.40469940e+01,  1.61558478e+01,  1.59730148e+01,  1.54435755e+01,
        1.51111601e+01,  1.47349681e+01,  1.44233990e+01,  1.43218276e+01,
        1.41523793e+01,  1.41141242e+01,  1.37574930e+01,  1.36873132e+01,
        1.33667010e+01,  1.32584135e+01,  1.31227504e+01,  1.29387723e+01,
        1.25788699e+01,  1.23381120e+01,  1.21196289e+01,  1.17188085e+01,
        1.17071616e+01,  1.15527519e+01,  1.13327157e+01,  1.13044726e+01,
        1.09043675e+01,  1.03961473e+01,  1.02293787e+01,  9.76728195e+00,
        8.63121853e+00,  8.11246012e+00,  6.28601440e+00,  4.96468216e+00,
       -1.97950180e-14])


# Show the eigenvector matrix; column i belongs to eigenvalues[i].
eigenvectors

array([[ 0.00769313, -0.00932888,  0.01402037, ..., -0.05419384,
         0.04167957,  0.05322463],
       [-0.0106242 ,  0.03274092,  0.0025207 , ...,  0.04422881,
         0.06263588,  0.05322463],
       [-0.00373961,  0.02518204,  0.00994416, ..., -0.01792892,
         0.00269735,  0.05322463],
       ...,
       [ 0.03174857,  0.00546717,  0.01724336, ..., -0.01295909,
        -0.01353248,  0.05322463],
       [-0.03599232,  0.03798604, -0.01938344, ...,  0.07446431,
        -0.06677513,  0.05322463],
       [-0.01411188,  0.00155191, -0.01603846, ..., -0.0156431 ,
        -0.02157362,  0.05322463]])


# Wrap the eigenvectors in a DataFrame (one row per team, one column 'v_i' per
# eigenvector) and join it onto the team table.
# - Column names come from the matrix width instead of a hard-coded 353 and
#   avoid np.core.defchararray, a private NumPy path deprecated in NumPy 2.0.
# - Rows are labelled with G's node order, which is the order networkx uses for
#   the Laplacian's rows, so the join lines up by team name even if that order
#   ever differs from teams.index.
df = pd.DataFrame(
    eigenvectors,
    columns = [f'v_{i}' for i in range(eigenvectors.shape[1])],
    index = pd.Index(list(G.nodes), name = teams.index.name),
)
teams = teams.join(df)

import plotly.express as px
# Scatter the teams on the last two eigenvector columns (the two smallest
# eigenvalues), coloured by conference; hovering shows the team name.
px.scatter(
    teams.reset_index(),
    x = 'v_352',
    y = 'v_351',
    color = 'Conference',
    range_x = [0, 1/12],
    range_y = [-1/6, 1/6],
    opacity = .2,
    hover_data = ['Team'],
)


# Same view one step further along the spectrum: eigenvector columns 351 and
# 350, coloured by conference, with the team name available on hover.
px.scatter(
    teams.reset_index(),
    x = 'v_351',
    y = 'v_350',
    color = 'Conference',
    opacity = 1,
    hover_data = ['Team'],
)


# Embed each team using the last 32 eigenvector columns, i.e. the ones with
# the smallest eigenvalues given the descending sort applied earlier.
# NOTE(review): 32 appears to match the number of conferences - confirm.
coordinates = eigenvectors[:, -32:]
coordinates.shape

(353, 32)


from sklearn.cluster import KMeans

# Cluster the spectral embedding into 32 groups.
# n_init is pinned to 10, the default in effect when the recorded output was
# produced. scikit-learn 1.4 changed the default to 'auto', which would run a
# single initialisation here and could yield different labels.
model = KMeans(n_clusters = 32, n_init = 10, max_iter = 10000, random_state = 100)
model.fit(coordinates)

KMeans(max_iter=10000, n_clusters=32, random_state=100)


# Cluster id assigned to each row of `coordinates`.
model.labels_

array([15, 11, 28, 20, 23, 23, 23,  8,  0,  4,  4,  0, 23, 20,  0,  8, 20,
       13, 18, 28, 21, 13,  5, 24, 11,  6,  8, 28, 10, 29, 25,  8, 28, 12,
       28, 19,  9,  9,  3, 19,  9,  4, 27, 16, 15, 25, 27,  7, 17, 19,  2,
       17,  6, 31,  0, 30,  8,  4, 11, 29,  2,  5, 29, 12, 29, 14, 14, 12,
       30,  5, 26, 31, 10, 30,  6, 14, 13, 13, 28,  3, 17,  2, 30, 10, 25,
        7, 22, 16, 20,  5,  7,  6, 14, 11, 17, 14,  0, 27, 14, 12, 20,  0,
        6, 18, 23, 19, 27, 24, 29,  9, 27, 30,  8,  2, 15,  5, 31, 31,  3,
        3,  1, 10, 15,  1, 10, 16,  1, 21, 23, 22, 13, 30, 21, 21, 22, 28,
       20, 25, 20, 14,  8, 15,  8, 22, 22,  9, 27,  0,  7,  6, 18,  8, 10,
       24,  5, 23,  7, 24, 16, 16, 12,  7,  1, 14, 15,  2, 17,  6, 28,  1,
        1,  1, 20, 20, 20, 19, 10, 16,  3,  3, 13,  5, 25, 13,  3, 26, 28,
       31,  5,  5,  6, 26, 22,  8,  1, 11, 24, 11, 19, 15, 16, 15,  5, 22,
        6, 26, 22,  7, 30,  3, 10,  1, 15,  6, 31, 28,  1, 21, 21,  7, 26,
        4,  4, 26, 18, 29,  1, 18,  6, 18,  3, 23, 27, 29, 12,  1, 16, 27,
       14,  7, 14, 16, 25,  1,  5, 26, 10, 27, 15, 13, 15, 13,  2, 24, 25,
       15, 17, 18, 11, 18, 11, 18,  5, 19, 12, 16,  0, 20, 26,  2,  7, 23,
        3, 14, 25, 25, 12, 14, 14, 18, 16,  4, 22, 24,  6, 15, 21, 13, 23,
        2, 20, 13, 13, 21, 20,  0, 21, 28, 30,  0,  2,  2,  7,  9,  9,  9,
        9,  2,  4,  0, 24, 27, 17, 30, 11,  4,  0,  7,  7, 19,  4, 11, 19,
       14, 17, 10, 20, 24, 12,  6,  6, 17, 26, 28, 31, 31,  7, 25,  6,  4,
        4,  3, 21,  2, 30, 27,  1, 17, 31, 11, 12, 29, 31], dtype=int32)


# Record each team's k-means cluster id next to its conference.
teams = teams.assign(cluster = model.labels_)
teams


# Cross-tabulate conferences against clusters: one row per (Conference,
# cluster) pair that occurs, with the number of teams in it.
teams.value_counts(subset = ['Conference', 'cluster'])

Conference       cluster
ACC              6          15
Atlantic 10      14         14
SEC              20         14
Big Ten          1          14
C-USA            7          14
Southland        15         13
MAC              28         12
The American     2          12
Sun Belt         0          12
Pac-12           4          12
Ohio Valley      13         12
MEAC             5          12
Big South        27         11
MAAC             16         11
Mountain West    11         11
Big Sky          3          11
Horizon          31         10
Big 12           21         10
Big East         12         10
Southern         17         10
SWAC             23         10
Patriot          8          10
Northeast        25         10
Missouri Valley  10         10
CAA              30         10
West Coast       18         10
Big West         9           9
ASUN             22          9
Summit           26          9
America East     24          9
WAC              19          9
Ivy              29          8
dtype: int64

	Date	Winner	WinnerPts	Loser	LoserPts	Margin	TeamA	TeamB
0	2018-11-06	South Florida	80	Alabama A&M	63	17	Alabama A&M	South Florida
1	2018-11-06	Iowa St	79	Alabama St	53	26	Alabama St	Iowa St
2	2018-11-06	Tulsa	73	Alcorn St	56	17	Alcorn St	Tulsa
3	2018-11-06	Ball St	86	Indiana St	69	17	Ball St	Indiana St
4	2018-11-06	Cornell	86	Binghamton	75	11	Binghamton	Cornell

	Conference	index	v_0	v_1	v_2	v_3	v_4	v_5	v_6	v_7	...	v_344	v_345	v_346	v_347	v_348	v_349	v_350	v_351	v_352	cluster
Team
Abilene Chr	Southland	26	0.007693	-0.009329	0.014020	-0.011543	0.022672	-0.016362	0.001264	-0.030554	...	-0.071254	0.011189	-0.071870	0.096802	-0.014347	0.162703	-0.054194	0.041680	0.053225	15
Air Force	Mountain West	18	-0.010624	0.032741	0.002521	-0.006968	-0.012603	0.023571	0.038701	-0.023544	...	-0.068135	0.034767	0.025738	-0.119272	-0.013636	-0.006430	0.044229	0.062636	0.053225	11
Akron	MAC	15	-0.003740	0.025182	0.009944	-0.003773	0.001305	-0.020023	-0.014674	0.017447	...	-0.073507	0.012057	0.051819	0.073419	-0.143641	-0.079020	-0.017929	0.002697	0.053225	28
Alabama	SEC	23	0.009476	0.000720	-0.031148	0.059372	0.041616	-0.032372	0.083505	-0.055128	...	0.032349	-0.052711	0.008797	-0.015395	0.004918	0.024036	-0.044511	0.005263	0.053225	20
Alabama A&M	SWAC	24	-0.006395	-0.030211	-0.005560	-0.019168	-0.011639	-0.036714	0.022952	0.008351	...	-0.061766	0.019162	0.009362	-0.070629	-0.022292	0.061996	-0.031740	0.032270	0.053225	23
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Wright St	Horizon	12	-0.022203	-0.006616	-0.041241	-0.005564	0.019240	0.026638	-0.004195	0.005522	...	-0.041999	0.005003	0.007256	0.074406	-0.135750	-0.091863	-0.036042	-0.011146	0.053225	31
Wyoming	Mountain West	18	-0.000155	-0.008319	-0.007922	0.002666	-0.000594	-0.003599	0.007151	0.019719	...	-0.061826	0.037065	0.026551	-0.110060	-0.010516	-0.004850	0.038841	0.062384	0.053225	11
Xavier	Big East	5	0.031749	0.005467	0.017243	0.208324	0.073593	0.020652	-0.079958	-0.062220	...	0.019757	-0.086844	0.025428	0.008432	-0.052781	-0.028187	-0.012959	-0.013532	0.053225	12
Yale	Ivy	13	-0.035992	0.037986	-0.019383	0.000843	-0.002007	-0.026625	-0.021682	0.027465	...	0.002514	0.021548	-0.012942	-0.017957	0.006456	0.024757	0.074464	-0.066775	0.053225	29
Youngstown St	Horizon	12	-0.014112	0.001552	-0.016038	0.013200	0.020774	0.003534	0.015230	-0.000660	...	-0.042542	0.005380	0.026235	0.087848	-0.147246	-0.079361	-0.015643	-0.021574	0.053225	31

Lecture 25 - Clustering Part 2

Loading in Data

Basic EDA

Conferences

Creating our Graph

Spectral Clustering

	count	index
ACC	15	0
ASUN	9	1
America East	9	2
Atlantic 10	14	3
Big 12	10	4
Big East	10	5
Big Sky	11	6
Big South	11	7
Big Ten	14	8
Big West	9	9
C-USA	14	10
CAA	10	11
Horizon	10	12
Ivy	8	13
MAAC	11	14
MAC	12	15
MEAC	12	16
Missouri Valley	10	17
Mountain West	11	18
Northeast	10	19
Ohio Valley	12	20
Pac-12	12	21
Patriot	10	22
SEC	14	23
SWAC	10	24
Southern	10	25
Southland	13	26
Summit	9	27
Sun Belt	12	28
The American	12	29
WAC	9	30
West Coast	10	31