import numpy as np
import pandas as pd
import sklearn.linear_model as lm
from sklearn.datasets import load_boston

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tnrange

plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3
sns.set()


population = np.arange(10001)


population

array([    0,     1,     2, ...,  9998,  9999, 10000])


np.mean(population)

5000.0


sample_100 = np.random.choice(population, size=100, replace=False)
np.mean(sample_100)

4891.23


def var_estimate(f, pop, m=4000, n=100):
    """Estimate the variance of estimator f by the empirical variance.
    
    f: A function of a sample
    pop: An array representing the whole population
    m, n: Use m samples of size n to estimate the variance
    """
    estimates = []
    for j in range(m):
        sample = np.random.choice(pop, size=n, replace=False)
        estimates.append(f(sample))
    estimates = np.array(estimates)
    plt.hist(estimates, bins=30)
    plt.xlim(4000, 6000)
    return np.var(estimates)

var_estimate(np.mean, population)

80591.7460290275


83465.5906135476**0.5

288.9041201048327


var_estimate(np.mean, population, n=400)

19884.378867690735


var_estimate(np.mean, population, n=1600)

4264.277193892193


2 * np.sqrt(var_estimate(np.mean, population))

590.5010497482366


def bootstrap_var_estimate(f, sample, m=4000):
    """Estimate the variance of estimator f by the empirical variance.
    
    f: A function of a sample
    sample: An array representing a sample of size n
    m: Use m samples of size n to estimate the variance
    """
    estimates = []
    n = len(sample)
    for j in range(m):
        resample = np.random.choice(sample, size=n, replace=True)
        estimates.append(f(resample))
    estimates = np.array(estimates)
    plt.hist(estimates, bins=30)
    return np.mean((estimates - np.mean(estimates))**2) # same as np.var(estimates)

bootstrap_var_estimate(np.mean, sample_100)

92767.90456840598


np.mean(sample_100)

4891.23


sample_400 = np.random.choice(population, 400, replace=False)
bootstrap_var_estimate(np.mean, sample_400)

21106.4840493027


sample_1600 = np.random.choice(population, 1600, replace=False)
bootstrap_var_estimate(np.mean, sample_1600)

5379.208413955419


def ci(sample, estimator, confidence=95, m=1000):
    """Compute a confidence interval for an estimator.
    
    sample: A DataFrame or Series
    estimator: A function from a sample DataFrame to an estimate (number)
    """
    if isinstance(sample, np.ndarray):
        sample = pd.Series(sample)
    estimates = []
    n = sample.shape[0]
    for j in range(m):
        resample = sample.sample(n, replace=True)
        estimates.append(estimator(resample))
    estimates = np.array(estimates)
    slack = 100 - confidence
    lower = np.percentile(estimates, slack/2)
    upper = np.percentile(estimates, 100 - slack/2)
    return (lower, upper)


ci(sample_100, np.mean)

(4316.2615000000005, 5457.73025)


def bootstrap_dist(sample, estimator, m=10000):
    if isinstance(sample, np.ndarray):
        sample = pd.Series(sample)
    estimates = []
    n = sample.shape[0]
    for j in range(m):
        resample = sample.sample(n, replace=True)
        estimates.append(estimator(resample))
    plt.hist(estimates, bins=30)
    
bootstrap_dist(sample_100, np.mean)


mean_ints = [ci(np.random.choice(population, 100), np.mean)
             for _ in tnrange(100)]

/tmp/ipykernel_1810/2581637515.py:2: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`
  for _ in tnrange(100)]


mean_ints

[(5042.2787499999995, 6128.70975),
 (4737.525250000001, 5813.98825),
 (4499.9980000000005, 5597.47825),
 (4451.8995, 5527.187999999999),
 (5038.9865, 6056.23975),
 (4263.074500000001, 5266.77625),
 (4496.621499999999, 5505.88575),
 (4262.69375, 5439.482),
 (3861.4925, 5063.2515),
 (4956.09775, 6022.171),
 (3615.53425, 4701.322749999999),
 (4442.1245, 5596.071000000001),
 (4498.735750000001, 5658.71825),
 (4507.4575, 5621.028249999999),
 (4439.231, 5620.148249999999),
 (4993.7125, 6170.85675),
 (4212.04625, 5249.291499999999),
 (4229.461249999999, 5331.68425),
 (5326.70025, 6348.1515),
 (4721.3775000000005, 5800.662),
 (4873.1272500000005, 6001.91225),
 (4229.7905, 5347.226250000001),
 (4256.326499999999, 5342.71525),
 (4578.864500000001, 5733.956749999999),
 (4619.047, 5568.8805),
 (4719.15775, 5736.46375),
 (4348.665, 5609.999),
 (4386.3282500000005, 5412.3904999999995),
 (4406.5830000000005, 5573.8727499999995),
 (4433.54475, 5468.29675),
 (4657.51925, 5717.335999999999),
 (4318.048750000001, 5511.237499999999),
 (5192.8975, 6344.142999999999),
 (4500.514249999999, 5565.951999999999),
 (4717.97225, 5849.10125),
 (4333.818249999999, 5384.4405),
 (3911.399, 5048.558749999999),
 (4144.219749999999, 5263.37575),
 (4349.0895, 5517.8905),
 (4666.4785, 5871.9065),
 (4207.335, 5351.74225),
 (4514.8365, 5607.65075),
 (4171.62975, 5368.992749999999),
 (4451.956749999999, 5493.152249999999),
 (4143.406499999999, 5457.51125),
 (4051.43175, 5172.162499999999),
 (4839.67175, 5962.884499999999),
 (4202.382250000001, 5389.59425),
 (4214.1845, 5320.179749999999),
 (4886.81325, 6048.544000000001),
 (4447.8555, 5576.9265),
 (5303.40025, 6513.046249999999),
 (5174.13925, 6269.30975),
 (4614.9695, 5731.4785),
 (4602.6175, 5846.9875),
 (4212.4415, 5394.818749999999),
 (4167.0165, 5272.1895),
 (4505.7245, 5670.1635),
 (4577.6357499999995, 5724.35225),
 (4662.41125, 5848.090249999999),
 (4915.367, 6082.162249999999),
 (4580.674000000001, 5682.724999999999),
 (4455.723, 5615.119000000001),
 (4713.34675, 5890.35175),
 (4372.821, 5423.1175),
 (3886.08725, 4924.184249999999),
 (4682.057, 5872.621999999999),
 (4118.739750000001, 5195.7432499999995),
 (4129.0869999999995, 5266.28975),
 (4177.800749999999, 5271.468250000001),
 (4019.93475, 5219.1089999999995),
 (4393.6425, 5480.432000000001),
 (4848.26325, 5898.608499999999),
 (4460.09875, 5584.812),
 (4766.09425, 5922.222999999999),
 (4312.553, 5603.2335),
 (4617.14125, 5796.33925),
 (4672.53725, 5783.181250000001),
 (4104.029750000001, 5224.316749999999),
 (4459.74475, 5630.0175),
 (4549.362749999999, 5632.365999999999),
 (4277.62575, 5307.00275),
 (3823.381, 4857.154750000001),
 (4603.9225, 5730.39725),
 (4300.3017500000005, 5477.3369999999995),
 (4069.73675, 5201.075000000001),
 (4658.26675, 5713.936),
 (4359.544249999999, 5565.4169999999995),
 (4483.3705, 5569.9929999999995),
 (4763.62525, 5843.60975),
 (4630.98125, 5740.644),
 (4225.77675, 5414.47275),
 (4317.4115, 5509.884),
 (4453.6410000000005, 5587.0427500000005),
 (4682.44775, 5741.18075),
 (4817.0245, 5928.79475),
 (3981.37425, 5125.8240000000005),
 (4367.488, 5470.49525),
 (4554.791, 5721.0335),
 (4630.5565, 5795.5265)]


plt.hist([v[0] for v in mean_ints], bins=30);
plt.hist([v[1] for v in mean_ints], bins=30);
sum([v[0] <= 5000 <= v[1] for v in mean_ints])

91


bootstrap_dist(sample_100, np.median)


ci(sample_100, np.median)

(3395.0, 6106.137499999999)


# True population median
np.median(population)

5000.0


bootstrap_dist(sample_100, np.std)


ci(sample_100, np.std)

(2777.4790016886896, 3275.346070876512)


# True population standard deviation
np.std(population)

2887.0400066504103


p99 = lambda a: np.percentile(a, 99)
p99(population)

9900.0


bootstrap_dist(sample_100, p99)


ci(sample_100, p99)

(9609.070000000002, 9998.0)


p99_ints = [ci(np.random.choice(population, 100), p99)
            for _ in tnrange(100)]

/tmp/ipykernel_1810/4008959745.py:2: TqdmDeprecationWarning: Please use `tqdm.notebook.trange` instead of `tqdm.tnrange`
  for _ in tnrange(100)]


sum([v[0] <= p99(population) <= v[1] for v in p99_ints])

67


nba = pd.read_csv('nba18-19.csv')

nba


nba_small = nba[['FG', 'FGA', 'FT%', '3PA', 'AST', 'PTS']].fillna(0)
nba_small


reg = lm.LinearRegression()
reg.fit(nba_small.iloc[:,:-1], nba_small.iloc[:,-1])

LinearRegression()


reg.score(nba_small.iloc[:,:-1], nba_small.iloc[:,-1])

0.9886417979129902


nba_small.columns

Index(['FG', 'FGA', 'FT%', '3PA', 'AST', 'PTS'], dtype='object')


reg.coef_.astype(float)

array([2.44566707, 0.03633589, 0.50454007, 0.28367305, 0.04144433])


def FGA_slope(t):
    reg = lm.LinearRegression().fit(t.iloc[:,:-1], t.iloc[:,-1])
    return reg.coef_[1]

ci(nba_small, FGA_slope)

(-0.06585682231822129, 0.14010320566346698)


nba_small.corr()


simple_model = lm.LinearRegression()
simple_model.fit(nba_small[['FGA']].values.reshape(-1, 1), nba_small['PTS'])

LinearRegression()


simple_model.score(nba_small[['FGA']].values.reshape(-1, 1), nba_small['PTS'])

0.9612756271235542


simple_model.coef_

array([1.29982787])


def FGA_slope_simple_model(t):
    reg = lm.LinearRegression().fit(t[['FGA']].values.reshape(-1, 1), t['PTS'])
    return reg.coef_[0]

ci(nba_small, FGA_slope_simple_model)

(1.2739728618051318, 1.3267754085445898)

	FG	FGA	FT%	3PA	AST	PTS
FG	1.000000	0.973355	0.371598	0.600830	0.665761	0.990014
FGA	0.973355	1.000000	0.395902	0.725114	0.703093	0.980447
FT%	0.371598	0.395902	1.000000	0.377633	0.288057	0.401555
3PA	0.600830	0.725114	0.377633	1.000000	0.480880	0.666673
AST	0.665761	0.703093	0.288057	0.480880	1.000000	0.676022
PTS	0.990014	0.980447	0.401555	0.666673	0.676022	1.000000

Lecture 22 – Data 100, Summer 2021¶

Estimation and Bootstrapping¶

Sample Mean Estimator¶

Sample variance estimator for the variance of the sample mean¶

Bootstrap estimator for the variance of the sample mean¶

Bootstrap confidence interval¶

Bootstrap confidence intervals for other population parameters¶

Median¶

Standard Deviation¶

99th Percentile¶

Estimating Parameters in Linear Regression¶

Multicollinearity¶

	Rk	Player	Pos	Age	Tm	G	GS	MP	FG	FGA	...	FT%	ORB	DRB	TRB	AST	STL	BLK	TOV	PF	PTS
0	1	Álex Abrines\abrinal01	SG	25	OKC	31	2	19.0	1.8	5.1	...	0.923	0.2	1.4	1.5	0.6	0.5	0.2	0.5	1.7	5.3
1	2	Quincy Acy\acyqu01	PF	28	PHO	10	0	12.3	0.4	1.8	...	0.700	0.3	2.2	2.5	0.8	0.1	0.4	0.4	2.4	1.7
2	3	Jaylen Adams\adamsja01	PG	22	ATL	34	1	12.6	1.1	3.2	...	0.778	0.3	1.4	1.8	1.9	0.4	0.1	0.8	1.3	3.2
3	4	Steven Adams\adamsst01	C	25	OKC	80	80	33.4	6.0	10.1	...	0.500	4.9	4.6	9.5	1.6	1.5	1.0	1.7	2.6	13.9
4	5	Bam Adebayo\adebaba01	C	21	MIA	82	28	23.3	3.4	5.9	...	0.735	2.0	5.3	7.3	2.2	0.9	0.8	1.5	2.5	8.9
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
703	528	Tyler Zeller\zellety01	C	29	MEM	4	1	20.5	4.0	7.0	...	0.778	2.3	2.3	4.5	0.8	0.3	0.8	1.0	4.0	11.5
704	529	Ante Žižić\zizican01	C	22	CLE	59	25	18.3	3.1	5.6	...	0.705	1.8	3.6	5.4	0.9	0.2	0.4	1.0	1.9	7.8
705	530	Ivica Zubac\zubaciv01	C	21	TOT	59	37	17.6	3.6	6.4	...	0.802	1.9	4.2	6.1	1.1	0.2	0.9	1.2	2.3	8.9
706	530	Ivica Zubac\zubaciv01	C	21	LAL	33	12	15.6	3.4	5.8	...	0.864	1.6	3.3	4.9	0.8	0.1	0.8	1.0	2.2	8.5
707	530	Ivica Zubac\zubaciv01	C	21	LAC	26	25	20.2	3.8	7.2	...	0.733	2.3	5.3	7.7	1.5	0.4	0.9	1.4	2.5	9.4