import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.linear_model as lm
import seaborn as sns
from tqdm.notebook import tqdm
⏪ Data 8 Review: Bootstrapping, Confidence Intervals, and Hypothesis Testing¶
Suppose we want to estimate some fixed population-level quantity, like the true average height of all 32,000 UC Berkeley undergraduates. We assume that this true average height is a fixed but unknown quantity. In other words, the population-level statistic is not a random variable.
In a perfect world, we would measure the height of every UC Berkeley undergraduate, calculate the average height, and have a perfect answer to our question. But we cannot reasonably measure the height of every UC Berkeley undergraduate. Instead, we might take a random sample of, say, 10 UC Berkeley students, calculate the sample average, and then use that sample average as our "best guess" of the true average height of all 32,000 UC Berkeley students.
Here's a (fake) random sample of 10 UC Berkeley undergraduate heights in inches, along with the sample mean of those heights:
heights = np.array([68, 67, 69, 66, 66, 66, 71, 72, 61, 70])
heights.mean()
67.6
We would say that 67.6 inches is our "best guess" of the true average height of UC Berkeley undergraduates.
Unlike the true average height, your "best guess" is a random quantity. For example, if you and a friend separately sampled 10 UC Berkeley undergraduates, your two sample averages would probably differ due to randomness in the samples. This raises the question: How much could our "best guesses" differ?
To answer this question, it would be useful for us to measure the variability of our sample statistic across parallel universes of random samples. But, we have another problem: We only get to observe one universe!
In Data 8, you learned that bootstrapping can be used to construct synthetic parallel universes. For example, we could construct synthetic bags of M&Ms by resampling from one real bag of M&Ms.
Here are the steps of bootstrapping written out:
1. Assume that your random sample of size $n$ is representative of the true population.
2. To mimic a random draw of size $n$ from the true population, randomly resample $n$ observations with replacement from your random sample. Call this a "synthetic" random sample.
3. To compute a synthetic "best guess", calculate the sample statistic using your synthetic random sample. For example, you could calculate the sample average.
4. Repeat steps 2 and 3 many times. A common choice is 10,000 times.
5. The distribution of the 10,000 synthetic "best guesses" provides a sense of uncertainty around your original "best guess".
Here's how we could generate just one synthetic random sample of heights:
# Set seed for reproducibility
np.random.seed(100)
sample_size = len(heights)
# Resample n values with replacement from our real sample of 10 heights
synth_heights = np.random.choice(heights, size=sample_size, replace=True)
# Compute the mean of the synthetic sample
synth_estimate = synth_heights.mean()
print("Original Heights:")
print(heights)
print("Mean of Original Heights:")
print(heights.mean())
print()
print("Synthetic Heights:")
print(synth_heights)
print("Mean of Synthetic Heights:")
print(synth_estimate)
Original Heights:
[68 67 69 66 66 66 71 72 61 70]
Mean of Original Heights:
67.6

Synthetic Heights:
[61 61 66 72 72 68 66 69 66 69]
Mean of Synthetic Heights:
67.0
Notice that our synthetic sample mean of 67 inches differs from our original "best guess" of 67.6 inches.
Let's repeat this 10,000 times:
sample_size = len(heights)
n_boot = 10000
# Create an empty array to hold the 10,000 synthetic "best guesses"
synth_estimates = np.zeros(n_boot)
for i in range(n_boot):
    # Resample n values with replacement from our real sample of 10 heights
    synth_heights = np.random.choice(heights, size=sample_size, replace=True)
    # Compute the mean of the synthetic sample
    synth_estimate = synth_heights.mean()
    # Store the synthetic mean in synth_estimates
    synth_estimates[i] = synth_estimate
print('Number of synthetic best guesses:')
print(len(synth_estimates))
print('First 5 synthetic best guesses:')
print(synth_estimates[:5])
Number of synthetic best guesses:
10000
First 5 synthetic best guesses:
[67.8 66.9 68.3 67.8 66. ]
To get a sense of how much our best guess could vary across parallel universes, we can visualize the resulting distribution of synthetic best guesses:
fig = px.histogram(pd.Series(synth_estimates),
title='Bootstrap Distribution of the Sample Mean Height',
width=800, histnorm='probability',
barmode="overlay", opacity=0.8)
fig.show()
It looks like the best guesses of most parallel universes fall between 65 inches and 70 inches.
Bootstrap confidence intervals¶
Suppose a construction manager at Berkeley asked you to find the sample mean height of Berkeley undergraduates so that doors in a new building were neither too high nor too low. It would be a good idea to not only provide your best guess of 67.6 inches, but also a sense of how uncertain you are about that best guess.
To do so, we can construct and report a bootstrap confidence interval (CI). For example, to construct a 95% CI, we would grab the middle 95% of the synthetic best guesses. In other words, we would grab the 2.5th percentile and the 97.5th percentile:
# Grab the 2.5th and 97.5th percentiles of the synthetic sample means
ci_bounds = np.percentile(synth_estimates, [2.5, 97.5])
print("Lower bound of 95% CI: ", ci_bounds[0])
print("Upper bound of 95% CI: ", ci_bounds[1])
fig.add_vline(x=ci_bounds[0], line_color='red')
fig.add_vline(x=ci_bounds[1], line_color='red')
fig.add_annotation(x=ci_bounds[0], y=0.02, text="Lower Bound", showarrow=True, arrowhead=2)
fig.add_annotation(x=ci_bounds[1], y=0.02, text="Upper Bound", showarrow=True, arrowhead=2)
fig.show()
Lower bound of 95% CI:  65.6
Upper bound of 95% CI:  69.4
Given the values above, we could say "We are 95% confident that the true average height of UC Berkeley undergrads is between 65.6 inches and 69.4 inches."
- What "confidence" really means: if we repeated the whole process (draw a random sample, bootstrap it, build a 95% interval) across many parallel universes, we would expect about 95% of the resulting intervals to contain the true average height.
This knowledge of uncertainty around our best guess can help us make better decisions than a single best guess alone.
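To make "confidence" concrete, here is a minimal sketch (an addition to these notes, not part of the original demo) that invents a hypothetical population with a known true mean, then checks how often a bootstrap percentile CI built from a fresh sample of 10 captures that truth. With samples this small, coverage typically lands near, though often somewhat below, 95%:

# Hypothetical population of 32,000 heights (normality is an assumption for this sketch)
rng = np.random.default_rng(0)
population = rng.normal(67, 3, size=32_000)
true_mean = population.mean()

covered = 0
n_universes = 500  # kept small so the sketch runs quickly
for _ in range(n_universes):
    # One "universe": a fresh random sample of 10 heights
    sample = rng.choice(population, size=10, replace=False)
    # Bootstrap the sample and build a 95% percentile CI
    boot_means = [rng.choice(sample, size=10, replace=True).mean()
                  for _ in range(1000)]
    lo, hi = np.percentile(boot_means, [2.5, 97.5])
    covered += (lo <= true_mean <= hi)

print("Fraction of universes whose CI captured the truth:", covered / n_universes)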
Bootstrap hypothesis testing¶
We can also use our confidence interval to perform hypothesis testing. For example, someone might claim that the true average height of UC Berkeley undergrads is 68 inches. This person has proposed what is called a null hypothesis.
$$ \text{Null hypothesis } H_0: \mu = 68 $$
In the definition above, $\mu$ is the claimed true average height of Berkeley undergrads. We can show this null population mean on our plot from above:
fig.add_vline(x=68, line_color='green')
fig.add_annotation(x=68, y=0.02, text="Null population mean", showarrow=True, arrowhead=2)
fig.show()
Our best guess of the true average height based on our original sample is 67.6 inches. But, our bootstrap distribution of the sample mean shows that we could have plausibly made a best guess of 68 inches, in a parallel universe.
Statistical logic (out of scope) therefore tells us to fail to reject the hypothesis that the true average height is 68 inches at a 5% significance level.
- This does not mean that the true average height is exactly 68 inches (i.e., that the hypothesis is true, or that we accept the hypothesis).
- Instead, all we can say is that our sample data could have plausibly been observed if the true average height were indeed 68 inches, so we can't rule out the possibility that the true average height is in fact 68 inches.
- The significance level is 1 minus the confidence level: here, 1 - 0.95 = 0.05, or 5%.
If the null hypothesis instead claimed that the true average height were, say, 70 inches, then we would reject it at the 5% level, since 70 inches falls outside of our 95% confidence interval (see the small helper below).
- In other words, it is unlikely that we could have observed a sample average height of 70 inches in a parallel universe.
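The CI-based test above boils down to a containment check. Here is a tiny hypothetical helper (an addition to these notes) that makes the rule explicit:

# Reject H0: mu = null_value at the 5% level iff null_value falls outside the 95% CI
def reject_at_5pct(null_value, ci):
    lower, upper = ci
    return not (lower <= null_value <= upper)

print(reject_at_5pct(68, ci_bounds))  # False: fail to reject
print(reject_at_5pct(70, ci_bounds))  # True: reject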
From last lecture: Bias, variance, and MSE of an estimator¶
In the last lecture, we learned about the bias, variance, and MSE of an estimator, which are defined via expectations over the infinitely many possible random samples of the data:
$$ \text{Bias}(\hat{\theta}) = \mathbb{E} \left[ \hat{\theta} \right] - \theta $$
$$ \text{Variance}(\hat{\theta}) = \mathbb{E}\left[ \left( \hat{\theta} - \mathbb{E}(\hat{\theta}) \right)^2 \right] $$
$$ \text{MSE}(\hat{\theta}) = \mathbb{E}\left[ \left( \hat{\theta} - \theta \right)^2 \right] $$
Notice that the variance formula does not require us to know $\theta$. So, we can estimate the variance of our estimator using the bootstrap distribution of $\hat{\theta}$'s.
Estimating variance is an important component of constructing normally-approximated bootstrapped confidence intervals, which are beyond the scope of Data 100.
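For the curious, here is a minimal out-of-scope sketch (an addition to these notes) of that normal approximation: use the bootstrap standard deviation as a standard error, and take roughly plus or minus 2 standard errors around the original estimate.

# Out-of-scope sketch: normal-approximation 95% CI from the bootstrap SD
se_hat = np.std(synth_estimates)
normal_ci = (heights.mean() - 1.96 * se_hat, heights.mean() + 1.96 * se_hat)
print(normal_ci)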
To estimate the expectation from a random sample of $B$ synthetic values of $\hat{\theta}$, we just compute the sample mean of the $\hat{\theta}$'s:
$$ \widehat{\mathbb{E}(\hat{\theta})} = \text{Sample mean of }\hat{\theta}\text{'s} = \bar{\hat{\theta}} = \frac{1}{B} \sum_{i=1}^B \hat{\theta}_i $$
To estimate the variance from a random sample of $B$ synthetic values of $\hat{\theta}$, we just compute the sample variance of the $\hat{\theta}$'s:
$$ \widehat{\text{Var}(\hat{\theta})} = \text{Sample variance of }\hat{\theta}\text{'s} \approx \frac{1}{B} \sum_{i=1}^B (\hat{\theta}_i - \bar{\hat{\theta}} )^2 $$
The $\approx$ above is out of scope for Data 100. Don't worry about it!
# Estimate expectation using a mean:
estimated_expectation_theta_hat = synth_estimates.mean()
# Same idea for estimated variance
estimated_variance_theta_hat = np.mean((synth_estimates - estimated_expectation_theta_hat) ** 2)
print("Estimated variance of the synthetic sample means: ", estimated_variance_theta_hat)
Estimated variance of the synthetic sample means: 0.9209339375
# np.var directly computes the variance of an array, so we get the same answer!
sample_variance_est = np.var(synth_estimates)
print("Estimated variance of the synthetic sample means: ", sample_variance_est)
Estimated variance of the synthetic sample means: 0.9209339375
Important Data 100 skill from this section: Understand how population level quantities like $\mathbb{E}(\text{Something})$ and $\text{Var}(\text{Something})$ can be estimated with a sample of $\text{Something}_i$'s.
Instructor note: Return to slides!
🍝 Bootstrapping a regression coefficient¶
For demonstration, we will fit an SLR model to a random sample of the `mpg` dataset, predicting `mpg` (miles per gallon) from `weight` (the weight of the vehicle):
$$ \widehat{\text{mpg}} = \hat{\theta}_0 + \hat{\theta}_1 \cdot \text{weight} $$
Then, using the bootstrap, we will construct a confidence interval around the $\hat{\theta}_1$ coefficient.
- This confidence interval tells us the values of $\hat{\theta}_1$ that we could have observed in a parallel universe where a different random sample of cars of the same size was selected.
Suppose we collected a simple random sample of 20 cars from a population of cars. For the purposes of this demo, we will assume that `seaborn`'s `mpg` dataset is the population of all cars.
Here's a visualization of our sample and SLR model:
# Set seed for reproducibility
np.random.seed(42)
# Number of cars to sample
sample_size = 20
# Load in the mpg from seaborn
mpg = sns.load_dataset('mpg')
print("Full Data Size:", len(mpg))
# Sample `sample_size` rows from the mpg dataset
mpg_sample = mpg.sample(sample_size)
print("Sample Size:", len(mpg_sample))
px.scatter(mpg_sample, x='weight', y='mpg', trendline='ols', width=800)
Full Data Size: 398
Sample Size: 20
We can fit a linear model with `sklearn` to get an estimate of the slope:
model = lm.LinearRegression().fit(mpg_sample[['weight']], mpg_sample['mpg'])
print("Slope of the regression line: ", model.coef_[0])
print("Intercept of the regression line: ", model.intercept_)
Slope of the regression line:  -0.0069218218719096815
Intercept of the regression line:  43.529512519963816
Our "best guess" of the change in `mpg` associated with a one-unit increase in `weight` is -0.007.
But our best guess is not the end of the story! We can use the bootstrap to measure the uncertainty around it.
Bootstrap Implementation¶
Now let's use the bootstrap to estimate what $\hat{\theta}_1$ might look like across parallel universes of SLR models fit to different random samples.
To make our code reusable, let's write a `bootstrap` function that takes in a random sample and an estimator function (e.g., the sample mean, or a function that extracts the $\hat{\theta}_1$ coefficient from an SLR fit), and then uses that estimator function to construct many synthetic estimates.
def estimator(sample):
    """
    Fits an SLR to `sample` regressing mpg on weight,
    and returns the intercept and slope of the fitted line.
    """
    model = lm.LinearRegression().fit(sample[['weight']], sample['mpg'])
    return (model.intercept_, model.coef_[0])
estimator(mpg_sample)
(43.529512519963816, -0.0069218218719096815)
As expected, our estimator function returns the same intercept and slope as the model we fit above, so long as we plug in the original sample of cars.
print("Slope of the regression line: ", model.coef_[0])
print("Intercept of the regression line: ", model.intercept_)
Slope of the regression line:  -0.0069218218719096815
Intercept of the regression line:  43.529512519963816
Next, we will write the general-purpose `bootstrap` function.
Writing a general-purpose bootstrap function is a classic problem that could show up in an interview or on the job, so it's worth understanding this section well!
The `bootstrap` function code uses `df.sample` (link) to generate a bootstrap sample of the same size as the original sample.
def bootstrap(sample, statistic, num_repetitions):
    """
    Returns the statistic computed on num_repetitions
    bootstrap samples from sample.
    """
    stats = []
    # tqdm provides a progress bar;
    # functionally, this loop is the same as `for i in np.arange(num_repetitions)`
    for i in tqdm(np.arange(num_repetitions), "Bootstrapping"):
        # Step 1: Resample with replacement from our original sample to generate
        # a synthetic sample of the same size
        bootstrap_sample = sample.sample(frac=1, replace=True)
        # Step 2: Calculate a synthetic estimate using the synthetic sample
        bootstrap_stat = statistic(bootstrap_sample)
        # Append the synthetic estimate to the list of estimates
        stats.append(bootstrap_stat)
    return stats
Constructing MANY bootstrap slope estimates:
In general, 10,000 is a good default for the number of synthetic samples to compute. In this case, we will use 1,000 just so our code runs a little faster.
bs_thetas = bootstrap(mpg_sample, estimator, 1000)
print("Number of bootstrap estimates:", len(bs_thetas))
print("First 5 bootstrap estimates:", bs_thetas[:5])
Bootstrapping: 0%| | 0/1000 [00:00<?, ?it/s]
Number of bootstrap estimates: 1000
First 5 bootstrap estimates: [(43.215970007392926, -0.006714853868176342), (45.26126846596228, -0.007064625623981777), (46.602470109629174, -0.007653151422012398), (45.84303440370839, -0.007484470553621759), (46.17677039482411, -0.007577745379669565)]
We can visualize the 1,000 synthetic SLR models we fit with the bootstrap:
# Plot the SLR models given in bs_thetas,
# starting with a scatterplot of the original data
fig = px.scatter(mpg_sample, x='weight', y='mpg', trendline='ols', width=800)
for theta in bs_thetas:
    # Unpack the tuple
    intercept, slope = theta
    # Create a line from the intercept and slope
    x = np.linspace(1500, 5000, 100)
    y = intercept + slope * x
    # Plot the lines transparently
    fig.add_scatter(x=x, y=y, mode='lines', line=dict(width=0.05))
fig.update_layout(title='Bootstrapped SLR Models')
fig.show()
From the plot above, can you guess why the emoji for this section is spaghetti 🍝?
We were originally just interested in conducting inference on the slope, $\hat{\theta}_1$. So, let's visualize the bootstrap distribution of the synthetic $\hat{\theta}_1$ estimates:
Note that we could have done the same for the synthetic $\hat{\theta}_0$ estimates, too!
# Grab the slopes from the list of (intercept, slope) tuples
bs_theta1s = [theta[1] for theta in bs_thetas]
fig = px.histogram(pd.Series(bs_theta1s),
title='Bootstrap Distribution of the Slope',
width=800, histnorm='probability',
barmode="overlay", opacity=0.8)
fig.show()
Computing a Bootstrap CI¶
We can compute the CI using the percentiles of the distribution of 1,000 synthetic estimates of $\hat{\theta}_1$:
def bootstrap_ci(bootstrap_estimates, confidence_level=95):
    """
    Returns the confidence interval for the synthetic estimates by grabbing
    the percentiles corresponding to `confidence_level`% of the samples
    """
    lower_percentile = (100 - confidence_level) / 2
    upper_percentile = 100 - lower_percentile
    # np.percentile grabs the given percentiles of an array
    return np.percentile(bootstrap_estimates, [lower_percentile, upper_percentile])
print(bootstrap_ci(bs_theta1s))
[-0.00858358 -0.00549309]
Visualizing our resulting 95% confidence interval:
ci_line_style = dict(color="orange", width=2, dash="dash")
fig.add_vline(x=bootstrap_ci(bs_theta1s)[0], line=ci_line_style)
fig.add_vline(x=bootstrap_ci(bs_theta1s)[1], line=ci_line_style)
Given the above, we can say that "We are 95% confident that the true $\theta_1$ falls between -0.0086 and -0.0055".
- The true $\theta_1$ would be obtained if we fit our SLR model to the entire population.
Very often, we want to test whether a regression coefficient is significantly different from 0.
0 is not contained in the interval above, so we can reject the null hypothesis that $\theta_1=0$ at a 5% significance level.
In other words, it is highly unlikely that we would observe our actual sample data in a world where $\theta_1=0$, simply due to randomness in the sample.
Instructor Note: Return to Lecture.
csv_file = 'data/Full24hrdataset.csv.gz'
usecols = ['Date', 'ID', 'region', 'PM25FM', 'PM25cf1', 'TempC', 'RH', 'Dewpoint']
full_df = pd.read_csv(csv_file, usecols=usecols, parse_dates=['Date']).dropna()
full_df.columns = ['date', 'id', 'region', 'pm25aqs', 'pm25pa', 'temp', 'rh', 'dew']
full_df = full_df[(full_df['pm25aqs'] < 50)]
# drop dates with issues in the data
bad_dates = pd.to_datetime(['2019-08-21', '2019-08-22', '2019-09-24'])
# GA is the DataFrame that contains air quality measurements
GA = full_df[(full_df['id'] == 'GA1') & (~full_df['date'].isin(bad_dates))]
GA = GA.sort_values("pm25aqs")
display(full_df["region"].value_counts())
display(GA.head())
print("Number of Rows:", GA.shape[0])
region
North                5592
West                 3750
Central Southwest    1502
Southeast            1032
Alaska                365
Name: count, dtype: int64
|      | date       | id  | region    | pm25aqs  | pm25pa    | temp      | rh        | dew       |
|------|------------|-----|-----------|----------|-----------|-----------|-----------|-----------|
| 5416 | 2019-10-31 | GA1 | Southeast | 3.100000 | 7.638554  | 19.214186 | 70.443672 | 13.674061 |
| 5401 | 2019-10-09 | GA1 | Southeast | 4.200000 | 10.059924 | 24.621388 | 57.696801 | 15.708347 |
| 5407 | 2019-10-17 | GA1 | Southeast | 4.200000 | 6.389826  | 16.641975 | 49.377778 | 5.921212  |
| 5411 | 2019-10-23 | GA1 | Southeast | 4.300000 | 4.544160  | 16.963735 | 50.861111 | 6.650425  |
| 5325 | 2019-10-23 | GA1 | Southeast | 4.304167 | 4.544160  | 16.963735 | 50.861111 | 6.650425  |
Number of Rows: 176
Inverse Regression¶
After we build the model that adjusts the PurpleAir measurements using AQS, we can flip the model around and use it to predict the true air quality from future PurpleAir measurements in places where no nearby AQS instrument is available. This is a calibration scenario: since the AQS measurements are close to the truth, we fit the more variable PurpleAir measurements to them, and then use the resulting calibration curve to correct future PurpleAir measurements. This two-step process is encapsulated in the simple linear model and its flipped form below.
Inverse regression:
First, we fit a line to predict a PA measurement from the ground truth, as recorded by an AQS instrument:
$$ \text{PA} \approx \theta_0 + \theta_1\text{AQS} $$
Next, we invert the line (i.e., we don't fit another model!) so we can use a PA measurement to predict the true air quality in places where AQS sensors are not available:
$$ \text{True Air Quality} \approx -\theta_0/\theta_1 + 1/\theta_1 \text{PA} $$
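To see where the flipped form comes from, solve the fitted line for AQS; this is a one-line rearrangement, not a new fit:

$$ \text{PA} \approx \theta_0 + \theta_1 \text{AQS} \quad \Longrightarrow \quad \text{AQS} \approx \frac{\text{PA} - \theta_0}{\theta_1} = -\frac{\theta_0}{\theta_1} + \frac{1}{\theta_1}\text{PA} $$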
Why perform this “inverse regression”?
- Intuitively, AQS measurements are “true” and have no error.
- A linear model assumes that the inputs are fixed and known, not random. We treat the PA measurements as noisy and random, and the AQS measurements as fixed and known (i.e., accurate!).
- Algebraically identical, but statistically different.
# Fit an SLR predicting purple air measurements from AQS measurements
model = lm.LinearRegression().fit(GA[['pm25aqs']], GA['pm25pa'])
theta_0, theta_1 = model.intercept_, model.coef_[0]
# pm25 is a measure of air quality. pm stands for "particulate matter".
fig = px.scatter(GA, x='pm25aqs', y='pm25pa', width=800)
# This code adds the SLR fit to the scatterplot. Don't worry about the details of this code.
xtest = pd.DataFrame({"pm25aqs": np.array([GA['pm25aqs'].min(), GA['pm25aqs'].max()])})
fig.add_scatter(x=xtest["pm25aqs"], y=model.predict(xtest[["pm25aqs"]]), mode='lines',
name="Least Squares Fit")
Invert the model by isolating the AQS term, and then rename the AQS term to the "true air quality estimate":
print(f"True Air Quality Estimate = {-theta_0/theta_1:.2} + {1/theta_1:.2}PA")
True Air Quality Estimate = 1.6 + 0.46PA
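As a small illustrative helper (hypothetical, not part of the original notebook), the inverted line can be wrapped in a function for correcting new PA readings:

# Hypothetical helper: apply the inverted calibration line to a new PA reading
def calibrate_pa(pa_reading):
    return -theta_0 / theta_1 + pa_reading / theta_1

print(calibrate_pa(10.0))  # roughly 1.6 + 0.46 * 10 ≈ 6.2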
# This code adds the inverse fit to the scatterplot.
# It may look like we are fitting a new model, but we are just creating a model
# object to make it easier to plot the inverse fit.
# Don't worry about the details of this code.
fig = px.scatter(GA, y='pm25aqs', x='pm25pa', width=800)
model2 = lm.LinearRegression().fit(GA[['pm25pa']], GA['pm25aqs'])
xtest["pm25pa"] = np.array([GA['pm25pa'].min(), GA['pm25pa'].max()])
fig.add_scatter(x=xtest["pm25pa"], y=xtest["pm25pa"] * 1/theta_1 - theta_0/theta_1, mode='lines',
name="Inverse Fit")
fig.add_scatter(x=xtest["pm25pa"], y=model2.predict(xtest[['pm25pa']]), mode='lines',
name="Least Squares Fit")
The Barkjohn et al. model with Relative Humidity¶
Karoline Barkjohn, Brett Gantt, and Andrea Clements from the US Environmental Protection Agency developed a model to improve the PurpleAir measurements using the AQS sensor measurements. Barkjohn and group's work was so successful that, as of this writing, the official US government maps, like the AirNow Fire and Smoke map, include both AQS and PurpleAir sensors, and apply Barkjohn's correction to the PurpleAir data. $$ \begin{aligned} \text{PA} \approx \theta_0 + \theta_1 \text{AQS} + \theta_2 \text{RH} \end{aligned} $$
The model that Barkjohn settled on incorporates the relative humidity.
- The code below fits the Barkjohn model to the data and inverts it, exactly as we did above.
model_h = lm.LinearRegression().fit(GA[['pm25aqs', 'rh']], GA['pm25pa'])
[theta_1, theta_2], theta_0 = model_h.coef_, model_h.intercept_
print(f"True Air Quality Estimate = {-theta_0/theta_1:1.2} + {1/theta_1:.2}PA + {-theta_2/theta_1:.2}RH")
True Air Quality Estimate = 7.0 + 0.44PA + -0.092RH
For comparison, recall that the original AQS-only fit inverted to 1.6 + 0.46PA. The print below reuses the overwritten `theta_0` and `theta_1`, so it shows the humidity model's intercept and PA coefficient:
print(f"True Air Quality Estimate = {-theta_0/theta_1:.2} + {1/theta_1:.2}PA")
True Air Quality Estimate = 7.0 + 0.44PA
Note that the coefficients on `PA` are similar, but the intercepts are quite different due to the inclusion of relative humidity.

- The new intercept represents the predicted air quality when `PA` and `RH` are both 0, which is a different interpretation than that of the original intercept.
Compared to the simple linear model that only incorporated AQS, the Barkjohn et al. model with relative humidity achieves lower error. Good for prediction!
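To back up that claim, here is a quick check (an addition to these notes, not part of the original notebook) comparing the in-sample mean squared error of the two fits on the GA data:

# Compare in-sample MSE of the AQS-only model and the AQS + RH model
simple_fit = lm.LinearRegression().fit(GA[['pm25aqs']], GA['pm25pa'])
rh_fit = lm.LinearRegression().fit(GA[['pm25aqs', 'rh']], GA['pm25pa'])
mse_simple = np.mean((GA['pm25pa'] - simple_fit.predict(GA[['pm25aqs']])) ** 2)
mse_rh = np.mean((GA['pm25pa'] - rh_fit.predict(GA[['pm25aqs', 'rh']])) ** 2)
print("MSE, AQS only: ", mse_simple)
print("MSE, AQS + RH: ", mse_rh)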
From the Barkjohn et al. model, the AQS coefficient $\hat{\theta}_1$:
theta_1
2.2540167939150546
The Relative Humidity coefficient $\hat{\theta}_2$ is pretty close to zero:
theta_2
0.20630108775555359
Is incorporating humidity in the model really needed?
Null hypothesis: The null hypothesis is $\theta_2 = 0$; that is, the null model is the simpler model:
$$ \begin{aligned} \text{PA} \approx \theta_0 + \theta_1 \text{AQS} \end{aligned} $$
Repeat 1,000 times to get an approximation to the bootstrap sampling distribution of the bootstrap statistic (the fitted humidity coefficient $\hat{\theta}_2$):
def theta2_estimate(sample):
    # Fit the humidity model and return the fitted RH coefficient
    model = lm.LinearRegression().fit(sample[['pm25aqs', 'rh']], sample['pm25pa'])
    return model.coef_[1]
bs_theta2 = bootstrap(GA, theta2_estimate, 1000)
Bootstrapping: 0%| | 0/1000 [00:00<?, ?it/s]
fig = px.histogram(x=bs_theta2,
labels=dict(x='Bootstrapped Humidity Coefficient'),
histnorm='probability',
width=800)
fig.add_vline(0)
fig.add_vline(x=bootstrap_ci(bs_theta2)[0], line=ci_line_style)
fig.add_vline(x=bootstrap_ci(bs_theta2)[1], line=ci_line_style)
(We know that the center will be close to the original coefficient estimated from the sample, 0.21.)
By design, the center of the bootstrap sampling distribution will be near $\hat{\theta}$ because the bootstrap population consists of the observed data. So, rather than compute the chance of a value at least as large as the observed statistic, we find the chance of a value at least as small as 0.
The hypothesized value of 0 is far from the sampling distribution:
len([elem for elem in bs_theta2 if elem < 0.0])
0
None of the 1,000 simulated regression coefficients are as small as the hypothesized coefficient of 0. Statistical logic therefore leads us to reject the null hypothesis that the true association between humidity and air quality is 0.
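Equivalently (a small addition to these notes), we can phrase this count as an approximate one-sided bootstrap p-value:

# Fraction of synthetic coefficients at or below the hypothesized value of 0
p_value = np.mean(np.array(bs_theta2) <= 0)
print("Approximate one-sided p-value:", p_value)  # 0.0 here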
The Data¶
The Snowy Plover is a tiny bird that lives on the coast in parts of California and elsewhere. It is so small that it is vulnerable to many predators and to people and dogs that don't look where they are stepping when they go to the beach. It is considered endangered in many parts of the US.
The data are about the eggs and newly-hatched chicks of the Snowy Plover. Here's a parent bird and some eggs.
The data were collected at the Point Reyes National Seashore by a former student at Berkeley. The goal was to see how the size of an egg could be used to predict the weight of the resulting chick. The bigger the newly-hatched chick, the more likely it is to survive.
Each row of the data frame below corresponds to one Snowy Plover egg and the resulting chick. Note how tiny the bird is:
- Egg Length and Egg Breadth (widest diameter) are measured in millimeters
- Egg Weight and Bird Weight are measured in grams; for comparison, a standard paper clip weighs about one gram
eggs = pd.read_csv('data/snowy_plover.csv.gz')
eggs.head()
|   | egg_weight | egg_length | egg_breadth | bird_weight |
|---|------------|------------|-------------|-------------|
| 0 | 7.4        | 28.80      | 21.84       | 5.2         |
| 1 | 7.7        | 29.04      | 22.45       | 5.4         |
| 2 | 7.9        | 29.36      | 22.48       | 5.6         |
| 3 | 7.5        | 30.10      | 21.71       | 5.3         |
| 4 | 8.3        | 30.17      | 22.75       | 5.9         |
eggs.shape
(44, 4)
For a particular egg, $x$ is the vector of length, breadth, and weight. The proposed model is
$$ \widehat{\text{Newborn weight}} = \hat{\theta}_0 + \hat{\theta}_1 \text{egg\_weight} + \hat{\theta}_2 \text{egg\_length} + \hat{\theta}_3 \text{egg\_breadth} $$
Let's fit this model:
y = eggs["bird_weight"]
X = eggs[["egg_weight", "egg_length", "egg_breadth"]]
model = lm.LinearRegression(fit_intercept=True).fit(X, y)
display(pd.DataFrame(
[model.intercept_] + list(model.coef_),
columns=['theta_hat'],
index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']))
# Note: this is the mean squared error (MSE), so we name it accordingly
all_features_mse = np.mean((y - model.predict(X)) ** 2)
print("MSE", all_features_mse)
|             | theta_hat |
|-------------|-----------|
| intercept   | -4.605670 |
| egg_weight  | 0.431229  |
| egg_length  | 0.066570  |
| egg_breadth | 0.215914  |

MSE 0.045470853802757616
Let's try bootstrapping the sample to obtain 95% confidence intervals for all the parameters.
# This function returns a list of the coefficients of the fitted model
def all_thetas(sample):
    model = lm.LinearRegression().fit(
        sample[["egg_weight", "egg_length", "egg_breadth"]],
        sample["bird_weight"])
    return [model.intercept_] + model.coef_.tolist()
We can re-use our bootstrapping function from before to get synthetic estimates of all of the coefficients:
bs_thetas = pd.DataFrame(
bootstrap(eggs, all_thetas, 1000),
columns=['intercept', 'egg_weight', 'egg_length', 'egg_breadth'])
bs_thetas
Bootstrapping: 0%| | 0/1000 [00:00<?, ?it/s]
|     | intercept  | egg_weight | egg_length | egg_breadth |
|-----|------------|------------|------------|-------------|
| 0   | -8.103400  | 0.216451   | 0.098032   | 0.405838    |
| 1   | -11.074321 | 0.077284   | 0.148533   | 0.521490    |
| 2   | -2.718266  | 0.608443   | 0.035361   | 0.111554    |
| 3   | -1.080113  | 0.597081   | 0.026363   | 0.053714    |
| 4   | -2.124909  | 0.663734   | 0.002459   | 0.107473    |
| ... | ...        | ...        | ...        | ...         |
| 995 | 0.034319   | 0.965715   | -0.062090  | -0.009347   |
| 996 | -6.894141  | 0.288500   | 0.066708   | 0.368040    |
| 997 | -0.395814  | 0.695542   | -0.032516  | 0.068442    |
| 998 | -7.291577  | 0.499068   | 0.040360   | 0.344834    |
| 999 | -4.958756  | 0.428270   | 0.047167   | 0.258591    |

1000 rows × 4 columns
Computing the confidence intervals using the 1,000 synthetic estimates:
cis = (bs_thetas
       .apply(bootstrap_ci).T
       .rename(columns={0: 'lower', 1: 'upper'}))
cis
|             | lower      | upper    |
|-------------|------------|----------|
| intercept   | -14.740386 | 4.844829 |
| egg_weight  | -0.219311  | 1.082662 |
| egg_length  | -0.092170  | 0.208965 |
| egg_breadth | -0.230592  | 0.725745 |
def visualize_coeffs(bs_thetas, rows, cols):
    cis = (bs_thetas
           .apply(bootstrap_ci).T
           .rename(columns={0: 'lower', 1: 'upper'}))
    display(cis)
    from plotly.subplots import make_subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=cis.index)
    for i, coeff_name in enumerate(cis.index):
        c = (i % cols) + 1
        r = (i // cols) + 1
        fig.add_histogram(x=bs_thetas[coeff_name], name=coeff_name,
                          row=r, col=c, histnorm='probability')
        fig.add_vline(x=0, row=r, col=c)
        fig.add_vline(x=cis.loc[coeff_name, 'lower'], line=ci_line_style,
                      row=r, col=c)
        fig.add_vline(x=cis.loc[coeff_name, 'upper'], line=ci_line_style,
                      row=r, col=c)
    return fig
visualize_coeffs(bs_thetas, 2, 2)
|             | lower      | upper    |
|-------------|------------|----------|
| intercept   | -14.740386 | 4.844829 |
| egg_weight  | -0.219311  | 1.082662 |
| egg_length  | -0.092170  | 0.208965 |
| egg_breadth | -0.230592  | 0.725745 |
Because all the confidence intervals contain 0, we cannot reject the null hypothesis that the true coefficient on each term is 0.
Does this mean that all the parameters are statistically indistinguishable from 0?
Inspecting the Relationship between Features¶
To see what's going on, we'll make a scatter plot matrix for the data.
px.scatter_matrix(eggs, width=600, height=600)
This shows that `bird_weight` is highly correlated with all the other variables (the bottom row), which means fitting a linear model is a good idea.

But we also see that `egg_weight` is highly correlated with all the variables (the top row).
We saw in lecture that this could result in high-variance coefficient estimates and harm inference.
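One quick way to quantify that collinearity (an addition to these notes, not part of the original notebook) is to regress `egg_weight` on the other two features; a high $R^2$ means it is nearly a linear combination of them:

# How well do the other two features predict egg_weight?
collin_model = lm.LinearRegression().fit(
    eggs[["egg_length", "egg_breadth"]], eggs["egg_weight"])
r2 = collin_model.score(eggs[["egg_length", "egg_breadth"]], eggs["egg_weight"])
print("R^2 of egg_weight on egg_length and egg_breadth:", r2)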
Here are the numeric correlations:
px.imshow(eggs.corr().round(2), text_auto=True, width=600)
Changing Our Modeling Features¶
Based on the correlations above, `egg_weight` looks like the strongest predictor of newborn chick weight.

An SLR model with just `egg_weight` performs almost as well as the model that uses all three variables, and the confidence interval for $\hat{\theta}_1$ no longer contains zero.
- Note that the model with additional variables has a slightly lower MSE! In-sample MSE will never go up when you add features.
y = eggs["bird_weight"]
X = eggs[["egg_weight"]]
model = lm.LinearRegression(fit_intercept=True).fit(X, y)
display(pd.DataFrame([model.intercept_] + list(model.coef_),
columns=['theta_hat'],
index=['intercept', 'egg_weight']))
print("All Features MSE: ", all_features_mse)
print("Simpler model MSE: ", np.mean((y - model.predict(X)) ** 2))
|            | theta_hat |
|------------|-----------|
| intercept  | -0.058272 |
| egg_weight | 0.718515  |

All Features MSE:  0.045470853802757616
Simpler model MSE:  0.046493941375556846
# Return a list of the intercept and slope of the SLR model using just egg_weight
def egg_weight_coeff(sample):
    model = lm.LinearRegression().fit(
        sample[["egg_weight"]],
        sample["bird_weight"])
    return [model.intercept_] + model.coef_.tolist()
We can re-use our `bootstrap` function from earlier to generate synthetic estimates of the coefficient on `egg_weight`:
bs_thetas_egg_weight = pd.DataFrame(
bootstrap(eggs, egg_weight_coeff, 1000),
columns=['intercept', 'egg_weight'])
bs_thetas_egg_weight
Bootstrapping: 0%| | 0/1000 [00:00<?, ?it/s]
|     | intercept | egg_weight |
|-----|-----------|------------|
| 0   | -0.108481 | 0.721276   |
| 1   | 0.178621  | 0.687295   |
| 2   | 0.131337  | 0.698104   |
| 3   | -0.022869 | 0.712006   |
| 4   | 0.329215  | 0.669357   |
| ... | ...       | ...        |
| 995 | -0.776623 | 0.803877   |
| 996 | 0.130192  | 0.692799   |
| 997 | -0.540054 | 0.782740   |
| 998 | -0.498966 | 0.767973   |
| 999 | -0.221662 | 0.735186   |

1000 rows × 2 columns
visualize_coeffs(bs_thetas_egg_weight, 1, 2)
|            | lower     | upper    |
|------------|-----------|----------|
| intercept  | -0.783541 | 0.917632 |
| egg_weight | 0.602692  | 0.809269 |
Notice how much tighter the confidence interval is for `egg_weight` above, relative to the regression where we included all three collinear terms!
As this example shows, checking for collinearity is important for inference (and less so for prediction).
When we fit a model on highly correlated variables, the coefficient estimates can have high variance, yielding wide confidence intervals and preventing us from making meaningful statistical conclusions about the relationships between features and outputs.
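As a final check (an addition to these notes), we can quantify "tighter" directly by comparing the bootstrap standard deviations of the `egg_weight` coefficient under the two models:

# Bootstrap SD of the egg_weight coefficient: collinear model vs. SLR
print("SD, all three features:", bs_thetas['egg_weight'].std())
print("SD, egg_weight only:   ", bs_thetas_egg_weight['egg_weight'].std())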