import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.linear_model as lm
import seaborn as sns
from tqdm.notebook import tqdm
Simple Bootstrap Example¶
Here we work through a simple example of the bootstap when estimating the relationship between miles per gallon and the weight of a vehicle.
Suppose we collected a sample of 20 cars from a population. For the purposes of this demo we will assume that the seaborn dataset is the population. The following is a visualization of our sample:
np.random.seed(42)
sample_size = 100
mpg = sns.load_dataset('mpg')
print("Full Data Size:", len(mpg))
mpg_sample = mpg.sample(sample_size)
print("Sample Size:", len(mpg_sample))
px.scatter(mpg_sample, x='weight', y='mpg', trendline='ols', width=800)
Full Data Size: 398 Sample Size: 100
Fitting a linear model we get an estimate of the slope:
model = lm.LinearRegression().fit(mpg_sample[['weight']], mpg_sample['mpg'])
model.coef_
array([-0.00730597])
Bootstrap Implementation¶
Now let's use bootstrap to estimate the distribution of that coefficient. Here will will construct a bootstrap function that takes an estimator function and uses that function to construct many bootstrap estimates.
def estimator(sample):
model = lm.LinearRegression().fit(sample[['weight']], sample['mpg'])
return model.coef_[0]
This code uses df.sample (link) to generate a bootstrap sample of the same size of the original sample.
def bootstrap(sample, statistic, num_repetitions):
"""
Returns the statistic computed on a num_repetitions
bootstrap samples from sample.
"""
stats = []
for i in tqdm(np.arange(num_repetitions), "Bootstrapping"):
# Step 1: Sample the Sample
bootstrap_sample = sample.sample(frac=1, replace=True)
# Step 2: compute statistics on the sample of the sample
bootstrap_stat = statistic(bootstrap_sample)
# Accumulate the statistics
stats.append(bootstrap_stat)
return stats
Constructing MANY bootstrap slope estimates.
bs_thetas = bootstrap(mpg_sample, estimator, 10000)
Bootstrapping: 0%| | 0/10000 [00:00<?, ?it/s]
We can visualize the bootstrap distribution of the slope estimates.
fig = px.histogram(pd.Series(bs_thetas, name="Bootstrap Distribution"),
title='Bootstrap Distribution of the Slope',
width=800, histnorm='probability',
barmode="overlay", opacity=0.8)
fig.add_vline(0)
Computing a Bootstrap CI¶
We can compute the CI using the percentiles of the empirical distribution:
def bootstrap_ci(bootstrap_samples, confidence_level=95):
"""
Returns the confidence interval for the bootstrap samples.
"""
lower_percentile = (100 - confidence_level) / 2
upper_percentile = 100 - lower_percentile
# using numpy percentile function to compute ci
return np.percentile(bootstrap_samples, [lower_percentile, upper_percentile])
bootstrap_ci(bs_thetas)
array([-0.00814752, -0.00653232])
ci_line_style = dict(color="orange", width=2, dash="dash")
fig.add_vline(x=bootstrap_ci(bs_thetas)[0], line=ci_line_style)
fig.add_vline(x=bootstrap_ci(bs_thetas)[1], line=ci_line_style)
Comparing to the Population CIs¶
In practice you don't have access to the population but in this specific example we had taken a sample from a larger dataset that we can pretend is the population. Let's compare to resampling from the larger dataset:
mpg_pop = sns.load_dataset('mpg')
theta_est = [estimator(mpg_pop.sample(sample_size)) for i in tqdm(range(10000))]
0%| | 0/10000 [00:00<?, ?it/s]
print("Actual CI", bootstrap_ci(theta_est))
fig.add_histogram(x=theta_est, name='Population Distribution', histnorm='probability', opacity=0.7)
fig.add_vline(x=bootstrap_ci(theta_est)[0], line=dict(color="red", width=2, dash="dash"))
fig.add_vline(x=bootstrap_ci(theta_est)[1], line=dict(color="red", width=2, dash="dash"))
Actual CI [-0.00852071 -0.00691023]
Visualizing the two distributions:
thetas = pd.DataFrame({"bs_thetas": bs_thetas, "thetas": theta_est})
px.histogram(thetas.melt(), x='value', facet_row='variable',
title='Distribution of the Slope', width=800)