import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.linear_model as lm
import seaborn as sns

# Seed NumPy's global RNG so the random draws below are repeatable.
np.random.seed(42)
# Draw 20 rows from seaborn's `mpg` dataset to act as the observed sample.
mpg_sample = sns.load_dataset('mpg').sample(20)
# Scatter of mpg against weight with an OLS trendline overlaid.
px.scatter(mpg_sample, x='weight', y='mpg', trendline='ols', width=800)

# Fit mpg ~ weight on the sample; `coef_` holds the single fitted slope.
model = lm.LinearRegression().fit(mpg_sample[['weight']], mpg_sample['mpg'])
model.coef_

array([-0.00692182])

def estimator(sample):
    """Return the least-squares slope of `mpg` on `weight` for `sample`.

    `sample` must provide `weight` and `mpg` columns. A fresh
    LinearRegression is fitted on every call and its only coefficient
    is returned.
    """
    weight_feature = sample[['weight']]
    fitted = lm.LinearRegression()
    fitted.fit(weight_feature, sample['mpg'])
    return fitted.coef_[0]

def bootstrap(sample, statistic, num_repetitions):
    """
    Compute `statistic` on `num_repetitions` bootstrap resamples of `sample`.

    Each resample draws as many rows as `sample` has, with replacement
    (`sample.sample(frac=1, replace=True)`). The results are returned as
    a list, one entry per resample, in the order they were drawn.
    """
    return [
        # Resample the sample, then evaluate the statistic on that resample.
        statistic(sample.sample(frac=1, replace=True))
        for _ in np.arange(num_repetitions)
    ]

# 10,000 bootstrap estimates of the mpg ~ weight slope from the 20-row sample.
bs_thetas = bootstrap(mpg_sample, estimator, 10000)

# Histogram of the bootstrapped slopes.
px.histogram(bs_thetas, title='Bootstrap Distribution of the Slope', 
             width=800)

def bootstrap_ci(bootstrap_samples, confidence_level=95):
    """
    Return the percentile interval `[lower, upper]` of `bootstrap_samples`.

    `confidence_level` is given in percent. The interval leaves
    `(100 - confidence_level) / 2` percent in each tail, so the default
    of 95 yields the 2.5th and 97.5th percentiles.
    """
    tail = (100 - confidence_level) / 2
    bounds = [tail, 100 - tail]
    return np.percentile(bootstrap_samples, bounds)

# 95% percentile interval for the slope, taken from the bootstrap estimates.
bootstrap_ci(bs_thetas)

array([-0.00861209, -0.0055226 ])

# Treat the full mpg dataset as the population and re-estimate the slope on
# 10,000 independent samples of 20 rows each.
mpg_pop = sns.load_dataset('mpg')
theta_est = [estimator(mpg_pop.sample(20)) for i in range(10000)]

# Same percentile interval, now computed from the population-resampled slopes.
bootstrap_ci(theta_est)

array([-0.01017473, -0.00574973])

# Put both sets of slope estimates side by side; melt() reshapes them to long
# form so each set gets its own facet row in the histogram.
thetas = pd.DataFrame({"bs_thetas": bs_thetas, "thetas": theta_est})
px.histogram(thetas.melt(), x='value', facet_row='variable', 
             title='Distribution of the Slope', width=800)

# Load the PurpleAir/AQS readings, keeping only the columns used below and
# dropping any row with a missing value.
csv_file = 'data/Full24hrdataset.csv'
usecols = ['Date', 'ID', 'region', 'PM25FM', 'PM25cf1', 'TempC', 'RH', 'Dewpoint']
full_df = pd.read_csv(csv_file, usecols=usecols, parse_dates=['Date']).dropna()
# Rename by source column name rather than by position: read_csv ignores the
# order of `usecols` and returns columns in file order, so a positional
# `full_df.columns = [...]` could silently mislabel columns.
full_df = full_df.rename(columns=dict(zip(
    usecols,
    ['date', 'id', 'region', 'pm25aqs', 'pm25pa', 'temp', 'rh', 'dew'])))
# Keep only rows whose AQS reading is below 50.
full_df = full_df[full_df['pm25aqs'] < 50]
# drop dates with issues in the data
bad_dates = ['2019-08-21', '2019-08-22', '2019-09-24']
# `date` is a datetime column; convert the strings explicitly instead of
# relying on isin() casting them (that implicit cast is deprecated in pandas).
GA = full_df[(full_df['id'] == 'GA1')
             & (~full_df['date'].isin(pd.to_datetime(bad_dates)))]
# Sort by the AQS reading so later line plots against pm25aqs run left to right.
GA = GA.sort_values("pm25aqs")

AQS, PA = GA[['pm25aqs']], GA['pm25pa']

# Fit PA = theta_0 + theta_1 * AQS on the GA1 rows.
model = lm.LinearRegression().fit(AQS, PA)
theta_0, theta_1 = model.intercept_, model.coef_[0]

# Scatter of PurpleAir against AQS readings with the fitted line drawn
# between the smallest and largest observed AQS values.
fig = px.scatter(GA, x='pm25aqs', y='pm25pa', width=800)
xtest = np.array([GA['pm25aqs'].min(), GA['pm25aqs'].max()])
# `model` was fitted on a DataFrame with the column `pm25aqs`; predict on a
# DataFrame with the same column name so scikit-learn does not emit its
# "X does not have valid feature names" warning.
fig.add_scatter(x=xtest, y=model.predict(pd.DataFrame({'pm25aqs': xtest})),
                mode='lines', name="Least Squares Fit")

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

# Invert the fit PA = theta_0 + theta_1 * AQS to express AQS in terms of PA:
# AQS = -theta_0/theta_1 + (1/theta_1) * PA.
print(f"True Air Quality Estimate = {-theta_0/theta_1:.2} + {1/theta_1:.2}PA")

True Air Quality Estimate = 1.6 + 0.46PA

# Regress in the opposite direction (AQS on PA) for comparison with the
# inverted PA-on-AQS line.
model2 = lm.LinearRegression().fit(GA[['pm25pa']], GA['pm25aqs'])

fig = px.scatter(GA, y='pm25aqs', x='pm25pa', width=800)
xtest = np.array([GA['pm25pa'].min(), GA['pm25pa'].max()])
# "Inverse Fit": the PA-on-AQS line solved for AQS, using theta_0 and theta_1
# as currently bound.
fig.add_scatter(x=xtest, y=xtest / theta_1 - theta_0 / theta_1, mode='lines',
                name="Inverse Fit")
# `model2` was fitted on a DataFrame with the column `pm25pa`; predict on a
# DataFrame with the same column name so scikit-learn does not emit its
# "X does not have valid feature names" warning.
fig.add_scatter(x=xtest, y=model2.predict(pd.DataFrame({'pm25pa': xtest})),
                mode='lines', name="Least Squares Fit")

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

# Fit PA = theta_0 + theta_1 * AQS + theta_2 * RH on the GA1 rows.
AQS_RH, PA = GA[['pm25aqs', 'rh']], GA['pm25pa']
model_h = lm.LinearRegression().fit(AQS_RH, PA)
# NOTE: this rebinds theta_0 and theta_1, so any code run afterwards sees the
# humidity model's values rather than the single-feature fit's.
[theta_1, theta_2], theta_0 = model_h.coef_, model_h.intercept_
    
# Solve the fitted equation for AQS:
# AQS = -theta_0/theta_1 + (1/theta_1) * PA - (theta_2/theta_1) * RH.
print(f"True Air Quality Estimate = {-theta_0/theta_1:1.2} + {1/theta_1:.2}PA + {-theta_2/theta_1:.2}RH")

True Air Quality Estimate = 7.0 + 0.44PA + -0.092RH

# Compare the single-feature fit with the humidity-adjusted fit on one plot.
fig = px.scatter(GA, x='pm25aqs', y='pm25pa', width=800)
xtest = np.array([GA['pm25aqs'].min(), GA['pm25aqs'].max()])
# `model` was fitted on a DataFrame with the column `pm25aqs`; predict on a
# DataFrame with the same column name so scikit-learn does not emit its
# "X does not have valid feature names" warning.
fig.add_scatter(x=xtest, y=model.predict(pd.DataFrame({'pm25aqs': xtest})),
                mode='lines', name="Least Squares Fit")
# Predictions of the humidity model at each observed (pm25aqs, rh) row,
# plotted against pm25aqs.
fig.add_scatter(x=GA["pm25aqs"], y=model_h.predict(AQS_RH), mode='lines+markers',
                marker_size=5, name="Least Squares Fit with RH")

/srv/conda/envs/notebook/lib/python3.11/site-packages/sklearn/base.py:439: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

# 3D scatter of the GA1 rows: AQS reading, relative humidity, PurpleAir reading.
fig = px.scatter_3d(GA, x='pm25aqs', y='rh', z='pm25pa', width=800, height=600)

# Both models are linear, so evaluating them at the corners of the observed
# (pm25aqs, rh) range is enough to draw each prediction surface.
grid_resolution = 2
(u,v) = np.meshgrid(
    np.linspace(GA["pm25aqs"].min(), GA["pm25aqs"].max(), grid_resolution),
    np.linspace(GA["rh"].min(), GA["rh"].max(), grid_resolution))
# Predictions of the humidity model and of the single-feature model on the grid;
# the single-feature model only receives the pm25aqs coordinate.
zs = model_h.predict(pd.DataFrame({"pm25aqs": u.flatten(), "rh": v.flatten()}))
zs_old = model.predict(pd.DataFrame({"pm25aqs": u.flatten()}))
# create the Surface
color1 = px.colors.qualitative.Plotly[3]
color2 = px.colors.qualitative.Plotly[4]
# A colorscale with the same colour at both ends renders each surface in one
# flat colour.
fig.add_surface(x=u, y=v, z= zs.reshape(u.shape), opacity=1, 
                colorscale=[[0, color1], [1,color1]],
                showscale=False, name="AQS + RH")
fig.add_surface(x=u, y=v, z= zs_old.reshape(u.shape), opacity=1, 
                colorscale=[[0, color2], [1,color2]],
                showscale=False, name="AQS")
# set the aspect ratio of the 3d plot
fig.update_scenes(aspectmode='cube')

# Fitted coefficient on pm25aqs from the humidity model (first entry of model_h.coef_).
theta_1

2.2540167939150546

# Fitted coefficient on rh from the humidity model (second entry of model_h.coef_).
theta_2

0.2063010877555531

def theta2_estimate(sample):
    """Return the fitted `rh` coefficient for `sample`.

    Regresses `pm25pa` on the columns `pm25aqs` and `rh` (in that order)
    with a fresh LinearRegression and returns the second coefficient.
    """
    predictors = sample[['pm25aqs', 'rh']]
    fitted = lm.LinearRegression()
    fitted.fit(predictors, sample['pm25pa'])
    # Index 1 is the second predictor passed to fit, i.e. `rh`.
    return fitted.coef_[1]

# 10,000 bootstrap estimates of the humidity coefficient from the GA1 rows.
bs_theta2 = bootstrap(GA, theta2_estimate, 10000)

# NOTE(review): plotly.express is already imported as `px` at the top of the
# file; this repeat import is redundant but harmless.
import plotly.express as px
px.histogram(x=bs_theta2,
            labels=dict(x='Bootstrapped Humidity Coefficient'),
            width=800)

# Count how many bootstrap estimates of the humidity coefficient are negative.
len([elem for elem in bs_theta2 if elem < 0.0])

0

# Load the snowy plover data and preview the first rows.
eggs = pd.read_csv('data/snowy_plover.csv')
eggs.head()

# (rows, columns) of the dataset.
eggs.shape

(44, 4)

# Predict bird weight from all three egg measurements.
y = eggs["bird_weight"]
X = eggs[["egg_weight", "egg_length", "egg_breadth"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# Show the fitted intercept and coefficients as a one-column table.
display(pd.DataFrame(
    [model.intercept_] + list(model.coef_),
    columns=['theta_hat'],
    index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']))

# RMSE is the square root of the mean squared residual; without the square
# root the printed value would be the MSE, not the RMSE.
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))

RMSE 0.04547085380275766

def all_thetas(sample):
    """Return `[intercept, egg_weight, egg_length, egg_breadth]` fitted on `sample`.

    Regresses `bird_weight` on the three egg measurements with a fresh
    LinearRegression and returns the intercept followed by the
    coefficients, in the order the features are listed.
    """
    egg_features = ["egg_weight", "egg_length", "egg_breadth"]
    fitted = lm.LinearRegression()
    fitted.fit(sample[egg_features], sample["bird_weight"])
    return [fitted.intercept_, *fitted.coef_.tolist()]

# 10,000 bootstrap fits of the three-feature model, one row per resample.
# NOTE: this rebinds `bs_thetas`, replacing the mpg slope estimates.
bs_thetas = pd.DataFrame(
    bootstrap(eggs, all_thetas, 10_000), 
    columns=['intercept', 'egg_weight', 'egg_length', 'egg_breadth'])
bs_thetas

# Apply bootstrap_ci to each parameter's column, then transpose so each
# parameter is a row with its lower and upper bound.
(bs_thetas.apply(bootstrap_ci)
    .T
    .rename(columns={0: 'lower', 1: 'upper'}))

# Pairwise scatter plots of every column in the dataset.
px.scatter_matrix(eggs, width=600, height=600)

# Heatmap of the pairwise correlations, rounded to two decimals.
px.imshow(eggs.corr().round(2), text_auto=True, width=600)

# Bird weight against egg weight alone, with an OLS trendline.
px.scatter(eggs, x='egg_weight', y='bird_weight', trendline='ols', width=800)

# Refit using egg weight as the only feature.
y = eggs["bird_weight"]
X = eggs[["egg_weight"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# Show the fitted intercept and slope as a one-column table.
display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight']))
# RMSE is the square root of the mean squared residual; without the square
# root the printed value would be the MSE, not the RMSE.
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))

RMSE 0.0464939413755568

def egg_weight_coeff(sample):
    """Return `[intercept, egg_weight]` fitted on `sample`.

    Regresses `bird_weight` on `egg_weight` alone with a fresh
    LinearRegression. Despite the name, the intercept is returned as
    well, ahead of the single coefficient.
    """
    fitted = lm.LinearRegression()
    fitted.fit(sample[["egg_weight"]], sample["bird_weight"])
    return [fitted.intercept_, *fitted.coef_.tolist()]

# 10,000 bootstrap fits of the egg-weight-only model, one row per resample.
bs_thetas_egg_weight = pd.DataFrame(
    bootstrap(eggs, egg_weight_coeff, 10_000), 
    columns=['intercept', 'egg_weight'])
bs_thetas_egg_weight

# Apply bootstrap_ci to each parameter's column, then transpose so each
# parameter is a row with its lower and upper bound.
(bs_thetas_egg_weight.apply(bootstrap_ci)
    .T
    .rename(columns={0: 'lower', 1: 'upper'}))

	egg_weight	egg_length	egg_breadth	bird_weight
0	7.4	28.80	21.84	5.2
1	7.7	29.04	22.45	5.4
2	7.9	29.36	22.48	5.6
3	7.5	30.10	21.71	5.3
4	8.3	30.17	22.75	5.9

	intercept	egg_weight	egg_length	egg_breadth
0	2.735836	0.933480	-0.085565	-0.082362
1	-7.963464	0.293276	0.098463	0.372370
2	-0.571565	0.439917	0.045989	0.065429
3	-5.420348	0.319199	0.071474	0.285354
4	-9.009859	0.084618	0.113050	0.474270
...	...	...	...	...
9995	-7.254253	0.413497	0.068339	0.336043
9996	-1.593437	0.638769	0.032167	0.053375
9997	-1.076720	0.714224	-0.012315	0.064064
9998	-7.512390	0.362088	0.092687	0.333736
9999	-5.156515	0.427053	0.041968	0.273362

	lower	upper
intercept	-15.474294	5.035731
egg_weight	-0.265432	1.093108
egg_length	-0.097623	0.211462
egg_breadth	-0.256738	0.760643

	intercept	egg_weight
0	0.190292	0.686176
1	-0.638902	0.785541
2	0.186698	0.688066
3	-0.125817	0.723262
4	0.285226	0.676833
...	...	...
9995	-0.049395	0.716097
9996	-0.024848	0.712183
9997	1.043560	0.584261
9998	-0.830942	0.815550
9999	-0.099581	0.725063

Lecture 19 – Data 100, Spring 2024

Simple Bootstrap Example

Bootstrap Implementation

Computing a Bootstrap CI

Comparing to the Population CIs

PurpleAir

Inverse Regression

The Barkjohn et al. model with Relative Humidity

Bootstrapping the regression coefficients for PurpleAir

The Snowy Plover

The Data

Inspecting the Relationship between Features

Changing Our Modeling Features

	theta_hat
intercept	-4.605670
egg_weight	0.431229
egg_length	0.066570
egg_breadth	0.215914

	lower	upper
intercept	-0.904254	0.937528
egg_weight	0.601164	0.820243