import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as lm

# big font helper
def adjust_fontsize(size=None):
    """Set matplotlib's default font sizes through ``plt.rc``.

    Parameters
    ----------
    size : optional
        When given, every text element configured below (default text, axes
        titles and labels, tick labels, legend, figure title) uses this one
        size. When omitted, the presets 8 (small), 10 (medium) and
        12 (bigger) are used.
    """
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    # Identity check: `!= None` goes through `__ne__`, which a caller-supplied
    # object may overload; `is not None` cannot be fooled.
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Global plot styling for the cells that follow.
plt.style.use('fivethirtyeight')
sns.set_context("talk")
# NOTE(review): set_theme() applies seaborn's own default context and style,
# which presumably overrides the two calls above -- confirm the intended order.
sns.set_theme()
#plt.style.use('default') # revert style to default mpl
adjust_fontsize(size=20)  # one uniform size (20) for every text element
%matplotlib inline

csv_file = 'data/Full24hrdataset.csv'
usecols = ['Date', 'ID', 'region', 'PM25FM', 'PM25cf1', 'TempC', 'RH', 'Dewpoint']
# Short names, listed in the same order as `usecols`.
short_names = ['date', 'id', 'region', 'pm25aqs', 'pm25pa', 'temp', 'rh', 'dew']
# Rename by header name rather than by position: read_csv returns the selected
# columns in file order, not in the order they appear in `usecols`, so a
# positional assignment to `.columns` could silently mislabel columns.
full_df = (pd.read_csv(csv_file, usecols=usecols, parse_dates=['Date'])
           .dropna()
           .rename(columns=dict(zip(usecols, short_names))))
# Keep only the rows whose 'pm25aqs' value is below 50.
full_df = full_df.loc[full_df['pm25aqs'] < 50]


# Dates to leave out of the GA1 subset.
bad_dates = ['2019-08-21', '2019-08-22', '2019-09-24']
is_ga1 = full_df['id'] == 'GA1'
on_bad_date = full_df['date'].isin(bad_dates)
# Rows with id 'GA1' that do not fall on one of the excluded dates.
GA = full_df.loc[is_ga1 & ~on_bad_date, :]

from sklearn.linear_model import LinearRegression

# Regress the 'pm25pa' column on the single feature 'pm25aqs'.
AQS = GA[['pm25aqs']]
PA = GA['pm25pa']

model = LinearRegression()
model.fit(AQS, PA)
theta_1 = model.coef_[0]
theta_0 = model.intercept_

# Invert the fitted line PA = theta_0 + theta_1 * AQS to express AQS in terms of PA.
print(f"True Air Quality Estimate = {-theta_0/theta_1:.2} + {1/theta_1:.2}PA")

True Air Quality Estimate = 1.6 + 0.46PA

from sklearn.metrics import mean_squared_error

# Predictions of the one-feature model on the data it was fitted to, and their
# mean squared error against the observed 'pm25pa' values.
preds_slr = model.predict(AQS)
mean_squared_error(PA, preds_slr)

4.7083124633807225

# Same regression with 'rh' added as a second feature.
AQS_RH = GA[['pm25aqs', 'rh']]
PA = GA['pm25pa']
model_h = LinearRegression()
model_h.fit(AQS_RH, PA)
theta_1, theta_2 = model_h.coef_
theta_0 = model_h.intercept_

# Invert PA = theta_0 + theta_1 * AQS + theta_2 * RH to express AQS in terms of PA and RH.
print(f"True Air Quality Estimate = {-theta_0/theta_1:1.2} + {1/theta_1:.2}PA + {-theta_2/theta_1:.2}RH")

True Air Quality Estimate = 7.0 + 0.44PA + -0.092RH

# Predictions of the two-feature model on the data it was fitted to, and their
# mean squared error against the observed 'pm25pa' values.
preds_humidity = model_h.predict(AQS_RH)
mean_squared_error(PA, preds_humidity)

3.2977168948380413

# Coefficient model_h fitted for its first feature column, 'pm25aqs'.
theta_1

2.2540167939150537

# Coefficient model_h fitted for its second feature column, 'rh'.
theta_2

0.20630108775555353

n = len(GA)           # n: size of our sample
def boot_stat(X, y, random_state=None):
    """Refit the regression on one bootstrap resample of the rows.

    Parameters
    ----------
    X : DataFrame
        Feature columns; must have at least two, because the coefficient of
        the second column is what gets returned.
    y : Series
        Response values, row-aligned with ``X``.
    random_state : optional
        Forwarded to ``randint.rvs``. The default ``None`` keeps the previous
        behaviour of drawing from the global random state.

    Returns
    -------
    The fitted coefficient of the second column of ``X``.
    """
    # Size the resample from the data actually passed in, not from a global.
    size = len(X)
    # randint's upper bound is exclusive: `high=size` (not size - 1) is needed
    # for the last row to be eligible for resampling.
    rows = randint.rvs(low=0, high=size, size=size, random_state=random_state)

    return LinearRegression().fit(X.iloc[rows, :], y.iloc[rows]).coef_[1]

from scipy.stats import randint


# Inputs for the bootstrap: row count, response column, and the two-column
# feature frame.
n = len(GA)
y = GA['pm25pa']
X = GA[['pm25aqs', 'rh']]

# A single bootstrap replicate of the 'rh' coefficient.
boot_stat(X, y)

0.2605392470192046

# NOTE(review): `rng` is created here but is not passed to boot_stat below, so
# this seed does not control the resampling -- confirm whether it should.
rng = np.random.default_rng(42)

# 10000 bootstrap replicates of the 'rh' coefficient.
boot_theta_hat = [boot_stat(X, y) for _ in range(10000)]

import plotly.express as px
# Histogram of the bootstrap replicates collected in boot_theta_hat.
px.histogram(x=boot_theta_hat, nbins=50,
            labels=dict(x='Bootstrapped Humidity Coefficient'),
            width=350, height=250)

# Number of bootstrap replicates in which the coefficient came out negative.
len([elem for elem in boot_theta_hat if elem < 0.0])

0

# Load the snowy plover table and preview its first rows.
eggs = pd.read_csv('data/snowy_plover.csv')
eggs.head()

# (rows, columns) of the table.
eggs.shape

(44, 4)

# Regress bird_weight on all three egg measurements.
y = eggs["bird_weight"]
X = eggs[["egg_weight", "egg_length", "egg_breadth"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# One row per fitted parameter, intercept first.
display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']))

# Take the square root of the mean squared error so the printed number really
# is the RMSE the label announces (previously the MSE was printed).
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))

RMSE 0.045470853802757734

def get_param1(model):
    """Return the first entry of the fitted model's ``coef_``."""
    coefficients = model.coef_
    return coefficients[0]

def bootstrap_params(sample_df, get_param_fn=get_param1, n_iters=10000):
    """
    Bootstrap a percentile interval for a quantity read off a refitted model.

    sample_df: the bootstrap population; must contain the columns
        "bird_weight", "egg_weight", "egg_length" and "egg_breadth"
    get_param_fn: called on each refitted model to extract the estimate
    n_iters: number of bootstrap resamples

    Returns (lower, upper): the 2.5th and 97.5th percentiles of the
    collected estimates, taken along axis 0.
    """
    feature_cols = ["egg_weight", "egg_length", "egg_breadth"]
    size = len(sample_df)
    boot_estimates = []
    for _ in range(n_iters):
        # a new sample of the same size, drawn with replacement
        boot_df = sample_df.sample(size, replace=True)

        # refit the regression on this bootstrap sample
        fitted = lm.LinearRegression()
        fitted.fit(boot_df[feature_cols], boot_df["bird_weight"])

        boot_estimates.append(get_param_fn(fitted))
    return (np.percentile(boot_estimates, 2.5, axis=0),
            np.percentile(boot_estimates, 97.5, axis=0))

# Bootstrap interval (2.5th to 97.5th percentile) for the coefficient of the
# first feature, using the default number of resamples.
approx_conf1 = bootstrap_params(eggs, get_param1)
approx_conf1

(-0.262146405136818, 1.114589904466114)

def get_all_params(model):
    """Return a list holding the model's intercept followed by each coefficient."""
    return [model.intercept_, *model.coef_]

# Bootstrap intervals for the intercept and every coefficient at once; each
# element of the returned pair has one entry per parameter.
approx_confs = bootstrap_params(eggs, get_param_fn=get_all_params)
approx_confs

(array([-15.33124598,  -0.27914779,  -0.1006884 ,  -0.26749613]),
 array([5.37409587, 1.12602753, 0.21484213, 0.76193654]))

def simple_resample(n):
    """Draw n integer positions from 0 to n - 1, with replacement."""
    row_positions = np.random.randint(0, n, size=n)
    return row_positions

def bootstrap(boot_pop, statistic, resample=simple_resample, replicates=10000):
    """
    Evaluate a statistic on repeated resamples of boot_pop.

    boot_pop: the bootstrap population, indexable by an array of positions
    statistic: function applied to each resampled population
    resample: given the population size, returns the positions to select
    replicates: number of resamples to draw

    Returns an array with one entry per replicate.
    """
    pop_size = len(boot_pop)
    draws = []
    for _ in range(replicates):
        picked = resample(pop_size)
        draws.append(statistic(boot_pop[picked]))
    return np.array(draws)

def egg_thetas(data):
    """
    Fit a linear regression on a 2-D array and return its coefficients.

    data: the first three columns are used as features, the fourth as the
        response
    """
    features, target = data[:, :3], data[:, 3]

    fitted = lm.LinearRegression()
    fitted.fit(features, target)
    return fitted.coef_

# Keep the replicates under their own name: assigning them to `egg_thetas`
# would overwrite the function of that name, so this cell could not be re-run.
egg_theta_boots = bootstrap(eggs.values, egg_thetas)

# 2.5th and 97.5th percentiles of each coefficient across the replicates.
egg_ci = np.percentile(egg_theta_boots, [2.5, 97.5], axis=0)
pd.DataFrame(egg_ci.T,
             columns=['lower', 'upper'],
             index=['theta_egg_weight', 'theta_egg_length', 'theta_egg_breadth'])

# Scatter-plot matrix of the columns of `eggs`.
px.scatter_matrix(eggs, width=450, height=450)

# Pairwise correlations between the columns, rounded to two decimals.
eggs.corr().round(2)

# Regress bird_weight on egg_weight alone.
y = eggs["bird_weight"]
X = eggs[["egg_weight"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# One row per fitted parameter, intercept first.
display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight']))
# Take the square root of the mean squared error so the printed number really
# is the RMSE the label announces (previously the MSE was printed).
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))

RMSE 0.046493941375556846

def bootstrap_egg_weight_only(sample_df, n_iters=10000):
    """
    Bootstrap a percentile interval for the egg_weight coefficient of a
    model that uses egg_weight as its only feature.

    sample_df: the bootstrap population; must contain the columns
        "bird_weight" and "egg_weight"
    n_iters: number of bootstrap resamples

    Returns (lower, upper): the 2.5th and 97.5th percentiles of the
    collected coefficients.
    """
    size = len(sample_df)
    slopes = []
    for _ in range(n_iters):
        boot_df = sample_df.sample(size, replace=True)

        # just one feature + intercept
        fitted = lm.LinearRegression()
        fitted.fit(boot_df[["egg_weight"]], boot_df["bird_weight"])
        slopes.append(fitted.coef_[0])
    return (np.percentile(slopes, 2.5, axis=0),
            np.percentile(slopes, 97.5, axis=0))

# Bootstrap interval for the egg_weight coefficient when egg_weight is the
# only feature in the model.
approx_conf_egg_weight_only = bootstrap_egg_weight_only(eggs)
approx_conf_egg_weight_only

(0.6047539795814336, 0.8198989904872578)

	egg_weight	egg_length	egg_breadth	bird_weight
0	7.4	28.80	21.84	5.2
1	7.7	29.04	22.45	5.4
2	7.9	29.36	22.48	5.6
3	7.5	30.10	21.71	5.3
4	8.3	30.17	22.75	5.9

	lower	upper
theta_egg_weight	-0.269277	1.112453
theta_egg_length	-0.102179	0.213119
theta_egg_breadth	-0.262963	0.760018

	egg_weight	egg_length	egg_breadth	bird_weight
egg_weight	1.00	0.79	0.84	0.85
egg_length	0.79	1.00	0.40	0.68
egg_breadth	0.84	0.40	1.00	0.73
bird_weight	0.85	0.68	0.73	1.00

Lecture 19 – Data 100, Fall 2023

PurpleAir

Is there a better model? Relative Humidity

Bootstrapping the regression coefficients

The Snowy Plover

The Data

Testing all the coefficients

Inspecting the Relationship between Features

Changing Our Modeling Features

	theta_hat
intercept	-4.605670
egg_weight	0.431229
egg_length	0.066570
egg_breadth	0.215914