import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as lm

# big font helper
def adjust_fontsize(size=None):
    """Set matplotlib's default font sizes through ``plt.rc``.

    Parameters
    ----------
    size : number, optional
        When given, this single size is applied to every text element
        configured below (base font, axes titles, axis labels, tick labels,
        legend and figure title). When omitted (``None``), a small / medium /
        bigger scheme of 8 / 10 / 12 is used instead.

    Returns
    -------
    None
        The function only mutates matplotlib's global rc settings.
    """
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    # Identity test: `None` is a singleton, and `!=` could be hijacked by
    # objects that overload comparison (e.g. numpy scalars/arrays).
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

# Global plot styling for the notebook. Every call below mutates matplotlib's
# rc settings, so the order of these statements matters.
plt.style.use('fivethirtyeight')
sns.set_context("talk")
# NOTE(review): sns.set_theme() is called with no arguments after the two
# lines above; per the seaborn docs its defaults include context="notebook"
# and style="darkgrid", so it presumably overrides them -- confirm intent.
sns.set_theme()
#plt.style.use('default') # revert style to default mpl
# Apply one font size (20) to every text element handled by adjust_fontsize.
adjust_fontsize(size=20)
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline


# Read the snowy plover data from CSV and preview the first rows.
eggs = pd.read_csv('data/snowy_plover.csv')
eggs.head()


# Table dimensions as (number of rows, number of columns).
eggs.shape

(44, 4)


y = eggs["bird_weight"]
X = eggs[["egg_weight", "egg_length", "egg_breadth"]]
    
model = lm.LinearRegression(fit_intercept=True).fit(X, y)

display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']))

print("RMSE", np.mean((y - model.predict(X)) ** 2))

RMSE 0.045470853802757734


def get_param1(model):
    """Return the fitted coefficient of the model's first feature."""
    coefficients = model.coef_
    return coefficients[0]

def bootstrap_params(sample_df, get_param_fn=get_param1, n_iters=10000, *,
                     features=("egg_weight", "egg_length", "egg_breadth"),
                     target="bird_weight", ci_level=95):
    """
    Bootstrap linear-regression parameter estimates and a percentile interval.

    Each iteration draws a resample of ``len(sample_df)`` rows with
    replacement, fits a fresh ``lm.LinearRegression`` of ``target`` on
    ``features``, and records ``get_param_fn(model)``.

    sample_df: the DataFrame used as the bootstrap population
    get_param_fn: callable taking the fitted model and returning the
        estimate(s) to record (a scalar or a sequence of scalars)
    n_iters: number of bootstrap resamples
    features: column names used as predictors (keyword-only)
    target: column name of the response (keyword-only)
    ci_level: central coverage of the interval in percent (keyword-only);
        the default 95 gives the 2.5th and 97.5th percentiles. Values that
        push a percentile outside [0, 100] are rejected by np.percentile.

    Returns ``((lower, upper), estimates)`` where ``estimates`` is the list of
    per-resample values and ``lower`` / ``upper`` are their percentiles taken
    along axis 0 (i.e. per parameter when several are returned).
    """
    n = len(sample_df)
    feature_cols = list(features)  # list => 2-D DataFrame selection
    tail = (100 - ci_level) / 2    # percent of mass cut from each side

    estimates = []
    for _ in range(n_iters):
        # resample n times with replacement,
        # i.e., get a new sample of the same size
        resample = sample_df.sample(n, replace=True)

        # train a new model on this bootstrap resample
        model = lm.LinearRegression()
        model.fit(resample[feature_cols], resample[target])

        # record the estimate(s) of interest
        estimates.append(get_param_fn(model))

    lower = np.percentile(estimates, tail, axis=0)
    upper = np.percentile(estimates, 100 - tail, axis=0)
    conf_interval = (lower, upper)
    return (conf_interval, estimates)


# Bootstrap the first coefficient (egg_weight). The call returns the
# confidence interval and the list of per-resample estimates, in that order.
approx_theta1, theta_1_hat = bootstrap_params(eggs, get_param1)
approx_theta1

(-0.25489148333439304, 1.1346303927077175)


# NOTE: these two imports repeat the ones at the top of the file.
import matplotlib.pyplot as plt
import seaborn as sns

# Density-normalised histogram of the bootstrapped estimates in theta_1_hat.
plt.figure(dpi=120)
sns.histplot(theta_1_hat, stat="density")
plt.xlabel(r"$\hat{\theta}_1$")
# The trailing semicolon keeps the notebook from echoing the return value.
plt.title(r"Bootstrapped estimates $\hat{\theta}_1$");


def get_all_params(model):
    """Return the intercept followed by every feature coefficient, as a list."""
    return [model.intercept_, *model.coef_]

# Bootstrap the intercept and all coefficients at once; approx_thetas is the
# (lower, upper) pair and theta_hats the list of per-resample estimates.
approx_thetas, theta_hats = bootstrap_params(eggs, get_param_fn=get_all_params)
approx_thetas

(array([-15.45006327,  -0.25594971,  -0.09781221,  -0.25844941]),
 array([5.20832913, 1.10380458, 0.2103728 , 0.75463279]))


# Stack the lower and upper bounds and transpose so each row is one parameter
# with its 'lower' and 'upper' bound as columns.
pd.DataFrame(np.array([approx_thetas[0], approx_thetas[1]]).T,
             columns=['lower', 'upper'],
             index=['intercept','theta_egg_weight', 'theta_egg_length', 'theta_egg_breadth'])


# Pairwise plots of the three egg features (semicolon suppresses the echo).
sns.pairplot(eggs[["egg_length", "egg_breadth", "egg_weight"]]);


# Pairwise correlations between the columns of eggs, rounded to 2 decimals.
eggs.corr().round(2)


y = eggs["bird_weight"]
X = eggs[["egg_weight"]]
    
model = lm.LinearRegression(fit_intercept=True).fit(X, y)

display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight']))
print("RMSE", np.mean((y - model.predict(X)) ** 2))

RMSE 0.046493941375556846


def bootstrap_egg_weight_only(sample_df, n_iters=10000):
    """
    Bootstrap interval for the egg_weight slope of a one-feature model.

    Draws n_iters resamples of len(sample_df) rows with replacement, fits
    bird_weight on egg_weight alone (plus intercept) for each, and returns
    the (2.5th, 97.5th) percentiles of the fitted slopes as a tuple.
    """
    size = len(sample_df)

    def _slope_of(boot_df):
        # fit the single-feature regression and pull out its only coefficient
        fitted = lm.LinearRegression()
        fitted.fit(boot_df[["egg_weight"]], boot_df["bird_weight"])
        return fitted.coef_[0]

    slopes = [
        _slope_of(sample_df.sample(size, replace=True))
        for _ in range(n_iters)
    ]

    return (
        np.percentile(slopes, 2.5, axis=0),
        np.percentile(slopes, 97.5, axis=0),
    )

# Bootstrap interval for the egg_weight coefficient when it is the only
# feature in the model.
approx_conf_egg_weight_only = bootstrap_egg_weight_only(eggs)
approx_conf_egg_weight_only

(0.5997657176584066, 0.8201169601381006)

	egg_weight	egg_length	egg_breadth	bird_weight
0	7.4	28.80	21.84	5.2
1	7.7	29.04	22.45	5.4
2	7.9	29.36	22.48	5.6
3	7.5	30.10	21.71	5.3
4	8.3	30.17	22.75	5.9

	egg_weight	egg_length	egg_breadth	bird_weight
egg_weight	1.00	0.79	0.84	0.85
egg_length	0.79	1.00	0.40	0.68
egg_breadth	0.84	0.40	1.00	0.73
bird_weight	0.85	0.68	0.73	1.00

Inference for Regression Coefficients, Collinearity¶

The Snowy Plover¶

The Data¶

Testing all the coefficients¶

Inspecting the Relationship between Features¶

Changing Our Modeling Features¶

	theta_hat
intercept	-4.605670
egg_weight	0.431229
egg_length	0.066570
egg_breadth	0.215914

	lower	upper
intercept	-15.450063	5.208329
theta_egg_weight	-0.255950	1.103805
theta_egg_length	-0.097812	0.210373
theta_egg_breadth	-0.258449	0.754633