import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.linear_model as lm

# big font helper
def adjust_fontsize(size=None):
    """Set matplotlib's default font sizes through ``plt.rc``.

    Args:
        size: When given, every font-size setting below uses this single
            value. When ``None``, the presets are used: 8 for general text,
            axes titles, tick labels and the legend; 10 for axis labels;
            12 for the figure title.
    """
    small_size = 8
    medium_size = 10
    bigger_size = 12
    # Identity test: only a missing argument keeps the presets.
    if size is not None:
        small_size = medium_size = bigger_size = size

    plt.rc('font', size=small_size)          # controls default text sizes
    plt.rc('axes', titlesize=small_size)     # fontsize of the axes title
    plt.rc('axes', labelsize=medium_size)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=small_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=small_size)    # fontsize of the tick labels
    plt.rc('legend', fontsize=small_size)    # legend fontsize
    plt.rc('figure', titlesize=bigger_size)  # fontsize of the figure title

# Global plotting setup: matplotlib style sheet, seaborn context/theme, then
# a uniform font size of 20 via adjust_fontsize.
plt.style.use('fivethirtyeight')
sns.set_context("talk")
# NOTE(review): set_theme() is called with its defaults after the two calls
# above and presumably resets the style/context they selected — confirm the
# intended order.
sns.set_theme()
#plt.style.use('default') # revert style to default mpl
adjust_fontsize(size=20)
# IPython magic: this line is only valid inside an IPython/Jupyter session.
%matplotlib inline


# Load the dataset from snowy_plover.csv and preview its first rows.
eggs = pd.read_csv('snowy_plover.csv')
eggs.head()


# (number of rows, number of columns) of the loaded table.
eggs.shape

(44, 4)


# Regress bird_weight on all three egg measurements with an intercept.
y = eggs["bird_weight"]
X = eggs[["egg_weight", "egg_length", "egg_breadth"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# One row per fitted parameter: the intercept first, then each feature's slope.
display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight', 'egg_length', 'egg_breadth']))

# RMSE is the square root of the mean squared residual; without the sqrt the
# printed value would be the MSE and would not match its label.
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))

RMSE 0.045470853802757734


def get_param1(model):
    """Return the coefficient of the first feature of a fitted model."""
    coefficients = model.coef_
    return coefficients[0]

def bootstrap_params(sample_df, get_param_fn=get_param1, n_iters=10000):
    """Bootstrap a percentile interval for parameter(s) of the egg regression.

    Each iteration resamples ``sample_df`` with replacement to its original
    size, fits a linear regression of ``bird_weight`` on ``egg_weight``,
    ``egg_length`` and ``egg_breadth``, and records ``get_param_fn(model)``.

    Args:
        sample_df: DataFrame that serves as the bootstrap population.
        get_param_fn: Callable mapping a fitted model to the estimate(s)
            to track.
        n_iters: Number of bootstrap resamples.

    Returns:
        Tuple ``(lower, upper)`` holding the 2.5th and 97.5th percentiles of
        the recorded estimates, taken along the resample axis.
    """
    feature_cols = ["egg_weight", "egg_length", "egg_breadth"]
    size = len(sample_df)

    def one_replicate():
        # Draw a same-size sample with replacement and refit on it.
        boot = sample_df.sample(size, replace=True)
        boot_y = boot["bird_weight"]
        boot_X = boot[feature_cols]
        fitted = lm.LinearRegression()
        fitted.fit(boot_X, boot_y)
        return get_param_fn(fitted)

    replicates = [one_replicate() for _ in range(n_iters)]
    return (
        np.percentile(replicates, 2.5, axis=0),
        np.percentile(replicates, 97.5, axis=0),
    )


# Bootstrap percentile interval for the first coefficient (egg_weight).
approx_conf1 = bootstrap_params(eggs, get_param1)
approx_conf1


def get_all_params(model):
    """Return the fitted intercept followed by every feature coefficient."""
    return [model.intercept_, *model.coef_]

# Bootstrap percentile intervals for the intercept and every coefficient at once.
approx_confs = bootstrap_params(eggs, get_param_fn=get_all_params)
approx_confs


def simple_resample(n):
    """Draw ``n`` integer indices from ``[0, n)`` using ``np.random.randint``."""
    indices = np.random.randint(0, n, n)
    return indices

def bootstrap(boot_pop, statistic, resample=simple_resample, replicates=10000):
    """Evaluate ``statistic`` on repeated resamples of ``boot_pop``.

    Args:
        boot_pop: Indexable population; it is indexed with whatever
            ``resample`` returns.
        statistic: Callable applied to each resampled population.
        resample: Callable that takes the population length and returns the
            index used to select one resample.
        replicates: Number of resamples to draw.

    Returns:
        ``np.ndarray`` holding one statistic value per replicate.
    """
    size = len(boot_pop)
    collected = []
    for _ in range(replicates):
        picks = resample(size)
        collected.append(statistic(boot_pop[picks]))
    return np.array(collected)


def fit_egg_thetas(data):
    """Fit a linear regression on one resample and return its coefficients.

    Args:
        data: 2-D array; the first three columns are used as features and the
            fourth column as the response. (Assumes the column order
            egg_weight, egg_length, egg_breadth, bird_weight — confirm
            against the CSV.)

    Returns:
        The fitted model's ``coef_`` array (the intercept is not included).
    """
    X = data[:, :3]
    y = data[:, 3]

    model = lm.LinearRegression().fit(X, y)
    return model.coef_

# The statistic has its own name so that this assignment does not overwrite
# it with the result array; re-running this line therefore keeps working.
egg_thetas = bootstrap(eggs.values, fit_egg_thetas)


# 2.5th and 97.5th percentiles of each coefficient across the bootstrap
# replicates (axis 0 runs over replicates).
egg_ci = np.percentile(egg_thetas, [2.5, 97.5], axis=0)
# Transpose so each row is one coefficient and the columns are the two bounds.
pd.DataFrame(egg_ci.T,
             columns=['lower', 'upper'],
             index=['theta_egg_weight', 'theta_egg_length', 'theta_egg_breadth'])


# Pairwise scatter plots of every column in eggs.
# NOTE(review): `px` is not imported anywhere in the visible file — presumably
# `import plotly.express as px`; confirm and add the import, otherwise this
# line raises NameError.
px.scatter_matrix(eggs, width=450, height=450)


# Pairwise correlations between the columns, rounded to two decimals.
eggs.corr().round(2)


# Refit using egg_weight as the only feature (plus an intercept).
y = eggs["bird_weight"]
X = eggs[["egg_weight"]]

model = lm.LinearRegression(fit_intercept=True).fit(X, y)

# One row per fitted parameter: the intercept, then the egg_weight slope.
display(pd.DataFrame([model.intercept_] + list(model.coef_),
             columns=['theta_hat'],
             index=['intercept', 'egg_weight']))
# RMSE is the square root of the mean squared residual; without the sqrt the
# printed value would be the MSE and would not match its label.
print("RMSE", np.sqrt(np.mean((y - model.predict(X)) ** 2)))


def bootstrap_egg_weight_only(sample_df, n_iters=10000):
    """Bootstrap a percentile interval for the egg_weight slope.

    Each iteration resamples ``sample_df`` with replacement to its original
    size and fits a linear regression of ``bird_weight`` on ``egg_weight``
    alone (with an intercept), recording the fitted slope.

    Args:
        sample_df: DataFrame that serves as the bootstrap population.
        n_iters: Number of bootstrap resamples.

    Returns:
        Tuple ``(lower, upper)`` holding the 2.5th and 97.5th percentiles of
        the recorded slopes.
    """
    size = len(sample_df)
    slopes = []
    for _ in range(n_iters):
        boot = sample_df.sample(size, replace=True)
        target = boot["bird_weight"]
        features = boot[["egg_weight"]]  # just one feature + intercept
        fitted = lm.LinearRegression()
        fitted.fit(features, target)
        slopes.append(fitted.coef_[0])
    return (
        np.percentile(slopes, 2.5, axis=0),
        np.percentile(slopes, 97.5, axis=0),
    )

# Bootstrap percentile interval for the egg_weight slope in the one-feature model.
approx_conf_egg_weight_only = bootstrap_egg_weight_only(eggs)
approx_conf_egg_weight_only

	egg_weight	egg_length	egg_breadth	bird_weight
0	7.4	28.80	21.84	5.2
1	7.7	29.04	22.45	5.4
2	7.9	29.36	22.48	5.6
3	7.5	30.10	21.71	5.3
4	8.3	30.17	22.75	5.9

Lecture 17 – Probability II: Estimators, Bias, and Variance

The Snowy Plover

The Data

Testing all the coefficients

Inspecting the Relationship between Features

Changing Our Modeling Features

	theta_hat
intercept	-4.605670
egg_weight	0.431229
egg_length	0.066570
egg_breadth	0.215914