Lecture 17 – Data 100, Fall 2023¶


In [1]:
import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import plotly.graph_objects as go
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
adjust_fontsize(size=20)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
In [2]:
np.random.seed(5)

# Load the fuel dataset, and drop any rows that have missing data
vehicle_data = sns.load_dataset('mpg').dropna()
vehicle_data = vehicle_data.rename(columns={"horsepower": "hp"})
In [3]:
# Building a complex dataset of first order and second order features

poly_transform = PolynomialFeatures(degree=2, include_bias=False)
vehicle_data_with_squared_features = \
             pd.DataFrame(poly_transform.fit_transform(vehicle_data[["hp", "weight", "displacement"]]),
             columns = poly_transform.get_feature_names_out())
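
As a quick sanity check on what PolynomialFeatures(degree=2, include_bias=False) produces, here is a minimal sketch on a tiny two-column frame (the frame and the names toy, toy_poly, a, b are made up purely for illustration):

# Degree-2 features of columns [a, b] come out as [a, b, a^2, a*b, b^2]:
# all first-order terms, then every degree-2 product.
toy = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
toy_poly = PolynomialFeatures(degree=2, include_bias=False)
print(toy_poly.fit_transform(toy))       # [[1. 3. 1. 3. 9.], [2. 4. 4. 8. 16.]]
print(toy_poly.get_feature_names_out())  # ['a' 'b' 'a^2' 'a b' 'b^2']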
In [4]:
vehicle_data_with_squared_features
Out[4]:
        hp  weight  displacement     hp^2  hp weight  hp displacement    weight^2  weight displacement  displacement^2
0    130.0  3504.0         307.0  16900.0   455520.0          39910.0  12278016.0            1075728.0         94249.0
1    165.0  3693.0         350.0  27225.0   609345.0          57750.0  13638249.0            1292550.0        122500.0
2    150.0  3436.0         318.0  22500.0   515400.0          47700.0  11806096.0            1092648.0        101124.0
3    150.0  3433.0         304.0  22500.0   514950.0          45600.0  11785489.0            1043632.0         92416.0
4    140.0  3449.0         302.0  19600.0   482860.0          42280.0  11895601.0            1041598.0         91204.0
..     ...     ...           ...      ...        ...              ...         ...                  ...             ...
387   86.0  2790.0         140.0   7396.0   239940.0          12040.0   7784100.0             390600.0         19600.0
388   52.0  2130.0          97.0   2704.0   110760.0           5044.0   4536900.0             206610.0          9409.0
389   84.0  2295.0         135.0   7056.0   192780.0          11340.0   5267025.0             309825.0         18225.0
390   79.0  2625.0         120.0   6241.0   207375.0           9480.0   6890625.0             315000.0         14400.0
391   82.0  2720.0         119.0   6724.0   223040.0           9758.0   7398400.0             323680.0         14161.0

392 rows × 9 columns

In [5]:
# Fitting a regression model with L2 regularization
from sklearn.linear_model import Ridge
ridge_model = Ridge(alpha=10**-5)
ridge_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Out[5]:
Ridge(alpha=1e-05)
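
For reference, scikit-learn's Ridge fits least squares with an L2 penalty on the coefficients, minimizing $||y - X\theta||_2^2 + \alpha ||\theta||_2^2$. With a tiny penalty like $\alpha = 10^{-5}$, the solution is essentially the ordinary least squares fit, as the coefficients below confirm (compare them with the unregularized LinearRegression fit further down).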
In [6]:
ridge_model.coef_
Out[6]:
array([-1.35872588e-01, -1.46864458e-04, -1.18230336e-01, -4.03590098e-04,
       -1.12862371e-05,  8.25179864e-04, -1.17645881e-06,  2.69757832e-05,
       -1.72888463e-04])
In [7]:
ridge_model = Ridge(alpha=10000)
ridge_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Out[7]:
Ridge(alpha=10000)
In [8]:
ridge_model.coef_
Out[8]:
array([-5.56414449e-02, -7.93804083e-03, -8.22081425e-02, -6.18785466e-04,
       -2.55492157e-05,  9.47353944e-04,  7.58061062e-07,  1.07439477e-05,
       -1.64344898e-04])
In [9]:
# Fitting a regression model without regularization
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Out[9]:
LinearRegression()
In [10]:
linear_model.coef_
Out[10]:
array([-1.35872588e-01, -1.46864447e-04, -1.18230336e-01, -4.03590097e-04,
       -1.12862370e-05,  8.25179863e-04, -1.17645882e-06,  2.69757832e-05,
       -1.72888463e-04])
In [11]:
# Fitting a regression model with L1 regularization
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha = 10)
lasso_model.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
Out[11]:
Lasso(alpha=10)
In [12]:
lasso_model.coef_
Out[12]:
array([-0.00000000e+00, -1.88104942e-02, -0.00000000e+00, -1.19625308e-03,
        8.84657720e-06,  8.77253835e-04,  3.16759194e-06, -3.21738391e-05,
       -1.29386937e-05])
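
For reference, scikit-learn's Lasso minimizes $\frac{1}{2n} ||y - X\theta||_2^2 + \alpha ||\theta||_1$. Unlike the L2 penalty, the L1 penalty can drive coefficients exactly to zero; here the first and third coefficients (hp and displacement) have been zeroed out, so lasso is effectively performing feature selection.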

Scaling the Data¶

In [13]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
rescaled_df = pd.DataFrame(ss.fit_transform(vehicle_data_with_squared_features),
                           columns = ss.get_feature_names_out())
In [14]:
rescaled_df
Out[14]:
           hp    weight  displacement      hp^2  hp weight  hp displacement  weight^2  weight displacement  displacement^2
0    0.664133  0.620540      1.077290  0.459979   0.528582         0.738743  0.492385             0.791843        0.926137
1    1.574594  0.843334      1.488732  1.513418   1.227955         1.562694  0.741147             1.206419        1.500790
2    1.184397  0.540382      1.182542  1.031336   0.800830         1.098529  0.406079             0.824195        1.065981
3    1.184397  0.536845      1.048584  1.031336   0.798784         1.001539  0.402311             0.730474        0.888852
4    0.924265  0.555706      1.029447  0.735454   0.652885         0.848203  0.422448             0.726585        0.864198
..        ...       ...           ...       ...        ...              ...       ...                  ...             ...
387 -0.480448 -0.221125     -0.520637 -0.509696  -0.451563        -0.548450 -0.329472            -0.518158       -0.592298
388 -1.364896 -0.999134     -0.932079 -0.988411  -1.038886        -0.871565 -0.923326            -0.869957       -0.799593
389 -0.532474 -0.804632     -0.568479 -0.544385  -0.665978        -0.580780 -0.789800            -0.672604       -0.620267
390 -0.662540 -0.415627     -0.712005 -0.627538  -0.599621        -0.666685 -0.492872            -0.662710       -0.698072
391 -0.584501 -0.303641     -0.721574 -0.578258  -0.528400        -0.653846 -0.400009            -0.646113       -0.702933

392 rows × 9 columns

In [15]:
ridge_model = Ridge(alpha=10000)
ridge_model.fit(rescaled_df, vehicle_data["mpg"])
ridge_model.coef_
Out[15]:
array([-0.1792743 , -0.19610513, -0.18648617, -0.1601219 , -0.18015125,
       -0.16858023, -0.18779478, -0.18176294, -0.17021841])
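
After standardizing, every feature has mean 0 and variance 1, so the L2 penalty treats the features comparably and the fitted coefficients are directly comparable in magnitude. As a sketch, the same two-step fit can be packaged into a single estimator using the Pipeline and StandardScaler utilities already imported above (the name scaled_ridge is illustrative):

# Scale, then fit ridge -- one estimator, same result as the two-step fit above
scaled_ridge = Pipeline([
    ("scale", StandardScaler()),
    ("ridge", Ridge(alpha=10000)),
])
scaled_ridge.fit(vehicle_data_with_squared_features, vehicle_data["mpg"])
scaled_ridge.named_steps["ridge"].coef_  # matches ridge_model.coef_ above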

Plotting helper functions.

In [16]:
# Helper functions to plot and compute
# expectation, variance, and standard deviation
def plot_dist(dist_df,
              xname="x", pname="P(X = x)", varname="X",
              save=False):
    """
    Plot a distribution from a distribution table.
    Single-variate.
    """
    plt.bar(dist_df[xname], dist_df[pname])
    plt.ylabel(pname)
    plt.xlabel(xname)
    plt.title(f"Distribution of ${varname}$")
    plt.xticks(sorted(dist_df[xname].unique()))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"dist{varname}.png", bbox_inches = 'tight');


def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
    return np.random.choice(
                df[xname], # Draw from these choices
                size=size, # This many times
                p=df[pname]) # According to this distribution

def simulate_iid_df(dist_df, nvars, rows, varname="X"):
    """
    Make an (row x nvars) dataframe
    by calling simulate_samples for each of the nvars per row
    """
    sample_dict = {}
    for i in range(nvars):
        # Generate many datapoints 
        sample_dict[f"{varname}_{i+1}"] = \
            simulate_samples(dist_df, size=rows)
    return pd.DataFrame(sample_dict)


def plot_simulated_dist(df, colname, show_stats=True, save=False, **kwargs):
    """
    Plot a simulated population.
    """
    sns.histplot(data=df, x=colname, stat='probability', discrete=True, **kwargs)
    plt.xticks(sorted(df[colname].unique())) # show every value, in case there are gaps
    if show_stats:
        display(stats_df_multi(df, [colname]))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"sim{colname}.png", bbox_inches = 'tight');

def stats_df_multi(df, colnames):
    means = df[colnames].mean(axis=0)
    variances = df[colnames].var(axis=0)
    stdevs = df[colnames].std(axis=0)
    df_stats = pd.concat([means, variances, stdevs],axis=1).T
    df_stats['index_col'] = ["E[•]", "Var(•)", "SD(•)"]
    df_stats = df_stats.set_index('index_col', drop=True).rename_axis(None)
    return df_stats

def plot_simulated_dist_multi(df, colnames, show_stats=True):
    """
    If multiple columns provided, use separate plots.
    """
    ncols = 1
    nrows = len(colnames)
    plt.figure(figsize=(6, 2*nrows+2))
    
    for i, colname in enumerate(colnames):
        subplot_int = int(100*int(nrows) + 10*int(ncols) + int(i+1))
        plt.subplot(subplot_int)
        plot_simulated_dist(df, colname, show_stats=False)
    plt.tight_layout()
    if show_stats:
        display(stats_df_multi(df, colnames))

A Random Variable $X$¶

Our probability distribution of $X$, shown as a table:

In [17]:
# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
                        "P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df
Out[17]:
x P(X = x)
0 3 0.1
1 4 0.2
2 6 0.4
3 8 0.3
In [18]:
plot_dist(dist_df, save=True)
[Bar plot: distribution of $X$.]

Let's use this probability distribution to generate a table of $X(s)$, i.e., random variable values drawn for a large number of samples.

In [19]:
# copied from above
# def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
#     return np.random.choice(
#                 df[xname], # draw from these choices
#                 size=size, # this many times
#                 p=df[pname]) # according to this distribution

N = 80000
all_samples = simulate_samples(dist_df, size=N)
sim_df = pd.DataFrame({"X(s)": all_samples})
sim_df
Out[19]:
X(s)
0 3
1 8
2 6
3 8
4 3
... ...
79995 6
79996 6
79997 6
79998 3
79999 8

80000 rows × 1 columns



Let's check how well this simulated sample matches our probability distribution!

In [20]:
plot_simulated_dist(sim_df, "X(s)")
plt.title("Simulated distribution of $X$")
plt.show()
X(s)
E[•] 5.895200
Var(•) 2.892903
SD(•) 1.700854
[Histogram: simulated distribution of $X$.]
In [21]:
# The tabular view of the above plot
sim_df.value_counts("X(s)").sort_values()/N
Out[21]:
X(s)
3    0.100250
4    0.201212
8    0.299187
6    0.399350
dtype: float64
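
The match is close. We can also compute the exact expectation, variance, and standard deviation directly from the distribution table and compare them with the simulated estimates above (the names ex, var, sd are illustrative):

# Exact values from the probability table (compare: 5.8952, 2.8929, 1.7009 simulated)
ex = np.sum(dist_df["x"] * dist_df["P(X = x)"])             # E[X] = 5.9
var = np.sum((dist_df["x"] - ex)**2 * dist_df["P(X = x)"])  # Var(X) = 2.89
sd = np.sqrt(var)                                           # SD(X) = 1.7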




Die Is the Singular; Dice Is the Plural¶

Let $X$ be the outcome of a single die roll. $X$ is a random variable.

In [22]:
# Our random variable X
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                        "P(X = x)": np.ones(6)/6})
roll_df
Out[22]:
x P(X = x)
0 1 0.166667
1 2 0.166667
2 3 0.166667
3 4 0.166667
4 5 0.166667
5 6 0.166667
In [23]:
plot_dist(roll_df)
[Bar plot: distribution of a single die roll.]
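
The exact values for a single die roll, computed the same way as before: $\mathbb{E}[X] = 3.5$ and $\text{Var}(X) = \frac{35}{12} \approx 2.92$.

ex_roll = np.sum(roll_df["x"] * roll_df["P(X = x)"])                  # E[X] = 3.5
var_roll = np.sum((roll_df["x"] - ex_roll)**2 * roll_df["P(X = x)"])  # Var(X) = 35/12 ≈ 2.9167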




Sum of 2 Dice Rolls¶

Here's the distribution of a single die roll:

In [24]:
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                        "P(X = x)": np.ones(6)/6})
roll_df
Out[24]:
x P(X = x)
0 1 0.166667
1 2 0.166667
2 3 0.166667
3 4 0.166667
4 5 0.166667
5 6 0.166667

Let $X_1$ and $X_2$ be the outcomes of two dice rolls. Note that $X_1$ and $X_2$ are i.i.d. (independent and identically distributed).

Below I call a helper function, simulate_iid_df, which simulates an 80,000-row table of $X_1, X_2$ values. It uses np.random.choice(arr, size, p), where arr is the array of values to draw from and p gives the probability of choosing each value; a standalone sketch of that call follows. If you're interested in the implementation details, scroll up.
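
A minimal sketch of that call, reusing roll_df:

# Three iid die rolls: draw from roll_df["x"] with probabilities roll_df["P(X = x)"]
np.random.choice(roll_df["x"], size=3, p=roll_df["P(X = x)"])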

In [25]:
N = 80000
sim_rolls_df = simulate_iid_df(roll_df, nvars=2, rows=N)
sim_rolls_df
Out[25]:
X_1 X_2
0 3 6
1 3 3
2 2 5
3 5 2
4 5 5
... ... ...
79995 5 6
79996 6 2
79997 2 1
79998 5 5
79999 5 6

80000 rows × 2 columns

Define the following random variables, which are functions of $X_1$ and $X_2$:

  • $Y = X_1 + X_1 = 2 X_1$
  • $Z = X_1 + X_2$

We can use our simulated values of $X_1, X_2$ to create new columns $Y$ and $Z$:

In [26]:
sim_rolls_df['Y'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['Z'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df
Out[26]:
X_1 X_2 Y Z
0 3 6 6 9
1 3 3 6 6
2 2 5 4 7
3 5 2 10 7
4 5 5 10 10
... ... ... ... ...
79995 5 6 10 11
79996 6 2 12 8
79997 2 1 4 3
79998 5 5 10 10
79999 5 6 10 11

80000 rows × 4 columns

Now that we have simulated samples of $Y$ and $Z$, we can plot histograms to see their distributions!


Distribution of $Y$, which was twice the value of our first die roll:

In [27]:
plot_simulated_dist(sim_rolls_df, "Y", save=True)
Y
E[•] 7.019100
Var(•) 11.644181
SD(•) 3.412357
[Histogram: simulated distribution of $Y$.]

Distribution of $Z$, the sum of two IID dice rolls:

In [28]:
plot_simulated_dist(sim_rolls_df, "Z", save=True, color='gold')
Z
E[•] 7.004613
Var(•) 5.833214
SD(•) 2.415205
[Histogram: simulated distribution of $Z$.]

Let's compare the expectations and variances of these simulated distributions of $Y$ and $Z$.

  • We computed:
    • $\mathbb{E}[Y]$ as np.mean(sim_rolls_df['Y'])
    • $\text{Var}(Y)$ as np.var(sim_rolls_df['Y'])
    • etc.
  • The larger the number of simulated rows $N$, the closer the simulated expectation will be to the true expectation.
  • Our approach is tedious: we have to simulate an entire population, then reduce it down to expectation/variance/standard deviation. There has to be a better way! (See the check after the next cell.)
In [29]:
stats_df_multi(sim_rolls_df, ["Y", "Z"])
Out[29]:
Y Z
E[•] 7.019100 7.004613
Var(•) 11.644181 5.833214
SD(•) 3.412357 2.415205
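
In fact, the simulated values match what the properties of expectation and variance predict, with no simulation required. Using $\mathbb{E}[X] = 3.5$ and $\text{Var}(X) = 35/12$ for one die roll: $\mathbb{E}[Y] = \mathbb{E}[Z] = 2\,\mathbb{E}[X] = 7$, but $\text{Var}(Y) = \text{Var}(2 X_1) = 4\,\text{Var}(X) = 35/3 \approx 11.67$, while $\text{Var}(Z) = \text{Var}(X_1) + \text{Var}(X_2) = 35/6 \approx 5.83$. The last step uses the independence of $X_1$ and $X_2$.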




Which would you pick?¶

  • $\large Y_A = 10 X_1 + 10 X_2 $
  • $\large Y_B = \sum\limits_{i=1}^{20} X_i$
  • $\large Y_C = 20 X_1$

First let's construct the probability distribution for a single coin. This will let us flip 20 IID coins later.

In [30]:
# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [0, 1], # [Tails, Heads]
                        "P(X = x)": [1 - p, p]}) # P(heads) = p
coin_df
Out[30]:
x P(X = x)
0 0 0.5
1 1 0.5

Choice A:¶

$\large Y_A = 10 X_1 + 10 X_2 $

In [31]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_A from this sample
flips20_df["Y_A"] = 10*flips20_df["X_1"] + 10*flips20_df["X_2"]

print("Sample of size 1:")
display(flips20_df)
print("Y_A:", flips20_df.loc[0,"Y_A"])
Sample of size 1:
X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 ... X_12 X_13 X_14 X_15 X_16 X_17 X_18 X_19 X_20 Y_A
0 1 0 1 0 0 1 0 0 1 0 ... 0 1 1 1 1 1 0 1 0 10

1 rows × 21 columns

Y_A: 10

Choice B:¶

$\large Y_B = \sum\limits_{i=1}^{20} X_i$

In [32]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_B from this sample
flips20_df["Y_B"] = flips20_df.sum(axis=1) # sum all coins

display(flips20_df)
print("Y_B:", flips20_df.loc[0,"Y_B"])
X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 ... X_12 X_13 X_14 X_15 X_16 X_17 X_18 X_19 X_20 Y_B
0 0 0 0 1 1 0 1 0 1 1 ... 0 1 0 0 0 1 0 1 0 9

1 rows × 21 columns

Y_B: 9

Choice C:¶

$\large Y_C = 20 X_1$

In [33]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_C from this sample
flips20_df["Y_C"] = 20*flips20_df["X_1"]

display(flips20_df[["X_1", "Y_C"]])
print("Y_C:", flips20_df.loc[0,"Y_C"])
X_1 Y_C
0 0 0
Y_C: 0


If you're curious as to what these distributions look like, I've simulated some populations:

In [34]:
# Simulate one big population of 20 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=20, rows=N)
print(f"{N} simulated samples")


# Construct Y_A, Y_B, Y_C from this population
df_coins["Y_A"] = 10*df_coins["X_1"] + 10*df_coins["X_2"]
df_coins["Y_B"] = df_coins.loc[:,"X_1":"X_20"].sum(axis=1)
df_coins["Y_C"] = 5 + 15*df_coins["X_1"] # note: takes values 5 or 20, not the 0 or 20 of $Y_C = 20 X_1$
plot_simulated_dist_multi(df_coins, ["Y_A", "Y_B", "Y_C"])

# adjust axes for nicer plotting
axs = plt.gcf().axes
axs[1].set_xlim(axs[0].get_xlim())    # Y_B
axs[1].set_xticks([0, 5, 10, 15, 20]) # Y_B
axs[2].set_xlim(axs[0].get_xlim())    # Y_C
plt.show()
80000 simulated samples
Y_A Y_B Y_C
E[•] 10.000375 10.007825 12.490625
Var(•) 50.359379 5.005526 56.250615
SD(•) 7.096434 2.237303 7.500041
[Histograms: simulated distributions of $Y_A$, $Y_B$, $Y_C$.]
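
A quick check against theory: each fair flip has $\mathbb{E}[X_i] = 0.5$ and $\text{Var}(X_i) = 0.25$, so all three games have the same expectation, $\mathbb{E} = 10$, while $\text{Var}(Y_A) = 100 \cdot 0.25 + 100 \cdot 0.25 = 50$ and $\text{Var}(Y_B) = 20 \cdot 0.25 = 5$, matching the simulation. For $Y_C = 20 X_1$ the variance would be $400 \cdot 0.25 = 100$; the simulated column above was constructed as $5 + 15 X_1$ (values 5 or 20 rather than 0 or 20), which is why its simulated mean is about 12.5 and its variance is $225 \cdot 0.25 = 56.25$.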



If we flip 50 fair coins, the simulated distribution of the number of heads already has a beautiful Binomial shape:

In [35]:
# Simulate one big population of 50 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=50, rows=N)
df_coins["Y_50"] = df_coins.loc[:,"X_1":"X_50"].sum(axis=1)
plot_simulated_dist(df_coins, "Y_50")
plt.xticks([0, 10, 20, 30, 40, 50])
plt.show()
Y_50
E[•] 25.020512
Var(•) 12.470673
SD(•) 3.531384
[Histogram: simulated distribution of the number of heads in 50 flips.]
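
The count of heads in $n$ fair flips is Binomial$(n, p)$ with $p = 0.5$, so for $n = 50$ the exact values are $\mathbb{E} = np = 25$, $\text{Var} = np(1-p) = 12.5$, and $\text{SD} \approx 3.54$, all close to the simulated values above.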





From Population to Sample¶

Remember the population distribution we looked at earlier:

In [36]:
dist_df
Out[36]:
x P(X = x)
0 3 0.1
1 4 0.2
2 6 0.4
3 8 0.3
In [37]:
# A population generated from the distribution
N = 100000
all_samples = simulate_samples(dist_df, size=N)
sim_pop_df = pd.DataFrame({"X(s)": all_samples})
sim_pop_df
Out[37]:
X(s)
0 6
1 3
2 8
3 4
4 4
... ...
99995 4
99996 6
99997 3
99998 6
99999 6

100000 rows × 1 columns




Suppose we draw a sample of size 100 from this giant population.

We are performing random sampling with replacement: df.sample(n, replace=True) (see the pandas documentation for DataFrame.sample).
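
For reference, a sketch of the equivalent draw with NumPy (the same uniform draw with replacement, minus the index bookkeeping):

# Equivalent to sim_pop_df.sample(100, replace=True), as a plain array
np.random.choice(sim_pop_df["X(s)"], size=100, replace=True)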

In [38]:
n = 100      # Size of our sample
sample_df = (
             sim_pop_df.sample(n, replace=True)
             
             # Some reformatting below
             .reset_index(drop=True)
             .rename(columns={"X(s)": "X"})
            )
sample_df
Out[38]:
X
0 4
1 4
2 8
3 6
4 4
... ...
95 4
96 4
97 3
98 8
99 4

100 rows × 1 columns

Our sample distribution (n = 100):

In [39]:
sns.histplot(data=sample_df, x='X', stat='probability')
plt.xticks([3, 4, 6, 8])
plt.title(f"Sample (n = 100)")
plt.show()

print("Mean of Sample:", np.mean(sample_df['X']))
[Histogram: sample distribution of $X$ (n = 100).]
Mean of Sample: 5.87


Compare this to the simulated population of $X$ from earlier (N = 80,000), which follows the same distribution:

In [40]:
plot_simulated_dist(sim_df, "X(s)")
plt.title("Population of $X$")
plt.show()
X(s)
E[•] 5.895200
Var(•) 2.892903
SD(•) 1.700854
[Histogram: simulated population distribution of $X$.]