## Code

```
import seaborn as sns
import pandas as pd
# Scale seaborn's fonts up for every plot created below.
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import plotly.graph_objects as go
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
def adjust_fontsize(size=None):
    """Set matplotlib's default font sizes through ``plt.rc``.

    Args:
        size: Font size applied to every text element. When ``None``,
            the built-in defaults are used: 8 (small), 10 (medium) and
            12 (bigger).
    """
    if size is None:
        small_size, medium_size, bigger_size = 8, 10, 12
    else:
        # A single explicit size overrides all three tiers.
        small_size = medium_size = bigger_size = size

    plt.rc('font', size=small_size)          # controls default text sizes
    plt.rc('axes', titlesize=small_size)     # fontsize of the axes title
    plt.rc('axes', labelsize=medium_size)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=small_size)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=small_size)    # fontsize of the tick labels
    plt.rc('legend', fontsize=small_size)    # legend fontsize
    plt.rc('figure', titlesize=bigger_size)  # fontsize of the figure title
'fivethirtyeight')
plt.style.use("talk")
sns.set_context(
sns.set_theme()=20)
adjust_fontsize(size%matplotlib inline
import warnings
'ignore')
warnings.filterwarnings(
# Helper functions to plot distributions and
# compute expectation, variance, and standard deviation
def plot_dist(dist_df,
              xname="x", pname="P(X = x)", varname="X",
              save=False):
    """
    Plot a distribution from a distribution table.
    Single-variate.

    Args:
        dist_df: Table with one column of outcomes and one column of
            probabilities.
        xname: Name of the outcome column (bar positions and x label).
        pname: Name of the probability column (bar heights and y label).
        varname: Variable name shown in the title and used in the
            saved file name.
        save: If true, save the current figure as ``dist{varname}.png``
            with a transparent figure background.
    """
    plt.bar(dist_df[xname], dist_df[pname])
    plt.ylabel(pname)
    plt.xlabel(xname)
    plt.title(f"Distribution of ${varname}$")
    # Put a tick on every distinct outcome, in ascending order.
    plt.xticks(sorted(dist_df[xname].unique()))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)  # transparent figure background
        plt.savefig(f"dist{varname}.png", bbox_inches='tight')
def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
    """
    Draw random samples from a distribution table.

    Args:
        df: Table with one column of outcomes and one column of
            probabilities.
        xname: Name of the outcome column.
        pname: Name of the probability column.
        size: Number of draws.

    Returns:
        The result of ``np.random.choice``: ``size`` values taken from
        ``df[xname]``, weighted by ``df[pname]``.
    """
    return np.random.choice(
        df[xname],     # Draw from these choices
        size=size,     # This many times
        p=df[pname])   # According to this distribution
def simulate_iid_df(dist_df, nvars, rows, varname="X"):
    """
    Make an (row x nvars) dataframe
    by calling simulate_samples for each of the nvars per row

    Args:
        dist_df: Distribution table passed to ``simulate_samples``
            (its default column names are used).
        nvars: Number of columns to generate.
        rows: Number of samples drawn for each column.
        varname: Column-name prefix; columns are named
            ``{varname}_1`` ... ``{varname}_{nvars}``.

    Returns:
        A ``pd.DataFrame`` with ``nvars`` columns of ``rows`` draws each.
    """
    sample_dict = {}
    for i in range(nvars):
        # Generate many datapoints
        sample_dict[f"{varname}_{i+1}"] = \
            simulate_samples(dist_df, size=rows)
    return pd.DataFrame(sample_dict)
def plot_simulated_dist(df, colname, show_stats=True, save=False, **kwargs):
    """
    Plot a simulated population.

    Args:
        df: Table holding the simulated values.
        colname: Column of ``df`` to plot.
        show_stats: If true, display the summary table produced by
            ``stats_df_multi`` for this column.
        save: If true, save the current figure as ``sim{colname}.png``
            with a transparent figure background.
        **kwargs: Forwarded unchanged to ``sns.histplot``.
    """
    sns.histplot(data=df, x=colname, stat='probability', discrete=True, **kwargs)
    # Explicit ticks on every observed value, in case there are gaps.
    plt.xticks(sorted(df[colname].unique()))
    if show_stats:
        display(stats_df_multi(df, [colname]))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)  # transparent figure background
        plt.savefig(f"sim{colname}.png", bbox_inches='tight')
def stats_df_multi(df, colnames):
    """
    Summarize columns by mean, variance, and standard deviation.

    Variance and standard deviation use the pandas defaults for
    ``var``/``std`` (sample statistics, ``ddof=1``).

    Args:
        df: Table holding the data.
        colnames: List of column names to summarize.

    Returns:
        A ``pd.DataFrame`` with rows ``"E[•]"``, ``"Var(•)"``, ``"SD(•)"``
        and one column per entry of ``colnames``.
    """
    selected = df[colnames]
    means = selected.mean(axis=0)
    variances = selected.var(axis=0)
    stdevs = selected.std(axis=0)
    # One row per statistic, one column per requested column.
    df_stats = pd.concat([means, variances, stdevs], axis=1).T
    df_stats.index = ["E[•]", "Var(•)", "SD(•)"]
    return df_stats
def plot_simulated_dist_multi(df, colnames, show_stats=True):
    """
    If multiple columns provided, use separate plots.

    The plots are stacked vertically in a single figure, one row per
    column name.

    Args:
        df: Table holding the simulated values.
        colnames: Column names to plot, one subplot each.
        show_stats: If true, display one combined summary table from
            ``stats_df_multi`` after plotting.
    """
    ncols = 1
    nrows = len(colnames)
    plt.figure(figsize=(6, 2*nrows+2))

    for position, colname in enumerate(colnames, start=1):
        # Use the (nrows, ncols, index) form: the packed 3-digit integer
        # form only works while every value is below 10.
        plt.subplot(nrows, ncols, position)
        # Per-plot stats are suppressed; a combined table is shown below.
        plot_simulated_dist(df, colname, show_stats=False)
    plt.tight_layout()
    if show_stats:
        display(stats_df_multi(df, colnames))
```