import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(font_scale=1.5)
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
adjust_fontsize(size=20)
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Helper functions to plot and compute
# expectation, variance, and standard deviation
def plot_dist(dist_df,
              xname="x", pname="P(X = x)", varname="X",
              save=False):
    """
    Plot a distribution from a distribution table.
    Single-variate.
    """
    plt.bar(dist_df[xname], dist_df[pname])
    plt.ylabel(pname)
    plt.xlabel(xname)
    plt.title(f"Distribution of ${varname}$")
    plt.xticks(sorted(dist_df[xname].unique()))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"dist{varname}.png", bbox_inches='tight')
def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
    return np.random.choice(
        df[xname],    # Draw from these choices
        size=size,    # This many times
        p=df[pname])  # According to this distribution
def simulate_iid_df(dist_df, nvars, rows, varname="X"):
    """
    Make a (rows x nvars) DataFrame by calling
    simulate_samples once for each of the nvars columns.
    """
    sample_dict = {}
    for i in range(nvars):
        # Generate many datapoints
        sample_dict[f"{varname}_{i+1}"] = \
            simulate_samples(dist_df, size=rows)
    return pd.DataFrame(sample_dict)
def plot_simulated_dist(df, colname, show_stats=True, save=False, **kwargs):
    """
    Plot a simulated population.
    """
    sns.histplot(data=df, x=colname, stat='probability', discrete=True, **kwargs)
    plt.xticks(sorted(df[colname].unique()))  # in case there are gaps
    if show_stats:
        display(stats_df_multi(df, [colname]))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"sim{colname}.png", bbox_inches='tight')
def stats_df_multi(df, colnames):
    means = df[colnames].mean(axis=0)
    variances = df[colnames].var(axis=0)
    stdevs = df[colnames].std(axis=0)
    df_stats = pd.concat([means, variances, stdevs], axis=1).T
    df_stats['index_col'] = ["E[•]", "Var(•)", "SD(•)"]
    df_stats = df_stats.set_index('index_col', drop=True).rename_axis(None)
    return df_stats
def plot_simulated_dist_multi(df, colnames, show_stats=True):
    """
    If multiple columns are provided, use a separate subplot for each.
    """
    ncols = 1
    nrows = len(colnames)
    plt.figure(figsize=(6, 2*nrows + 2))
    for i, colname in enumerate(colnames):
        subplot_int = int(100*int(nrows) + 10*int(ncols) + int(i+1))
        plt.subplot(subplot_int)
        plot_simulated_dist(df, colname, show_stats=False)
        plt.tight_layout()
    if show_stats:
        display(stats_df_multi(df, colnames))
# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
"P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df
|   | x | P(X = x) |
|---|---|---|
| 0 | 3 | 0.1 |
| 1 | 4 | 0.2 |
| 2 | 6 | 0.4 |
| 3 | 8 | 0.3 |
plot_dist(dist_df, save=False)
Let's use this probability distribution to generate a table of $X(s)$, i.e., random variable values for a large number of samples.
# copied from above
# def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
#     return np.random.choice(
#         df[xname],    # draw from these choices
#         size=size,    # this many times
#         p=df[pname])  # according to this distribution
N = 80000
all_samples = simulate_samples(dist_df, size=N)
sim_df = pd.DataFrame({"X(s)": all_samples})
sim_df
|   | X(s) |
|---|---|
| 0 | 3 |
| 1 | 8 |
| 2 | 4 |
| 3 | 6 |
| 4 | 8 |
| ... | ... |
| 79995 | 6 |
| 79996 | 4 |
| 79997 | 8 |
| 79998 | 8 |
| 79999 | 3 |

80000 rows × 1 columns
Let's check how well this simulated sample matches our probability distribution!
plot_simulated_dist(sim_df, "X(s)")
plt.title("Simulated distribution of $X$")
plt.show()
|   | X(s) |
|---|---|
| E[•] | 5.893162 |
| Var(•) | 2.885584 |
| SD(•) | 1.698701 |
# The tabular view of the above plot
sim_df.value_counts("X(s)").sort_values()/N
X(s)
3    0.100362
4    0.200650
8    0.297775
6    0.401213
dtype: float64
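As a cross-check, here is a minimal sketch that computes the exact expectation and variance directly from the probability table; the simulated values above should land close to these.

# Exact E[X] = sum of x * P(X = x), and Var(X) = E[(X - E[X])^2]
E_X = np.sum(dist_df["x"] * dist_df["P(X = x)"])
Var_X = np.sum((dist_df["x"] - E_X)**2 * dist_df["P(X = x)"])
print("E[X] =", E_X, " Var(X) =", Var_X, " SD(X) =", np.sqrt(Var_X))
# E[X] = 5.9, Var(X) = 2.89, SD(X) = 1.7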
Die Is the Singular; Dice Is the Plural
Let $X$ be the outcome of a single die roll. $X$ is a random variable.
# Our random variable X
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
"P(X = x)": np.ones(6)/6})
roll_df
|   | x | P(X = x) |
|---|---|---|
| 0 | 1 | 0.166667 |
| 1 | 2 | 0.166667 |
| 2 | 3 | 0.166667 |
| 3 | 4 | 0.166667 |
| 4 | 5 | 0.166667 |
| 5 | 6 | 0.166667 |
plot_dist(roll_df)
Let $X_1, X_2$ be the outcomes of two dice rolls. Note that $X_1$ and $X_2$ are i.i.d. (independent and identically distributed).
Below I call a helper function `simulate_iid_df`, which simulates an 80,000-row table of $X_1, X_2$ values. It uses `np.random.choice(arr, size, p)`, where `arr` is the array of values to draw from and `p` gives the probability of choosing each value. If you're interested in the implementation details, scroll up.
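For illustration, here is a direct call without the helper: five draws from the fair-die distribution.

# Five draws straight from np.random.choice, using roll_df's values and probabilities
np.random.choice(roll_df["x"], size=5, p=roll_df["P(X = x)"])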
N = 80000
sim_rolls_df = simulate_iid_df(roll_df, nvars=2, rows=N)
sim_rolls_df
|   | X_1 | X_2 |
|---|---|---|
| 0 | 4 | 6 |
| 1 | 1 | 3 |
| 2 | 2 | 5 |
| 3 | 6 | 1 |
| 4 | 6 | 3 |
| ... | ... | ... |
| 79995 | 4 | 5 |
| 79996 | 3 | 4 |
| 79997 | 2 | 4 |
| 79998 | 2 | 2 |
| 79999 | 4 | 6 |

80000 rows × 2 columns
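Since $X_1$ and $X_2$ are i.i.d., every pair of faces should appear roughly $1/36 \approx 0.028$ of the time. A quick empirical sanity check (a minimal sketch):

# Empirical joint distribution of (X_1, X_2); each cell should be close to 1/36
pd.crosstab(sim_rolls_df["X_1"], sim_rolls_df["X_2"], normalize=True)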
Define the following random variables, which are functions of $X_1$ and $X_2$:
- $Y = X_1 + X_1 = 2 X_1$
- $Z = X_1 + X_2$
We can use our simulated values of $X_1, X_2$ to create new columns $Y$ and $Z$:
sim_rolls_df['Y'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['Z'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df
|   | X_1 | X_2 | Y | Z |
|---|---|---|---|---|
| 0 | 4 | 6 | 8 | 10 |
| 1 | 1 | 3 | 2 | 4 |
| 2 | 2 | 5 | 4 | 7 |
| 3 | 6 | 1 | 12 | 7 |
| 4 | 6 | 3 | 12 | 9 |
| ... | ... | ... | ... | ... |
| 79995 | 4 | 5 | 8 | 9 |
| 79996 | 3 | 4 | 6 | 7 |
| 79997 | 2 | 4 | 4 | 6 |
| 79998 | 2 | 2 | 4 | 4 |
| 79999 | 4 | 6 | 8 | 10 |

80000 rows × 4 columns
Now that we have simulated samples of $Y$ and $Z$, we can plot histograms to see their distributions!
Distribution of $Y$, which was twice the value of our first die roll:
plot_simulated_dist(sim_rolls_df, "Y", save=False)
|   | Y |
|---|---|
| E[•] | 6.998975 |
| Var(•) | 11.700545 |
| SD(•) | 3.420606 |
Distribution of $Z$, the sum of two IID dice rolls:
plot_simulated_dist(sim_rolls_df, "Z", save=False, color='gold')
|   | Z |
|---|---|
| E[•] | 6.991775 |
| Var(•) | 5.854556 |
| SD(•) | 2.419619 |
Let's compare the expectations and variances of these simulated distributions of $Y$ and $Z$.
- We computed:
  - $\mathbb{E}[Y]$ as `np.mean(sim_rolls_df['Y'])`
  - $\text{Var}(Y)$ as `np.var(sim_rolls_df['Y'])`
  - etc.
- The larger the number of simulated rows $N$, the closer the simulated expectation will be to the true expectation.
- This approach is tedious: we have to simulate an entire population, then reduce it to an expectation, variance, and standard deviation. There has to be a better way! (See the sketch after the stats table below.)
stats_df_multi(sim_rolls_df, ["Y", "Z"])
|   | Y | Z |
|---|---|---|
| E[•] | 6.998975 | 6.991775 |
| Var(•) | 11.700545 | 5.854556 |
| SD(•) | 3.420606 | 2.419619 |
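The better way, sketched here: compute everything exactly from the single-die distribution table, using linearity of expectation, $\text{Var}(aX) = a^2\,\text{Var}(X)$, and the fact that variances of independent random variables add.

# Exact E[X] and Var(X) for one fair die, straight from roll_df
E_X = np.sum(roll_df["x"] * roll_df["P(X = x)"])               # 3.5
Var_X = np.sum((roll_df["x"] - E_X)**2 * roll_df["P(X = x)"])  # 35/12 ≈ 2.917

# Y = 2*X_1, so E[Y] = 2*E[X] and Var(Y) = 4*Var(X)
print("E[Y] =", 2*E_X, " Var(Y) =", 4*Var_X)   # 7.0 and ≈ 11.67
# Z = X_1 + X_2 with X_1, X_2 independent, so Var(Z) = 2*Var(X)
print("E[Z] =", 2*E_X, " Var(Z) =", 2*Var_X)   # 7.0 and ≈ 5.83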
Which would you pick?
- $\large Y_A = 10 X_1 + 10 X_2 $
- $\large Y_B = \sum\limits_{i=1}^{20} X_i$
- $\large Y_C = 20 X_1$
First let's construct the probability distribution for a single coin. This will let us flip 20 IID coins later.
# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [0, 1], # [Tails, Heads]
"P(X = x)": [p, 1 - p]})
coin_df
|   | x | P(X = x) |
|---|---|---|
| 0 | 0 | 0.5 |
| 1 | 1 | 0.5 |
Choice A:
$\large Y_A = 10 X_1 + 10 X_2 $
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)
# Construct Y_A from this sample
flips20_df["Y_A"] = 10*flips20_df["X_1"] + 10*flips20_df["X_2"]
print("Sample of size 1:")
display(flips20_df)
print("Y_A:", flips20_df.loc[0,"Y_A"])
Sample of size 1:
|   | X_1 | X_2 | X_3 | X_4 | X_5 | X_6 | X_7 | X_8 | X_9 | X_10 | ... | X_12 | X_13 | X_14 | X_15 | X_16 | X_17 | X_18 | X_19 | X_20 | Y_A |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 20 |

1 rows × 21 columns
Y_A: 20
Choice B:
$\large Y_B = \sum\limits_{i=1}^{20} X_i$
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)
# Construct Y_B from this sample
flips20_df["Y_B"] = flips20_df.sum(axis=1) # sum all coins
display(flips20_df)
print("Y_B:", flips20_df.loc[0,"Y_B"])
|   | X_1 | X_2 | X_3 | X_4 | X_5 | X_6 | X_7 | X_8 | X_9 | X_10 | ... | X_12 | X_13 | X_14 | X_15 | X_16 | X_17 | X_18 | X_19 | X_20 | Y_B |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 10 |

1 rows × 21 columns
Y_B: 10
Choice C:
$\large Y_C = 20 X_1$
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)
# Construct Y_C from this sample
flips20_df["Y_C"] = 20*flips20_df["X_1"]
display(flips20_df[["X_1", "Y_C"]])
print("Y_C:", flips20_df.loc[0,"Y_C"])
|   | X_1 | Y_C |
|---|---|---|
| 0 | 0 | 0 |
Y_C: 0
If you're curious as to what these distributions look like, I've simulated some populations:
# Simulate one big population of 20 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=20, rows=N)
print(f"{N} simulated samples")
# construct Y_A, Y_B, Y_C from this population
df_coins["Y_A"] = 10*df_coins["X_1"] + 10*df_coins["X_2"]
df_coins["Y_B"] = df_coins.loc[:,"X_1":"X_20"].sum(axis=1)
df_coins["Y_C"] = 5 + 15*df_coins["X_1"]
plot_simulated_dist_multi(df_coins, ["Y_A", "Y_B", "Y_C"])
# adjust axes for nicer plotting
axs = plt.gcf().axes
axs[1].set_xlim(axs[0].get_xlim()) # Y_B
axs[1].set_xticks([0, 5, 10, 15, 20]) # Y_B
axs[2].set_xlim(axs[0].get_xlim()) # Y_C
plt.show()
80000 simulated samples
|   | Y_A | Y_B | Y_C |
|---|---|---|---|
| E[•] | 10.010500 | 9.993037 | 12.508250 |
| Var(•) | 49.928014 | 4.943926 | 56.250635 |
| SD(•) | 7.065976 | 2.223494 | 7.500042 |
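As a sketch of the exact values, using $\mathbb{E}[X_i] = p = 0.5$ and $\text{Var}(X_i) = p(1-p) = 0.25$ for one fair flip. Note that the plotting cell above constructs $Y_C = 5 + 15X_1$, a shifted and rescaled version of the first flip, so the exact values below follow that code rather than the $20X_1$ defined earlier.

# Exact expectations and variances for the variables as constructed above
E_X, Var_X = 0.5, 0.25
print("Y_A: E =", 10*E_X + 10*E_X, " Var =", 100*Var_X + 100*Var_X)  # 10.0, 50.0
print("Y_B: E =", 20*E_X,          " Var =", 20*Var_X)               # 10.0, 5.0
print("Y_C: E =", 5 + 15*E_X,      " Var =", 15**2 * Var_X)          # 12.5, 56.25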
If we flip even more coins (50 of them here), the Binomial shape of the sum becomes even clearer:
# Simulate one big population of 50 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=50, rows=N)
df_coins["Y_50"] = df_coins.loc[:,"X_1":"X_50"].sum(axis=1)
plot_simulated_dist(df_coins, "Y_50")
plt.xticks([0, 10, 20, 30, 40, 50])
plt.show()
|   | Y_50 |
|---|---|
| E[•] | 24.982575 |
| Var(•) | 12.461177 |
| SD(•) | 3.530039 |
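For comparison, the exact Binomial($n = 50$, $p = 0.5$) values, from the standard formulas $np$ and $np(1-p)$ (a quick sketch):

# Exact mean, variance, and SD of a Binomial(50, 0.5) sum
n, p = 50, 0.5
print("E =", n*p, " Var =", n*p*(1 - p), " SD =", np.sqrt(n*p*(1 - p)))
# E = 25.0, Var = 12.5, SD ≈ 3.54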
dist_df
|   | x | P(X = x) |
|---|---|---|
| 0 | 3 | 0.1 |
| 1 | 4 | 0.2 |
| 2 | 6 | 0.4 |
| 3 | 8 | 0.3 |
# A population generated from the distribution
N = 100000
all_samples = simulate_samples(dist_df, size=N)
sim_pop_df = pd.DataFrame({"X(s)": all_samples})
sim_pop_df
|   | X(s) |
|---|---|
| 0 | 8 |
| 1 | 4 |
| 2 | 6 |
| 3 | 4 |
| 4 | 6 |
| ... | ... |
| 99995 | 8 |
| 99996 | 8 |
| 99997 | 4 |
| 99998 | 8 |
| 99999 | 6 |

100000 rows × 1 columns
Suppose we draw a sample of size 100 from this giant population.
We are performing Random Sampling with Replacement: `df.sample(n, replace=True)` (see the pandas documentation for `DataFrame.sample`).
n = 100 # Size of our sample
sample_df = (
    sim_pop_df.sample(n, replace=True)
    # Some reformatting below
    .reset_index(drop=True)
    .rename(columns={"X(s)": "X"})
)
sample_df
|   | X |
|---|---|
| 0 | 8 |
| 1 | 3 |
| 2 | 8 |
| 3 | 4 |
| 4 | 6 |
| ... | ... |
| 95 | 6 |
| 96 | 6 |
| 97 | 8 |
| 98 | 3 |
| 99 | 6 |

100 rows × 1 columns
Our sample distribution (n = 100):
sns.histplot(data=sample_df, x='X', stat='probability')
plt.xticks([3, 4, 6, 8])
plt.title(f"Sample (n = 100)")
plt.show()
print("Mean of Sample:", np.mean(sample_df['X']))
Mean of Sample: 6.02
Compare this to the large simulated population of $X$ from earlier (N = 80,000):
plot_simulated_dist(sim_df, "X(s)")
plt.title("Population of $X$")
plt.show()
|   | X(s) |
|---|---|
| E[•] | 5.893162 |
| Var(•) | 2.885584 |
| SD(•) | 1.698701 |