import pandas as pd
import numpy as np
import plotly.express as px
# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
"P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df
x | P(X = x) | |
---|---|---|
0 | 3 | 0.1 |
1 | 4 | 0.2 |
2 | 6 | 0.4 |
3 | 8 | 0.3 |
fig = px.bar(dist_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("distX.png", "png",scale=2)
fig
Let's use this probability distribution to generate a table of $X(s)$, i.e., random variable values for many many samples.
N = 80000
samples = np.random.choice(
dist_df["x"], # Draw from these choiecs
size=N, # This many times
p=dist_df["P(X = x)"]) # According to this distribution
sim_df = pd.DataFrame({"X(s)": samples})
sim_df
X(s) | |
---|---|
0 | 6 |
1 | 8 |
2 | 8 |
3 | 4 |
4 | 4 |
... | ... |
79995 | 6 |
79996 | 6 |
79997 | 4 |
79998 | 3 |
79999 | 6 |
80000 rows × 1 columns
Let's check how well this simulated sample matches our probability distribution!
fig = px.histogram(sim_df, x="X(s)", title="Empirical distribution of X",
histnorm="probability")
# fig.write_image("empirical_dist.png", "png",scale=2)
fig
print("Simulated E[X]:", sim_df['X(s)'].mean())
print("Simulated Var[X]:", sim_df['X(s)'].var())
Simulated E[X]: 5.903075 Simulated Var[X]: 2.8923416986462334
E_x = dist_df["x"] @ dist_df["P(X = x)"]
print("E[X]:",E_x)
E[X]: 5.9
Var_x = dist_df["x"]**2 @ dist_df["P(X = x)"] - E_x**2
print("Var[X]:", Var_x)
Var[X]: 2.8900000000000006
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
"P(X = x)": np.ones(6)/6})
roll_df
x | P(X = x) | |
---|---|---|
0 | 1 | 0.166667 |
1 | 2 | 0.166667 |
2 | 3 | 0.166667 |
3 | 4 | 0.166667 |
4 | 5 | 0.166667 |
5 | 6 | 0.166667 |
fig = px.bar(roll_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("die.png", "png",scale=2)
fig
Let $X_1, X_2$ are the outcomes of two dice rolls. Note $X_1$ and $X_2$ are i.i.d. (independent and identically distributed).
Below I call a helper function simulate_iid_df
, which simulates an 80,000-row table of $X_1, X_2$ values. It uses np.random.choice(arr, size, p)
link where arr
is the array the values and p
is the probability associated with choosing each value. If you're interested in the implementation details, scroll up.
N = 80000
sim_rolls_df = pd.DataFrame({
"X_1": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"]),
"X_2": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"])
})
sim_rolls_df
X_1 | X_2 | |
---|---|---|
0 | 4 | 4 |
1 | 5 | 2 |
2 | 6 | 3 |
3 | 5 | 5 |
4 | 2 | 6 |
... | ... | ... |
79995 | 4 | 3 |
79996 | 6 | 5 |
79997 | 4 | 2 |
79998 | 5 | 6 |
79999 | 2 | 5 |
80000 rows × 2 columns
Define the following random variables, which are functions of $X_1$ and $X_2$:
We can use our simulated values of $X_1, X_2$ to create new columns $Y$ and $Z$:
sim_rolls_df['Y'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['Z'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df
X_1 | X_2 | Y | Z | |
---|---|---|---|---|
0 | 4 | 4 | 8 | 8 |
1 | 5 | 2 | 10 | 7 |
2 | 6 | 3 | 12 | 9 |
3 | 5 | 5 | 10 | 10 |
4 | 2 | 6 | 4 | 8 |
... | ... | ... | ... | ... |
79995 | 4 | 3 | 8 | 7 |
79996 | 6 | 5 | 12 | 11 |
79997 | 4 | 2 | 8 | 6 |
79998 | 5 | 6 | 10 | 11 |
79999 | 2 | 5 | 4 | 7 |
80000 rows × 4 columns
Now that we have simulated samples of $Y$ and $Z$, we can plot histograms to see their distributions!
px.histogram(sim_rolls_df[["Y", "Z"]].melt(), x="value", color="variable",
barmode="overlay", histnorm="probability",
title="Empirical Distributions")
pd.DataFrame([
sim_rolls_df[["Y", "Z"]].mean().rename("Mean"),
sim_rolls_df[["Y", "Z"]].var().rename("Var"),
np.sqrt(sim_rolls_df[["Y", "Z"]].var()).rename("SD")
])
Y | Z | |
---|---|---|
Mean | 6.995100 | 7.000600 |
Var | 11.661522 | 5.845173 |
SD | 3.414897 | 2.417679 |
First let's construct the probability distribution for a single coin. This will let us flip 20 IID coins later.
# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [1, 0], # [Heads, Tails]
"P(X = x)": [p, 1 - p]})
coin_df
x | P(X = x) | |
---|---|---|
0 | 1 | 0.5 |
1 | 0 | 0.5 |
$\large Y_A = 10 X_1 + 10 X_2 $
N = 10000
np.random.rand(N,2) < p
array([[ True, False], [False, True], [False, False], ..., [False, True], [ True, True], [ True, False]])
sim_flips = pd.DataFrame(
{"Choice A": np.sum((np.random.rand(N,2) < p) * 10, axis=1)})
sim_flips
Choice A | |
---|---|
0 | 20 |
1 | 10 |
2 | 10 |
3 | 0 |
4 | 0 |
... | ... |
9995 | 10 |
9996 | 0 |
9997 | 20 |
9998 | 0 |
9999 | 0 |
10000 rows × 1 columns
$\large Y_B = \sum\limits_{i=1}^{20} X_i$
sim_flips["Choice B"] = np.sum((np.random.rand(N,20) < p), axis=1)
sim_flips
Choice A | Choice B | |
---|---|---|
0 | 20 | 11 |
1 | 10 | 9 |
2 | 10 | 6 |
3 | 0 | 13 |
4 | 0 | 11 |
... | ... | ... |
9995 | 10 | 10 |
9996 | 0 | 9 |
9997 | 20 | 11 |
9998 | 0 | 9 |
9999 | 0 | 9 |
10000 rows × 2 columns
$\large Y_C = 20 X_1$
sim_flips["Choice C"] = 20 * (np.random.rand(N,1) < p)
sim_flips
Choice A | Choice B | Choice C | |
---|---|---|---|
0 | 20 | 11 | 20 |
1 | 10 | 9 | 20 |
2 | 10 | 6 | 20 |
3 | 0 | 13 | 20 |
4 | 0 | 11 | 0 |
... | ... | ... | ... |
9995 | 10 | 10 | 0 |
9996 | 0 | 9 | 20 |
9997 | 20 | 11 | 20 |
9998 | 0 | 9 | 20 |
9999 | 0 | 9 | 0 |
10000 rows × 3 columns
px.histogram(sim_flips.melt(), x="value", facet_row="variable",
barmode="overlay", histnorm="probability",
title="Empirical Distributions",
width=600, height=600)
pd.DataFrame([
sim_flips.mean().rename("Mean"),
sim_flips.var().rename("Var"),
np.sqrt(sim_flips.var()).rename("SD")
])
Choice A | Choice B | Choice C | |
---|---|---|---|
Mean | 10.076000 | 9.995000 | 10.064000 |
Var | 50.319256 | 4.933868 | 100.005905 |
SD | 7.093607 | 2.221231 | 10.000295 |
dist_df
x | P(X = x) | |
---|---|---|
0 | 3 | 0.1 |
1 | 4 | 0.2 |
2 | 6 | 0.4 |
3 | 8 | 0.3 |
# A population generated from the distribution
N = 100000
all_samples = np.random.choice(dist_df["x"], N, p=dist_df["P(X = x)"])
sim_pop_df = pd.DataFrame({"X(s)": all_samples})
sim_pop_df
X(s) | |
---|---|
0 | 6 |
1 | 8 |
2 | 8 |
3 | 8 |
4 | 6 |
... | ... |
99995 | 6 |
99996 | 6 |
99997 | 4 |
99998 | 4 |
99999 | 8 |
100000 rows × 1 columns
Suppose we draw a sample of size 100 from this giant population.
We are performing Random Sampling with Replacement: df.sample(n, replace=True)
(link)
n = 100 # Size of our sample
sample_df = (
sim_pop_df.sample(n, replace=True)
# Some reformatting below
.reset_index(drop=True)
.rename(columns={"X(s)": "X"})
)
sample_df
X | |
---|---|
0 | 6 |
1 | 6 |
2 | 4 |
3 | 8 |
4 | 6 |
... | ... |
95 | 8 |
96 | 8 |
97 | 6 |
98 | 6 |
99 | 8 |
100 rows × 1 columns
Our sample distribution (n = 100):
px.histogram(sample_df, x="X", histnorm="probability", title="Sample (n = 100)")
Compare this to our original population (N = 80,000):
px.histogram(sim_df, x="X(s)", histnorm="probability", title="Population of X")
pd.DataFrame(
{"Sample": [sample_df["X"].mean(), sample_df["X"].var(), np.sqrt(sample_df["X"].var())],
"Population": [sim_df["X(s)"].mean(), sim_df["X(s)"].var(), np.sqrt(sim_df["X(s)"].var())]})
Sample | Population | |
---|---|---|
0 | 5.840000 | 5.903075 |
1 | 2.903434 | 2.892342 |
2 | 1.703947 | 1.700689 |