import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300
A Random Variable $X$¶
This section just replicates figures in the lecture. We won't go through it together.
Our probability distribution of $X$, shown as a table:
# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
"P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df
x | P(X = x) | |
---|---|---|
0 | 3 | 0.1 |
1 | 4 | 0.2 |
2 | 6 | 0.4 |
3 | 8 | 0.3 |
fig = px.bar(dist_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("distX.png", "png",scale=2)
fig
Let's use this probability distribution to generate a table of $X_i$'s (i.e., simulated draws from the probability distirbution above).
N = 80000
samples = np.random.choice(
dist_df["x"], # Draw from these choiecs
size=N, # This many times
p=dist_df["P(X = x)"]) # According to this distribution
sim_df = pd.DataFrame({"X": samples})
sim_df
X | |
---|---|
0 | 6 |
1 | 4 |
2 | 4 |
3 | 8 |
4 | 8 |
... | ... |
79995 | 8 |
79996 | 4 |
79997 | 4 |
79998 | 6 |
79999 | 3 |
80000 rows × 1 columns
Let's check how well this simulated sample matches our probability distribution!
fig = px.histogram(sim_df, x="X", title="Empirical distribution of X",
histnorm="probability")
# fig.write_image("empirical_dist.png", "png",scale=2)
fig
print("Simulated E[X]:", sim_df['X'].mean())
print("Simulated Var[X]:", sim_df['X'].var())
Simulated E[X]: 5.8871875 Simulated Var[X]: 2.894022015118938
E_x = dist_df["x"] @ dist_df["P(X = x)"]
print("E[X]:",E_x)
E[X]: 5.9
Var_x = dist_df["x"]**2 @ dist_df["P(X = x)"] - E_x**2
print("Var[X]:", Var_x)
Var[X]: 2.8900000000000006
🎲 🎲 Sum of 2 Dice Rolls¶
Here's where the lecture demo starts!
Here's the distribution of a single die roll:
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
"P(X = x)": np.ones(6)/6})
roll_df
x | P(X = x) | |
---|---|---|
0 | 1 | 0.166667 |
1 | 2 | 0.166667 |
2 | 3 | 0.166667 |
3 | 4 | 0.166667 |
4 | 5 | 0.166667 |
5 | 6 | 0.166667 |
fig = px.bar(roll_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("die.png", "png",scale=2)
fig
Let $X_1, X_2$ are the outcomes of two dice rolls. Note $X_1$ and $X_2$ are i.i.d. (independent and identically distributed).
Below, we simulate an 80,000-row table of $X_1, X_2$ values.
The code uses np.random.choice(arr, size, p)
(documentation) where arr
is the array the values and p
is the probability associated with choosing each value.
N = 80000
# roll_df["x"] are the values 1 through 6
# roll_df["P(X=x)"] is a column of a constant 1/6, since all faces are
# equally likely
sim_rolls_df = pd.DataFrame({
"X_1": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"]),
"X_2": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"])
})
sim_rolls_df
X_1 | X_2 | |
---|---|---|
0 | 5 | 6 |
1 | 2 | 1 |
2 | 2 | 2 |
3 | 6 | 6 |
4 | 3 | 3 |
... | ... | ... |
79995 | 6 | 5 |
79996 | 2 | 1 |
79997 | 2 | 1 |
79998 | 6 | 1 |
79999 | 6 | 6 |
80000 rows × 2 columns
We can use our simulated values of $X_1, X_2$ to create new columns $2 X_1$ and $X_1 + X_2$:
sim_rolls_df['2 X_1'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['X_1 + X_2'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df
X_1 | X_2 | 2 X_1 | X_1 + X_2 | |
---|---|---|---|---|
0 | 5 | 6 | 10 | 11 |
1 | 2 | 1 | 4 | 3 |
2 | 2 | 2 | 4 | 4 |
3 | 6 | 6 | 12 | 12 |
4 | 3 | 3 | 6 | 6 |
... | ... | ... | ... | ... |
79995 | 6 | 5 | 12 | 11 |
79996 | 2 | 1 | 4 | 3 |
79997 | 2 | 1 | 4 | 3 |
79998 | 6 | 1 | 12 | 7 |
79999 | 6 | 6 | 12 | 12 |
80000 rows × 4 columns
Now that we have simulated samples of $2 X_1$ and $X_1 + X_2$, we can plot histograms to see their distributions:
plot_df = sim_rolls_df[["2 X_1", "X_1 + X_2"]].melt()
# Show 5 random rows of plot_df
display(plot_df.sample(5))
px.histogram(
plot_df,
x="value",
color="variable",
barmode="overlay",
histnorm="probability",
title="Empirical Distributions"
)
variable | value | |
---|---|---|
112749 | X_1 + X_2 | 6 |
93549 | X_1 + X_2 | 10 |
112339 | X_1 + X_2 | 10 |
133628 | X_1 + X_2 | 5 |
61287 | 2 X_1 | 4 |
# The empirical means are nearly identical, but the variance
# of the two rolls is lower than doubling the first roll.
pd.DataFrame([
sim_rolls_df[["2 X_1", "X_1 + X_2"]].mean().rename("Mean"),
sim_rolls_df[["2 X_1", "X_1 + X_2"]].var().rename("Var"),
np.sqrt(sim_rolls_df[["2 X_1", "X_1 + X_2"]].var()).rename("SD")
])
2 X_1 | X_1 + X_2 | |
---|---|---|
Mean | 6.976950 | 6.984587 |
Var | 11.705215 | 5.899749 |
SD | 3.421289 | 2.428940 |
Which would you pick?¶
This is a similar demonstration to the double dice roll above. We won't cover it together during the lecture.
- $\large Y_A = 10 X_1 + 10 X_2 $
- $\large Y_B = \sum\limits_{i=1}^{20} X_i$
- $\large Y_C = 20 X_1$
First let's construct the probability distribution for a single coin. This will let us flip 20 IID coins later.
# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [1, 0], # [Heads, Tails]
"P(X = x)": [p, 1 - p]})
coin_df
x | P(X = x) | |
---|---|---|
0 | 1 | 0.5 |
1 | 0 | 0.5 |
Choice A:¶
$\large Y_A = 10 X_1 + 10 X_2 $
N = 10000
np.random.rand(N,2) < p
array([[False, False], [False, True], [ True, False], ..., [ True, True], [ True, False], [ True, False]])
sim_flips = pd.DataFrame(
{"Choice A": np.sum((np.random.rand(N,2) < p) * 10, axis=1)})
sim_flips
Choice A | |
---|---|
0 | 10 |
1 | 20 |
2 | 20 |
3 | 20 |
4 | 10 |
... | ... |
9995 | 10 |
9996 | 10 |
9997 | 0 |
9998 | 10 |
9999 | 20 |
10000 rows × 1 columns
Choice B:¶
$\large Y_B = \sum\limits_{i=1}^{20} X_i$
sim_flips["Choice B"] = np.sum((np.random.rand(N,20) < p), axis=1)
sim_flips
Choice A | Choice B | |
---|---|---|
0 | 10 | 13 |
1 | 20 | 10 |
2 | 20 | 12 |
3 | 20 | 12 |
4 | 10 | 8 |
... | ... | ... |
9995 | 10 | 13 |
9996 | 10 | 11 |
9997 | 0 | 7 |
9998 | 10 | 5 |
9999 | 20 | 12 |
10000 rows × 2 columns
Choice C:¶
$\large Y_C = 20 X_1$
sim_flips["Choice C"] = 20 * (np.random.rand(N,1) < p)
sim_flips
Choice A | Choice B | Choice C | |
---|---|---|---|
0 | 10 | 13 | 20 |
1 | 20 | 10 | 0 |
2 | 20 | 12 | 0 |
3 | 20 | 12 | 20 |
4 | 10 | 8 | 0 |
... | ... | ... | ... |
9995 | 10 | 13 | 0 |
9996 | 10 | 11 | 0 |
9997 | 0 | 7 | 0 |
9998 | 10 | 5 | 0 |
9999 | 20 | 12 | 0 |
10000 rows × 3 columns
If you're curious as to what these distributions look like, I've simulated some populations:
px.histogram(sim_flips.melt(), x="value", facet_row="variable",
barmode="overlay", histnorm="probability",
title="Empirical Distributions",
width=600, height=600)
pd.DataFrame([
sim_flips.mean().rename("Mean"),
sim_flips.var().rename("Var"),
np.sqrt(sim_flips.var()).rename("SD")
])
Choice A | Choice B | Choice C | |
---|---|---|---|
Mean | 10.052000 | 10.052400 | 10.146000 |
Var | 50.422338 | 4.984153 | 99.988683 |
SD | 7.100869 | 2.232522 | 9.999434 |
Visualizing the Binomial Distribution¶
This section replicates figures from the lecture slides. We won't cover it together.
The binomial distribution provides the probability of $y$ successes among $n$ trials, where each trial has $p$ probability of success:
$$ P(Y=y) = \binom{n}{y} p^y (1-p)^{n-y} $$
Equivalently, a binomial RV is a sum of $n$ Bernoulli RVs, each with probability $p$ of success.
We can visualize the binomial distribution for different combinations of $n$ and $p$, as a function of $y$:
# Make a grid plots the binomial distribution for p = 0.5 and n = [1, 2, 3, 10, 100]
# np.math.comb(n,k) gives the binomial coefficient for "n choose k"
def binom(n, k, p):
return np.math.comb(n, k) * p**k * (1-p)**(n-k)
n_values = [1, 2, 3, 10, 20, 100]
binom_df = pd.DataFrame({
"n": np.concatenate([[n] * (n+1) for n in n_values]),
"y": np.concatenate([np.arange(n+1) for n in n_values]),
"P(Y=y)": [binom(n, y, 0.5) for n in n_values for y in range(n+1)]
})
fig = px.bar(binom_df, x="y", y="P(Y=y)",
facet_col="n", facet_col_wrap=3,
facet_col_spacing=0.1, facet_row_spacing=0.15)
# Show axis ticks on all subplots
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(matches=None, showticklabels=True)
# Increase font size
fig.update_layout(font=dict(size=18))
# Increase size of figure
fig.update_layout(width=1000, height=600)
fig.show()
From Population to Sample¶
This section replicates findings from the lecture. We won't cover it together.
Remember the population distribution we looked at earlier:
dist_df
x | P(X = x) | |
---|---|---|
0 | 3 | 0.1 |
1 | 4 | 0.2 |
2 | 6 | 0.4 |
3 | 8 | 0.3 |
# A population generated from the distribution
N = 100000
all_samples = np.random.choice(dist_df["x"], N, p=dist_df["P(X = x)"])
sim_pop_df = pd.DataFrame({"X": all_samples})
sim_pop_df
X | |
---|---|
0 | 3 |
1 | 6 |
2 | 6 |
3 | 4 |
4 | 6 |
... | ... |
99995 | 6 |
99996 | 6 |
99997 | 6 |
99998 | 6 |
99999 | 4 |
100000 rows × 1 columns
Suppose we draw a sample of size 100 from this giant population.
We are performing Random Sampling with Replacement: df.sample(n, replace=True)
(link)
n = 100 # Size of our sample
sample_df = (
sim_pop_df.sample(n, replace=True)
# Some reformatting below
.reset_index(drop=True)
.rename(columns={"X": "X"})
)
sample_df
X | |
---|---|
0 | 8 |
1 | 8 |
2 | 6 |
3 | 6 |
4 | 6 |
... | ... |
95 | 4 |
96 | 8 |
97 | 4 |
98 | 8 |
99 | 6 |
100 rows × 1 columns
Our sample distribution (n = 100):
px.histogram(sample_df, x="X", histnorm="probability", title="Sample (n = 100)")
Compare this to our original population (N = 80,000):
px.histogram(sim_df, x="X", histnorm="probability", title="Population of X")
pd.DataFrame(
{"Sample": [sample_df["X"].mean(), sample_df["X"].var(), np.sqrt(sample_df["X"].var())],
"Population": [sim_df["X"].mean(), sim_df["X"].var(), np.sqrt(sim_df["X"].var())]})
Sample | Population | |
---|---|---|
0 | 6.090000 | 5.887187 |
1 | 2.991818 | 2.894022 |
2 | 1.729687 | 1.701183 |
# Three different distributions of X where E(X)=25
# First distribution
dist1_df = pd.DataFrame({"x": [20, 25, 30],
"P(X = x)": [0.2, 0.5, 0.2]})
# Second distribution -- really skewed
dist2_df = pd.DataFrame({"x": [0, 25, 50],
"P(X = x)": [0.49, 0.02, 0.49]})
# Third distribution -- skewed left
dist3_df = pd.DataFrame({"x": [0, 25, 150],
"P(X = x)": [0.5, 0.4, 0.1]})
fig, axs = plt.subplots(1, 3, figsize=(10, 2))
for i, dist_df in enumerate([dist1_df, dist2_df, dist3_df]):
sns.barplot(x="x", y="P(X = x)", data=dist_df, ax=axs[i], color="skyblue")
plt.tight_layout()
plt.show()