import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

plt.rcParams['figure.dpi'] = 300

# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
                        "P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df

fig = px.bar(dist_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("distX.png", "png",scale=2)
fig

N = 80000
samples = np.random.choice(
    dist_df["x"], # Draw from these choiecs
    size=N, # This many times
    p=dist_df["P(X = x)"]) # According to this distribution

sim_df = pd.DataFrame({"X": samples})
sim_df

fig = px.histogram(sim_df, x="X", title="Empirical distribution of X", 
                   histnorm="probability")
# fig.write_image("empirical_dist.png", "png",scale=2)
fig

print("Simulated E[X]:", sim_df['X'].mean())
print("Simulated Var[X]:", sim_df['X'].var())

Simulated E[X]: 5.8871875
Simulated Var[X]: 2.894022015118938

E_x = dist_df["x"] @ dist_df["P(X = x)"]
print("E[X]:",E_x)

E[X]: 5.9

Var_x = dist_df["x"]**2 @ dist_df["P(X = x)"] - E_x**2
print("Var[X]:", Var_x)

Var[X]: 2.8900000000000006

roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                        "P(X = x)": np.ones(6)/6})
roll_df

fig = px.bar(roll_df, x="x", y="P(X = x)", title="Distribution of X")
# fig.write_image("die.png", "png",scale=2)
fig

N = 80000

# roll_df["x"] are the values 1 through 6
# roll_df["P(X=x)"] is a column of a constant 1/6, since all faces are 
# equally likely
sim_rolls_df = pd.DataFrame({
    "X_1": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"]),
    "X_2": np.random.choice(roll_df["x"], size = N, p = roll_df["P(X = x)"])
})

sim_rolls_df

sim_rolls_df['2 X_1'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['X_1 + X_2'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df

plot_df = sim_rolls_df[["2 X_1", "X_1 + X_2"]].melt()

# Show 5 random rows of plot_df
display(plot_df.sample(5))

px.histogram(
  plot_df,
  x="value", 
  color="variable", 
  barmode="overlay", 
  histnorm="probability",
  title="Empirical Distributions"
)

# The empirical means are nearly identical, but the variance
# of the two rolls is lower than doubling the first roll.
pd.DataFrame([
    sim_rolls_df[["2 X_1", "X_1 + X_2"]].mean().rename("Mean"),
    sim_rolls_df[["2 X_1", "X_1 + X_2"]].var().rename("Var"),
    np.sqrt(sim_rolls_df[["2 X_1", "X_1 + X_2"]].var()).rename("SD")
])

# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [1, 0], # [Heads, Tails]
                        "P(X = x)": [p, 1 - p]})
coin_df

N = 10000

np.random.rand(N,2) < p

array([[False, False],
       [False,  True],
       [ True, False],
       ...,
       [ True,  True],
       [ True, False],
       [ True, False]])

sim_flips = pd.DataFrame(
    {"Choice A": np.sum((np.random.rand(N,2) < p) * 10, axis=1)})
sim_flips

sim_flips["Choice B"] = np.sum((np.random.rand(N,20) < p), axis=1)
sim_flips

sim_flips["Choice C"] = 20  * (np.random.rand(N,1) < p) 
sim_flips

px.histogram(sim_flips.melt(), x="value", facet_row="variable", 
             barmode="overlay", histnorm="probability",
             title="Empirical Distributions",
             width=600, height=600)

pd.DataFrame([
    sim_flips.mean().rename("Mean"),
    sim_flips.var().rename("Var"),
    np.sqrt(sim_flips.var()).rename("SD")
])

# Make a grid plots the binomial distribution for p = 0.5 and n = [1, 2, 3, 10, 100]

# np.math.comb(n,k) gives the binomial coefficient for "n choose k"
def binom(n, k, p): 
  return np.math.comb(n, k) * p**k * (1-p)**(n-k)

n_values = [1, 2, 3, 10, 20, 100]

binom_df = pd.DataFrame({
    "n": np.concatenate([[n] * (n+1) for n in n_values]),
    "y": np.concatenate([np.arange(n+1) for n in n_values]),
    "P(Y=y)": [binom(n, y, 0.5) for n in n_values for y in range(n+1)]
})

fig = px.bar(binom_df, x="y", y="P(Y=y)", 
             facet_col="n", facet_col_wrap=3,
             facet_col_spacing=0.1, facet_row_spacing=0.15)

# Show axis ticks on all subplots
fig.update_yaxes(matches=None, showticklabels=True)
fig.update_xaxes(matches=None, showticklabels=True)

# Increase font size
fig.update_layout(font=dict(size=18))

# Increase size of figure
fig.update_layout(width=1000, height=600)

fig.show()

dist_df

# A population generated from the distribution
N = 100000
all_samples = np.random.choice(dist_df["x"], N, p=dist_df["P(X = x)"])
sim_pop_df = pd.DataFrame({"X": all_samples})
sim_pop_df

n = 100      # Size of our sample
sample_df = (
             sim_pop_df.sample(n, replace=True)
             # Some reformatting below
             .reset_index(drop=True)
             .rename(columns={"X": "X"})
            )
sample_df

px.histogram(sample_df, x="X", histnorm="probability", title="Sample (n = 100)")

px.histogram(sim_df, x="X", histnorm="probability", title="Population of X")

pd.DataFrame(
    {"Sample": [sample_df["X"].mean(), sample_df["X"].var(), np.sqrt(sample_df["X"].var())],
     "Population": [sim_df["X"].mean(), sim_df["X"].var(), np.sqrt(sim_df["X"].var())]})

# Three different distributions of X where E(X)=25

# First distribution
dist1_df = pd.DataFrame({"x": [20, 25, 30],
                         "P(X = x)": [0.2, 0.5, 0.2]})
# Second distribution -- really skewed
dist2_df = pd.DataFrame({"x": [0, 25, 50],
                         "P(X = x)": [0.49, 0.02, 0.49]})
# Third distribution -- skewed left
dist3_df = pd.DataFrame({"x": [0, 25, 150],
                         "P(X = x)": [0.5, 0.4, 0.1]})

fig, axs = plt.subplots(1, 3, figsize=(10, 2))
for i, dist_df in enumerate([dist1_df, dist2_df, dist3_df]):
    sns.barplot(x="x", y="P(X = x)", data=dist_df, ax=axs[i], color="skyblue")

plt.tight_layout()

plt.show()

	X
0	6
1	4
2	4
3	8
4	8
...	...
79995	8
79996	4
79997	4
79998	6
79999	3

	x	P(X = x)
0	1	0.166667
1	2	0.166667
2	3	0.166667
3	4	0.166667
4	5	0.166667
5	6	0.166667

	2 X_1	X_1 + X_2
Mean	6.976950	6.984587
Var	11.705215	5.899749
SD	3.421289	2.428940

	Choice A	Choice B	Choice C
Mean	10.052000	10.052400	10.146000
Var	50.422338	4.984153	99.988683
SD	7.100869	2.232522	9.999434

	X
0	3
1	6
2	6
3	4
4	6
...	...
99995	6
99996	6
99997	6
99998	6
99999	4

🎲 Lecture 17 – Data 100, Spring 2025¶

A Random Variable $X$¶

🎲 🎲 Sum of 2 Dice Rolls¶

Which would you pick?¶

Choice A:¶

Choice B:¶

Choice C:¶

Visualizing the Binomial Distribution¶

From Population to Sample¶

	variable	value
112749	X_1 + X_2	6
93549	X_1 + X_2	10
112339	X_1 + X_2	10
133628	X_1 + X_2	5
61287	2 X_1	4

	Choice A	Choice B
0	10	13
1	20	10
2	20	12
3	20	12
4	10	8
...	...	...
9995	10	13
9996	10	11
9997	0	7
9998	10	5
9999	20	12

	x	P(X = x)
0	3	0.1
1	4	0.2
2	6	0.4
3	8	0.3

	x	P(X = x)
0	3	0.1
1	4	0.2
2	6	0.4
3	8	0.3

	Sample	Population
0	6.090000	5.887187
1	2.991818	2.894022
2	1.729687	1.701183