import pandas as pd
import numpy as np
import plotly.express as px


# Probability distribution of our random variable X:
# each row pairs one outcome x with its probability P(X = x).
dist_df = pd.DataFrame(
    data=[(3, 0.1), (4, 0.2), (6, 0.4), (8, 0.3)],
    columns=["x", "P(X = x)"],
)
dist_df


# Bar chart of the distribution table: one bar per outcome x,
# with bar height equal to P(X = x).
fig = px.bar(
    data_frame=dist_df,
    x="x",
    y="P(X = x)",
    title="Distribution of X",
)
# Optional static export, left disabled:
# fig.write_image("distX.png", "png", scale=2)
fig


# Simulate N draws of X using NumPy's global random state.
N = 80_000
samples = np.random.choice(
    a=dist_df["x"],         # Draw from these choices
    size=N,                 # This many times
    p=dist_df["P(X = x)"],  # According to this distribution
)

# One row per simulated outcome X(s).
sim_df = pd.DataFrame(data={"X(s)": samples})
sim_df


# Histogram of the simulated draws. histnorm="probability" puts the bars on
# the same scale as the P(X = x) column.
fig = px.histogram(
    data_frame=sim_df,
    x="X(s)",
    histnorm="probability",
    title="Empirical distribution of X",
)
# Optional static export, left disabled:
# fig.write_image("empirical_dist.png", "png", scale=2)
fig


# Empirical mean and variance of the simulated draws.
# Series.var() uses pandas' default ddof=1.
simulated_x = sim_df["X(s)"]
print("Simulated E[X]:", simulated_x.mean())
print("Simulated Var[X]:", simulated_x.var())

Simulated E[X]: 5.903075
Simulated Var[X]: 2.8923416986462334


# Exact expectation: E[X] = sum over x of x * P(X = x),
# computed as the dot product of the two columns.
E_x = dist_df["x"].dot(dist_df["P(X = x)"])
print("E[X]:", E_x)

E[X]: 5.9


# Exact variance via Var[X] = E[X^2] - (E[X])^2.
# E[X^2] is the dot product of the squared outcomes with their probabilities.
Var_x = dist_df["x"].pow(2).dot(dist_df["P(X = x)"]) - E_x ** 2
print("Var[X]:", Var_x)

Var[X]: 2.8900000000000006


# Distribution of a single die roll: outcomes 1 through 6,
# each assigned probability 1/6.
roll_df = pd.DataFrame(
    {
        "x": list(range(1, 7)),
        "P(X = x)": np.full(6, 1 / 6),
    }
)
roll_df


# Bar chart of the die-roll distribution: one bar per face x,
# with bar height equal to P(X = x).
fig = px.bar(
    data_frame=roll_df,
    x="x",
    y="P(X = x)",
    title="Distribution of X",
)
# Optional static export, left disabled:
# fig.write_image("die.png", "png", scale=2)
fig


N = 80_000

# Two columns of N simulated die rolls each. The columns are drawn one after
# the other (X_1 first, then X_2) from NumPy's global random state.
sim_rolls_df = pd.DataFrame({
    roll_name: np.random.choice(a=roll_df["x"], size=N, p=roll_df["P(X = x)"])
    for roll_name in ("X_1", "X_2")
})

sim_rolls_df


# Y doubles the first roll; Z adds the first and second rolls together.
sim_rolls_df["Y"] = sim_rolls_df["X_1"].mul(2)
sim_rolls_df["Z"] = sim_rolls_df["X_1"].add(sim_rolls_df["X_2"])
sim_rolls_df


# Overlay the empirical distributions of Y and Z. melt() stacks the two
# columns into long form ("variable" = column name, "value" = outcome) so
# that plotly can colour the bars by variable.
px.histogram(
    data_frame=sim_rolls_df.melt(value_vars=["Y", "Z"]),
    x="value",
    color="variable",
    barmode="overlay",
    histnorm="probability",
    title="Empirical Distributions",
)


# Summary table for Y and Z: one row each for the mean, the variance and the
# standard deviation (square root of the variance).
yz_columns = sim_rolls_df[["Y", "Z"]]
yz_variance = yz_columns.var()
pd.DataFrame(
    [
        yz_columns.mean().rename("Mean"),
        yz_variance.rename("Var"),
        np.sqrt(yz_variance).rename("SD"),
    ]
)


# Probability distribution of a single fair coin flip.
# Outcomes are coded numerically: 1 = heads, 0 = tails.
p = 0.5
coin_df = pd.DataFrame(
    data=[(1, p), (0, 1 - p)],  # (outcome, probability): heads, then tails
    columns=["x", "P(X = x)"],
)
coin_df


N = 10_000

# Preview of the simulation trick used below: an (N, 2) grid of uniform draws
# compared against p gives a boolean array that is True where the draw falls
# below p. The result is only displayed, not stored, but the call still
# consumes numbers from NumPy's global random state.
p > np.random.rand(N, 2)

array([[ True, False],
       [False,  True],
       [False, False],
       ...,
       [False,  True],
       [ True,  True],
       [ True, False]])


# Choice A: 2 flips per trial; every flip that lands below p contributes 10,
# so each row is 10 * (number of successes among the 2 flips).
sim_flips = pd.DataFrame(
    {"Choice A": 10 * (np.random.rand(N, 2) < p).sum(axis=1)}
)
sim_flips


# Choice B: 20 flips per trial; each row counts how many of them land below p.
sim_flips["Choice B"] = (np.random.rand(N, 20) < p).sum(axis=1)
sim_flips


# Choice C: a single flip per trial, worth 20 when it lands below p and 0
# otherwise. The (N, 1) column array is assigned directly as the new column.
sim_flips["Choice C"] = (np.random.rand(N, 1) < p) * 20
sim_flips


# One histogram panel per choice. melt() stacks the three columns into long
# form, and facet_row="variable" gives each choice its own row of the figure.
px.histogram(
    data_frame=sim_flips.melt(),
    x="value",
    facet_row="variable",
    barmode="overlay",
    histnorm="probability",
    title="Empirical Distributions",
    width=600,
    height=600,
)


# Summary table for the three choices: one row each for the mean, the
# variance and the standard deviation (square root of the variance).
choice_variance = sim_flips.var()
pd.DataFrame(
    [
        sim_flips.mean().rename("Mean"),
        choice_variance.rename("Var"),
        np.sqrt(choice_variance).rename("SD"),
    ]
)


# Redisplay the distribution of X that the population is generated from.
dist_df


# A population generated from the distribution: N draws of X using NumPy's
# global random state, stored one outcome per row.
N = 100_000
all_samples = np.random.choice(
    a=dist_df["x"],
    size=N,
    p=dist_df["P(X = x)"],
)
sim_pop_df = pd.DataFrame(data={"X(s)": all_samples})
sim_pop_df


n = 100      # Size of our sample

# Draw n rows from the population with replacement, then tidy the result:
# rename the column to "X" and replace the population row labels with a
# fresh 0..n-1 index.
sample_df = (
    sim_pop_df
    .sample(n=n, replace=True)
    .rename(columns={"X(s)": "X"})
    .reset_index(drop=True)
)
sample_df


# Empirical distribution of the drawn sample, normalised to probabilities.
px.histogram(
    data_frame=sample_df,
    x="X",
    histnorm="probability",
    title="Sample (n = 100)",
)


# Empirical distribution of the population. The population is sim_pop_df,
# the frame that sample_df was drawn from -- not sim_df, which is the
# separate 80,000-draw simulation from the start of the notebook.
px.histogram(sim_pop_df, x="X(s)", histnorm="probability", title="Population of X")


# Side-by-side mean, variance and standard deviation of the sample and of the
# population it was drawn from. Rows are labelled to match the other summary
# tables in this notebook.
sample_variance = sample_df["X"].var()
population_variance = sim_pop_df["X(s)"].var()
pd.DataFrame(
    {"Sample": [sample_df["X"].mean(), sample_variance, np.sqrt(sample_variance)],
     "Population": [sim_pop_df["X(s)"].mean(), population_variance,
                    np.sqrt(population_variance)]},
    index=["Mean", "Var", "SD"])

	X(s)
0	6
1	8
2	8
3	4
4	4
...	...
79995	6
79996	6
79997	4
79998	3
79999	6

	x	P(X = x)
0	1	0.166667
1	2	0.166667
2	3	0.166667
3	4	0.166667
4	5	0.166667
5	6	0.166667

	Y	Z
Mean	6.995100	7.000600
Var	11.661522	5.845173
SD	3.414897	2.417679

	Choice A	Choice B	Choice C
Mean	10.076000	9.995000	10.064000
Var	50.319256	4.933868	100.005905
SD	7.093607	2.221231	10.000295

	X(s)
0	6
1	8
2	8
3	8
4	6
...	...
99995	6
99996	6
99997	4
99998	4
99999	8

Lecture 17 – Data 100, Spring 2024

A Random Variable $X$

Sum of 2 Dice Rolls

Which would you pick?

Choice A:

Choice B:

Choice C:

From Population to Sample

	Choice A	Choice B
0	20	11
1	10	9
2	10	6
3	0	13
4	0	11
...	...	...
9995	10	10
9996	0	9
9997	20	11
9998	0	9
9999	0	9

	x	P(X = x)
0	3	0.1
1	4	0.2
2	6	0.4
3	8	0.3

	x	P(X = x)
0	3	0.1
1	4	0.2
2	6	0.4
3	8	0.3

	Sample	Population
0	5.840000	5.903075
1	2.903434	2.892342
2	1.703947	1.700689