Lecture 18 – Data 100, Fall 2023¶

Data 100, Fall 2023

Acknowledgments Page

In [1]:
import seaborn as sns
import pandas as pd
sns.set(font_scale=1.5)
import matplotlib.pyplot as plt
import numpy as np


def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size != None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size

    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
adjust_fontsize(size=20)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Helper functions to plot and 
# Compute expectation, variance, standard deviation
def plot_dist(dist_df,
                      xname="x", pname="P(X = x)", varname="X",
                      save=False):
    """
    Plot a distribution from a distribution table.
    Single-variate.
    """
    plt.bar(dist_df[xname], dist_df[pname])
    plt.ylabel(pname)
    plt.xlabel(xname)
    plt.title(f"Distribution of ${varname}$")
    plt.xticks(sorted(dist_df[xname].unique()))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"dist{varname}.png", bbox_inches = 'tight');


def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
    return np.random.choice(
                df[xname], # Draw from these choiecs
                size=size, # This many times
                p=df[pname]) # According to this distribution

def simulate_iid_df(dist_df, nvars, rows, varname="X"):
    """
    Make an (row x nvars) dataframe
    by calling simulate_samples for each of the nvars per row
    """
    sample_dict = {}
    for i in range(nvars):
        # Generate many datapoints 
        sample_dict[f"{varname}_{i+1}"] = \
            simulate_samples(dist_df, size=rows)
    return pd.DataFrame(sample_dict)


def plot_simulated_dist(df, colname, show_stats=True, save=False, **kwargs):
    """
    Plot a simulated population.
    """
    sns.histplot(data=df, x=colname, stat='probability', discrete=True, **kwargs)
    plt.xticks(sorted(df[colname].unique())) # if there are gaps)
    if show_stats:
        display(stats_df_multi(df, [colname]))
    if save:
        fig = plt.gcf()
        fig.patch.set_alpha(0.0)
        plt.savefig(f"sim{colname}.png", bbox_inches = 'tight');

def stats_df_multi(df, colnames):
    means = df[colnames].mean(axis=0)
    variances = df[colnames].var(axis=0)
    stdevs = df[colnames].std(axis=0)
    df_stats = pd.concat([means, variances, stdevs],axis=1).T
    df_stats['index_col'] = ["E[•]", "Var(•)", "SD(•)"]
    df_stats = df_stats.set_index('index_col', drop=True).rename_axis(None)
    return df_stats

def plot_simulated_dist_multi(df, colnames, show_stats=True):
    """
    If multiple columns provided, use separate plots.
    """
    ncols = 1
    nrows = len(colnames)
    plt.figure(figsize=(6, 2*nrows+2))
    
    for i, colname in enumerate(colnames):
        subplot_int = int(100*int(nrows) + 10*int(ncols) + int(i+1))
        plt.subplot(subplot_int)
        plot_simulated_dist(df, colname, show_stats=False)
    plt.tight_layout()
    if show_stats:
        display(stats_df_multi(df, colnames))

A Random Variable $X$¶

Our probability distribution of $X$, shown as a table:

In [3]:
# Our random variable X
dist_df = pd.DataFrame({"x": [3, 4, 6, 8],
                        "P(X = x)": [0.1, 0.2, 0.4, 0.3]})
dist_df
Out[3]:
x P(X = x)
0 3 0.1
1 4 0.2
2 6 0.4
3 8 0.3
In [4]:
plot_dist(dist_df, save=False)
No description has been provided for this image

Let's use this probability distribution to generate a table of $X(s)$, i.e., random variable values for many many samples.

In [5]:
# copied from above
# def simulate_samples(df, xname="x", pname="P(X = x)", size=1):
#     return np.random.choice(
#                 df[xname], # draw from these choiecs
#                 size=size, # this many times
#                 p=df[pname]) # according to this distribution

N = 80000
all_samples = simulate_samples(dist_df, size=N)
sim_df = pd.DataFrame({"X(s)": all_samples})
sim_df
Out[5]:
X(s)
0 3
1 8
2 4
3 6
4 8
... ...
79995 6
79996 4
79997 8
79998 8
79999 3

80000 rows × 1 columns



Let's check how well this simulated sample matches our probability distribution!

In [6]:
plot_simulated_dist(sim_df, "X(s)")
plt.title("Simulated distribution of $X$")
plt.show()
X(s)
E[•] 5.893162
Var(•) 2.885584
SD(•) 1.698701
No description has been provided for this image
In [7]:
# The tabular view of the above plot
sim_df.value_counts("X(s)").sort_values()/N
Out[7]:
X(s)
3    0.100362
4    0.200650
8    0.297775
6    0.401213
dtype: float64




Die Is the Singular; Dice Is the Plural¶

Let X be the outcome of a single die roll. X is a random variable.

In [8]:
# Our random variable X
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                        "P(X = x)": np.ones(6)/6})
roll_df
Out[8]:
x P(X = x)
0 1 0.166667
1 2 0.166667
2 3 0.166667
3 4 0.166667
4 5 0.166667
5 6 0.166667
In [9]:
plot_dist(roll_df)
No description has been provided for this image




Sum of 2 Dice Rolls¶

Here's the distribution of a single die roll:

In [10]:
roll_df = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6],
                        "P(X = x)": np.ones(6)/6})
roll_df
Out[10]:
x P(X = x)
0 1 0.166667
1 2 0.166667
2 3 0.166667
3 4 0.166667
4 5 0.166667
5 6 0.166667

Let $X_1, X_2$ are the outcomes of two dice rolls. Note $X_1$ and $X_2$ are i.i.d. (independent and identically distributed).

Below I call a helper function simulate_iid_df, which simulates an 80,000-row table of $X_1, X_2$ values. It uses np.random.choice(arr, size, p) link where arr is the array the values and p is the probability associated with choosing each value. If you're interested in the implementation details, scroll up.

In [11]:
N = 80000
sim_rolls_df = simulate_iid_df(roll_df, nvars=2, rows=N)
sim_rolls_df
Out[11]:
X_1 X_2
0 4 6
1 1 3
2 2 5
3 6 1
4 6 3
... ... ...
79995 4 5
79996 3 4
79997 2 4
79998 2 2
79999 4 6

80000 rows × 2 columns

Define the following random variables, which are functions of $X_1$ and $X_2$:

  • $Y = X_1 + X_1 = 2 X_1$
  • $Z = X_1 + X_2$

We can use our simulated values of $X_1, X_2$ to create new columns $Y$ and $Z$:

In [12]:
sim_rolls_df['Y'] = 2 * sim_rolls_df['X_1']
sim_rolls_df['Z'] = sim_rolls_df['X_1'] + sim_rolls_df['X_2']
sim_rolls_df
Out[12]:
X_1 X_2 Y Z
0 4 6 8 10
1 1 3 2 4
2 2 5 4 7
3 6 1 12 7
4 6 3 12 9
... ... ... ... ...
79995 4 5 8 9
79996 3 4 6 7
79997 2 4 4 6
79998 2 2 4 4
79999 4 6 8 10

80000 rows × 4 columns

Now that we have simulated samples of $Y$ and $Z$, we can plot histograms to see their distributions!


Distribution of $Y$, which was twice the value of our first die roll:

In [13]:
plot_simulated_dist(sim_rolls_df, "Y", save=False)
Y
E[•] 6.998975
Var(•) 11.700545
SD(•) 3.420606
No description has been provided for this image

Distribution of $Z$, the sum of two IID dice rolls:

In [14]:
plot_simulated_dist(sim_rolls_df, "Z", save=False, color='gold')
Z
E[•] 6.991775
Var(•) 5.854556
SD(•) 2.419619
No description has been provided for this image

Let's compare the expectations and variances of these simulated distributions of $Y$ and $Z$.

  • We computed:
    • $\mathbb{E}[Y]$ as np.mean(sim_rolls_df['Y'])
    • $\text{Var}(Y)$ as np.var(sim_rolls_df['Y']
    • etc.
  • The larger your simulated rows $N$, the closer the simulated expectation will be to the true expectation.
  • Our approach is tedious--we have to simulate an entire population, then reduce it down to expectation/variance/standard deviation. There has to be a better way!
In [15]:
stats_df_multi(sim_rolls_df, ["Y", "Z"])
Out[15]:
Y Z
E[•] 6.998975 6.991775
Var(•) 11.700545 5.854556
SD(•) 3.420606 2.419619




Which would you pick?¶

  • $\large Y_A = 10 X_1 + 10 X_2 $
  • $\large Y_B = \sum\limits_{i=1}^{20} X_i$
  • $\large Y_C = 20 X_1$

First let's construct the probability distribution for a single coin. This will let us flip 20 IID coins later.

In [16]:
# First construct probability distribution for a single fair coin
p = 0.5
coin_df = pd.DataFrame({"x": [0, 1], # [Tails, Heads]
                        "P(X = x)": [p, 1 - p]})
coin_df
Out[16]:
x P(X = x)
0 0 0.5
1 1 0.5

Choice A:¶

$\large Y_A = 10 X_1 + 10 X_2 $

In [17]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_A from this sample
flips20_df["Y_A"] = 10*flips20_df["X_1"] + 10*flips20_df["X_2"]

print("Sample of size 1:")
display(flips20_df)
print("Y_A:", flips20_df.loc[0,"Y_A"])
Sample of size 1:
X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 ... X_12 X_13 X_14 X_15 X_16 X_17 X_18 X_19 X_20 Y_A
0 1 1 1 1 0 0 0 1 0 1 ... 1 0 0 0 0 1 1 0 1 20

1 rows × 21 columns

Y_A: 20

Choice B:¶

$\large Y_B = \sum\limits_{i=1}^{20} X_i$

In [18]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_B from this sample
flips20_df["Y_B"] = flips20_df.sum(axis=1) # sum all coins

display(flips20_df)
print("Y_B:", flips20_df.loc[0,"Y_B"])
X_1 X_2 X_3 X_4 X_5 X_6 X_7 X_8 X_9 X_10 ... X_12 X_13 X_14 X_15 X_16 X_17 X_18 X_19 X_20 Y_B
0 1 1 1 0 0 1 1 0 1 1 ... 0 0 1 1 1 0 0 0 0 10

1 rows × 21 columns

Y_B: 10

Choice C:¶

$\large Y_C = 20 X_1$

In [19]:
# Flip 20 iid coins, each exactly once
flips20_df = simulate_iid_df(coin_df, nvars=20, rows=1)

# Construct Y_C from this sample
flips20_df["Y_C"] = 20*flips20_df["X_1"]

display(flips20_df[["X_1", "Y_C"]])
print("Y_C:", flips20_df.loc[0,"Y_C"])
X_1 Y_C
0 0 0
Y_C: 0


If you're curious as to what these distributions look like, I've simulated some populations:

In [20]:
# Simulate one big population of 20 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=20, rows=N)
print(f"{N} simulated samples")


# construct Y_A, Y_B, Y_C from this population
df_coins["Y_A"] = 10*df_coins["X_1"] + 10*df_coins["X_2"]
df_coins["Y_B"] = df_coins.loc[:,"X_1":"X_20"].sum(axis=1)
df_coins["Y_C"] = 5 + 15*df_coins["X_1"]
plot_simulated_dist_multi(df_coins, ["Y_A", "Y_B", "Y_C"])

# adjust axes for nicer plotting
axs = plt.gcf().axes
axs[1].set_xlim(axs[0].get_xlim())    # Y_B
axs[1].set_xticks([0, 5, 10, 15, 20]) # Y_B
axs[2].set_xlim(axs[0].get_xlim())    # Y_C
plt.show()
80000 simulated samples
Y_A Y_B Y_C
E[•] 10.010500 9.993037 12.508250
Var(•) 49.928014 4.943926 56.250635
SD(•) 7.065976 2.223494 7.500042
No description has been provided for this image



If we flipped 100 coins, look how beautiful the Binomial distribution looks:

In [21]:
# Simulate one big population of 100 fair flips, N = 80,000
N = 80000
df_coins = simulate_iid_df(coin_df, nvars=50, rows=N)
df_coins["Y_100"] = df_coins.loc[:,"X_1":"X_50"].sum(axis=1)
plot_simulated_dist(df_coins, "Y_100")
plt.xticks([0, 10, 20, 30, 40, 50])
plt.show()
Y_100
E[•] 24.982575
Var(•) 12.461177
SD(•) 3.530039
No description has been provided for this image





From Population to Sample¶

Remember the population distribution we looked at earlier:

In [22]:
dist_df
Out[22]:
x P(X = x)
0 3 0.1
1 4 0.2
2 6 0.4
3 8 0.3
In [23]:
# A population generated from the distribution
N = 100000
all_samples = simulate_samples(dist_df, size=N)
sim_pop_df = pd.DataFrame({"X(s)": all_samples})
sim_pop_df
Out[23]:
X(s)
0 8
1 4
2 6
3 4
4 6
... ...
99995 8
99996 8
99997 4
99998 8
99999 6

100000 rows × 1 columns




Suppose we draw a sample of size 100 from this giant population.

We are performing Random Sampling with Replacement: df.sample(n, replace=True) (link)

In [24]:
n = 100      # Size of our sample
sample_df = (
             sim_pop_df.sample(n, replace=True)
             
             # Some reformatting below
             .reset_index(drop=True)
             .rename(columns={"X(s)": "X"})
            )
sample_df
Out[24]:
X
0 8
1 3
2 8
3 4
4 6
... ...
95 6
96 6
97 8
98 3
99 6

100 rows × 1 columns

Our sample distribution (n = 100):

In [25]:
sns.histplot(data=sample_df, x='X', stat='probability')
plt.xticks([3, 4, 6, 8])
plt.title(f"Sample (n = 100)")
plt.show()

print("Mean of Sample:", np.mean(sample_df['X']))
No description has been provided for this image
Mean of Sample: 6.02


Compare this to our original population (N = 80,000):

In [26]:
plot_simulated_dist(sim_df, "X(s)")
plt.title("Population of $X$")
plt.show()
X(s)
E[•] 5.893162
Var(•) 2.885584
SD(•) 1.698701
No description has been provided for this image