import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide plotting defaults: dark grid style, fonts scaled by 1.5,
# and a 7x5 default figure size.
sns.set_theme(
    style="darkgrid",
    font_scale=1.5,
    rc={"figure.figsize": (7, 5)},
)

# NumPy random Generator, created without an explicit seed.
rng = np.random.default_rng()


# Load the vote table from the CSV file in the working directory.
movie = pd.read_csv("movie.csv")

# Encode each row as 1 when its "movie" value is "Barbie" and 0 otherwise,
# so that the mean of this column is the share of Barbie votes.
movie["barbie"] = movie["movie"].eq("Barbie").astype(int)
movie


# Share of Barbie votes over the whole table (mean of the 1/0 indicator).
actual_barbie = movie["barbie"].mean()
actual_barbie

0.5302792307692308


# Keep only the rows whose age is 65 or above, then compute the Barbie
# share within that subset.
convenience_sample = movie.loc[movie["age"].ge(65)]
convenience_sample["barbie"].mean()

0.3744755089093924


# Number of rows in the convenience sample.
convenience_sample.shape[0]

359396


# Fraction of all rows that fall into the convenience sample.
convenience_sample.shape[0] / movie.shape[0]

0.27645846153846154


# Mean of the Barbie indicator for every (age, is_male) group.
# numeric_only=True keeps the text "movie" column out of the average;
# pandas 2.x raises a TypeError when asked for the mean of a string column
# instead of silently dropping it as older versions did.
votes_by_barbie = (
    movie.groupby(["age", "is_male"])
    .mean(numeric_only=True)
    .reset_index()
)
votes_by_barbie


import matplotlib.ticker as ticker

# Two-colour palette applied only while the point plot is being drawn.
red_blue = ["#bf1518", "#397eb7"]
fig = plt.figure()
with sns.color_palette(red_blue):
    ax = sns.pointplot(
        data=votes_by_barbie,
        x="age",
        y="barbie",
        hue="is_male",
    )
ax.set_title("Preferences by Demographics")

# Render the figure before reading the tick labels back
# (presumably so their text is populated -- confirm), then keep only every
# tenth label so the age axis stays readable.
fig.canvas.draw()
new_ticks = [label.get_text() for label in ax.get_xticklabels()]
plt.xticks(range(0, len(new_ticks), 10), new_ticks[::10]);


# Draw a random sample with as many rows as the convenience sample.
# Sampling is without replacement (replace=False is also the default).
n = len(convenience_sample)
random_sample = movie.sample(n=n, replace=False)

random_sample["barbie"].mean()

0.529502276040913


# Show the full-table Barbie share again for comparison with the sample estimate.
actual_barbie

0.5302792307692308


# Repeat the random draw with a sample of only 800 rows.
n = 800
random_sample = movie.sample(n=n, replace=False)
random_sample["barbie"].mean()

0.5225


nrep = 1000   # number of simulated polls
n = 800       # rows drawn for each simulated poll

# One Barbie share per simulated poll; every poll is a fresh sample drawn
# without replacement from the full table.
poll_result = [
    movie.sample(n=n, replace=False)["barbie"].mean()
    for _ in range(nrep)
]


# Density-scaled histogram of the simulated poll results.
sns.histplot(poll_result, stat='density');


poll_result = pd.Series(poll_result)
# Fraction of simulated polls whose Barbie share exceeds 0.5.  Taking the
# mean of the boolean mask divides by the actual number of simulations, so
# the result stays correct if nrep is changed from 1000.
(poll_result > 0.5).mean()

0.943


# Same density histogram, with a kernel density estimate curve enabled (kde=True).
sns.histplot(poll_result, stat='density', kde=True);


# One multinomial draw: 100 trials split over three categories with
# probabilities 0.60 / 0.30 / 0.10.  Uses the Generator created at the top
# of the file (rng) rather than the legacy global np.random state.
rng.multinomial(100, [0.60, 0.30, 0.10])

array([63, 27, 10])


# Twenty independent multinomial draws (one per output row), each of 100
# trials with probabilities 0.60 / 0.30 / 0.10.  Uses the Generator created
# at the top of the file (rng) rather than the legacy global np.random state.
rng.multinomial(100, [0.60, 0.30, 0.10], size=20)

array([[59, 31, 10],
       [66, 28,  6],
       [57, 36,  7],
       [64, 31,  5],
       [69, 23,  8],
       [57, 37,  6],
       [58, 32, 10],
       [62, 29,  9],
       [59, 32,  9],
       [50, 33, 17],
       [73, 18,  9],
       [54, 35, 11],
       [63, 27, 10],
       [66, 25,  9],
       [59, 31, 10],
       [57, 36,  7],
       [60, 28, 12],
       [57, 34,  9],
       [63, 28,  9],
       [51, 36, 13]])

	age	is_male	barbie
0	18	False	0.819594
1	18	True	0.667001
2	19	False	0.812214
3	19	True	0.661252
4	20	False	0.805281
...	...	...	...
125	80	True	0.259731
126	81	False	0.394946
127	81	True	0.256759
128	82	False	0.398970
129	82	True	0.248060

Sampling

Barbie v. Oppenheimer

Convenience sample: retirees

Check for bias

Simple Random Sample

Quantifying chance error

Simulating from a Multinomial Distribution

Marbles