In [1]:

```
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style='darkgrid', font_scale=1.5,
              rc={'figure.figsize': (7, 5)})
rng = np.random.default_rng()
```

We are trying to collect a sample of Berkeley residents to predict which of Barbie and Oppenheimer would perform better on their shared opening day, July 21st.

First, let's grab a dataset that has every single resident in Berkeley (this is a fake dataset) and which movie they **actually** watched on July 21st.

For the purposes of this demo, assume:

- `is_male` indicates if a resident identifies as male.
- There are only two movies they can watch on July 21st: Barbie and Oppenheimer.
- Every resident watches a movie (either Barbie or Oppenheimer) on July 21st.
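If you don't have the (fake) `movie.csv` file, a synthetic stand-in with the same columns can be generated instead; skip the `pd.read_csv` call in the next cell and keep the rest. The age and gender effects below are invented purely for illustration:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 1_300_000
age = rng.integers(18, 83, size=n)          # ages 18..82, like the real table
is_male = rng.random(n) < 0.5

# Invented bias: Barbie preference declines with age and is lower for males
p_barbie = np.clip(0.95 - 0.008 * (age - 18) - 0.15 * is_male, 0.05, 0.95)
watched_barbie = rng.random(n) < p_barbie

movie = pd.DataFrame({
    "age": age,
    "is_male": is_male,
    "movie": np.where(watched_barbie, "Barbie", "Oppenheimer"),
})
```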

In [2]:

```
movie = pd.read_csv("movie.csv")
# create a 1/0 int that indicates Barbie vote
movie['barbie'] = (movie['movie'] == 'Barbie').astype(int)
movie
```

Out[2]:

| | age | is_male | movie | barbie |
|---|---|---|---|---|
| 0 | 35 | False | Barbie | 1 |
| 1 | 42 | True | Oppenheimer | 0 |
| 2 | 55 | False | Barbie | 1 |
| 3 | 77 | True | Oppenheimer | 0 |
| 4 | 31 | False | Barbie | 1 |
| ... | ... | ... | ... | ... |
| 1299995 | 62 | True | Barbie | 1 |
| 1299996 | 78 | True | Oppenheimer | 0 |
| 1299997 | 68 | False | Oppenheimer | 0 |
| 1299998 | 82 | True | Oppenheimer | 0 |
| 1299999 | 23 | False | Barbie | 1 |

1300000 rows × 4 columns

What fraction of Berkeley residents chose Barbie?

In [3]:

```
actual_barbie = np.mean(movie["barbie"])
actual_barbie
```

Out[3]:

0.5302792307692308

This is the **actual outcome** of the competition. Based on this result, Barbie would win. How would a convenience sample of retirees (residents aged 65 and over) have done?

In [4]:

```
convenience_sample = movie[movie['age'] >= 65]
np.mean(convenience_sample["barbie"])
```

Out[4]:

0.3744755089093924

Based on this result, we would have predicted that Oppenheimer would win! What happened?

- Is the sample too small / noisy?

In [5]:

```
len(convenience_sample)
```

Out[5]:

359396

In [6]:

```
len(convenience_sample)/len(movie)
```

Out[6]:

0.27645846153846154

The sample seems quite large, so the error cannot be explained by chance alone; there is some bias afoot.

Let us aggregate all choices by age and visualize the fraction who watched Barbie, split by gender.

In [7]:

```
votes_by_barbie = movie.groupby(["age","is_male"]).agg("mean", numeric_only=True).reset_index()
votes_by_barbie
```

Out[7]:

| | age | is_male | barbie |
|---|---|---|---|
| 0 | 18 | False | 0.819594 |
| 1 | 18 | True | 0.667001 |
| 2 | 19 | False | 0.812214 |
| 3 | 19 | True | 0.661252 |
| 4 | 20 | False | 0.805281 |
| ... | ... | ... | ... |
| 125 | 80 | True | 0.259731 |
| 126 | 81 | False | 0.394946 |
| 127 | 81 | True | 0.256759 |
| 128 | 82 | False | 0.398970 |
| 129 | 82 | True | 0.248060 |

130 rows × 3 columns

In [8]:

```
# A common matplotlib/seaborn pattern: create the figure and axes objects,
# pass ax to seaborn to draw into, and later fine-tune the figure via ax.
fig, ax = plt.subplots()
red_blue = ["#bf1518", "#397eb7"]
with sns.color_palette(red_blue):
    sns.pointplot(data=votes_by_barbie, x="age", y="barbie", hue="is_male", ax=ax)

# pointplot treats age as categorical, so thin the tick labels to every 10th
new_ticks = [i.get_text() for i in ax.get_xticklabels()]
ax.set_xticks(range(0, len(new_ticks), 10), new_ticks[::10])
ax.set_title("Preferences by Demographics");
```

- We see that retirees (in Berkeley) tend to watch Oppenheimer.
- We also see that residents who identify as non-male tend to prefer Barbie.
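To quantify the second observation numerically, we can group by gender alone. The snippet below is a self-contained sketch using a tiny invented stand-in table (run the same `groupby` on the full `movie` DataFrame for the real numbers):

```python
import pandas as pd

# Toy stand-in for the full `movie` table; values invented for illustration
demo = pd.DataFrame({
    "age":     [25,    70,   30,    68,   72,    22],
    "is_male": [False, True, False, True, False, True],
    "barbie":  [1,     0,    1,     0,    1,     1],
})

# Fraction who watched Barbie, split by gender
print(demo.groupby("is_male")["barbie"].mean())
```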

What if we instead used a simple random sample (SRS) to collect our data?

Suppose we took an SRS of the same size as our retiree sample:

In [9]:

```
# By default, replace=False (sample without replacement)
n = len(convenience_sample)
random_sample = movie.sample(n, replace=False)
np.mean(random_sample["barbie"])
```

Out[9]:

0.5302396242584781

This is very close to the actual vote!

In [10]:

```
actual_barbie
```

Out[10]:

0.5302792307692308

It turns out that we can get similar results with a **much smaller sample size**, say, 800:

In [11]:

```
n = 800
random_sample = movie.sample(n, replace = False)
# Compute the sample average and the resulting relative error
sample_barbie = np.mean(random_sample["barbie"])
err = abs(sample_barbie-actual_barbie)/actual_barbie
# We can print output with Markdown formatting too...
from IPython.display import Markdown
Markdown(f"**Actual** = {actual_barbie:.4f}, **Sample** = {sample_barbie:.4f}, "
         f"**Err** = {100*err:.2f}%.")
```

Out[11]:

**Actual** = 0.5303, **Sample** = 0.5112, **Err** = 3.59%.

In our SRS of size 800, what would be our chance error?

Let's simulate 1000 versions of taking the 800-sized SRS from before:

In [12]:

```
nrep = 1000 # number of simulations
n = 800 # size of our sample
poll_result = []
for i in range(nrep):
    random_sample = movie.sample(n, replace=False)
    poll_result.append(np.mean(random_sample["barbie"]))
```

In [13]:

```
fig, ax = plt.subplots()
sns.histplot(poll_result, stat='density', ax=ax)
ax.axvline(actual_barbie, color="orange", lw=4);
```

What fraction of these simulated samples would have predicted Barbie?

In [14]:

```
poll_result = pd.Series(poll_result)
np.sum(poll_result > 0.5) / nrep
```

Out[14]:

0.956
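The spread of the simulated estimates is one way to quantify chance error; for an SRS of size $n$ from a population with fraction $p$, it should be close to the theoretical standard deviation $\sqrt{p(1-p)/n}$. A self-contained sketch, simulating directly from the known fraction $p \approx 0.53$ rather than resampling the DataFrame:

```python
import numpy as np

rng = np.random.default_rng(0)
p, n, nrep = 0.53, 800, 1000

# Each simulated poll result: the fraction of n Bernoulli(p) draws that are 1
sims = rng.binomial(n, p, size=nrep) / n

# Empirical spread of the estimates vs. the theoretical SD sqrt(p(1-p)/n)
print(sims.std(), np.sqrt(p * (1 - p) / n))
```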

The histogram looks roughly Gaussian/normal. We can overlay a kernel density estimate (KDE) to see this more clearly:

In [15]:

```
sns.histplot(poll_result, stat='density', kde=True);
```

Sometimes instead of having individual reports in the population, we have **aggregate** statistics. For example, we could have only learned that 53% of election voters voted Democrat. Even so, we can still simulate probability samples if we assume the population is large.

Specifically, we can use **multinomial** probabilities to simulate random samples **with replacement**.
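For the two-category election example, a single simulated sample can be drawn from the 53% aggregate alone; the sample size of 1,000 voters below is an assumption for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)

# Counts of [Democrat, other] in one simulated sample of 1,000 voters
counts = rng.multinomial(1000, [0.53, 0.47])
print(counts, counts[0] / 1000)
```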

Suppose we have a very large bag of marbles with the following statistics:

- 60% blue
- 30% green
- 10% red

We then draw 100 marbles from this bag at random with replacement.

In [16]:

```
np.random.multinomial(100, [0.60, 0.30, 0.10])
```

Out[16]:

array([59, 32, 9])

We can repeat this simulation multiple times, say 20:

In [17]:

```
np.random.multinomial(100, [0.60, 0.30, 0.10], size=20)
```

Out[17]:

array([[66, 25,  9],
       [62, 29,  9],
       [52, 34, 14],
       [69, 19, 12],
       [60, 30, 10],
       [60, 26, 14],
       [63, 27, 10],
       [67, 23, 10],
       [56, 33, 11],
       [55, 38,  7],
       [57, 33, 10],
       [55, 36,  9],
       [54, 38,  8],
       [64, 26, 10],
       [57, 29, 14],
       [61, 28, 11],
       [68, 24,  8],
       [52, 32, 16],
       [51, 37, 12],
       [57, 35,  8]])
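Dividing each row of simulated counts by the number of draws converts them to proportions; averaged over the simulations, these should land near the true probabilities. A quick sketch (using a `Generator` instead of the legacy `np.random.multinomial`):

```python
import numpy as np

rng = np.random.default_rng(0)

# 20 simulated draws of 100 marbles each, then convert counts to proportions
counts = rng.multinomial(100, [0.60, 0.30, 0.10], size=20)
props = counts / 100

# Average proportion per color across the 20 simulations
print(props.mean(axis=0))  # close to [0.60, 0.30, 0.10]
```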