import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Plot defaults for the notebook: dark grid, enlarged fonts, 7x5 figures.
sns.set_theme(
    style='darkgrid',
    font_scale=1.5,
    rc={'figure.figsize': (7, 5)},
)

# NumPy random Generator, created without a seed.
rng = np.random.default_rng()


# Load the census and add a boolean column that is True on every row
# whose `movie` value is 'Barbie'.
census = pd.read_csv("movie_census.csv").assign(
    Barbie=lambda frame: frame['movie'] == 'Barbie'
)
census


# Population proportion: the mean of the boolean `Barbie` column over
# the whole census.
actual_barbie = census.loc[:, "Barbie"].mean()
actual_barbie

0.5302792307692308


# Convenience sample: 10 rows drawn without replacement from the rows
# whose age is between 18 and 22 (both ends inclusive).
undergrads = census[census['age'].between(18, 22)].sample(n=10, replace=False)
undergrads["Barbie"].mean()

1.0


# Number of rows in the undergrad convenience sample.
undergrads.shape[0]

10


# Size of the undergrad sample as a percentage of the census row count.
print("Percent of Berkeley:", 100 * (len(undergrads) / len(census)))

Percent of Berkeley: 0.0007692307692307692


# Convenience sample: 100 rows drawn from the rows whose age is 65 or more.
elderly = census.loc[census['age'] >= 65].sample(n=100)
elderly["Barbie"].mean()

0.39


# Number of rows in the elderly convenience sample.
elderly.shape[0]

100


# Size of the elderly sample as a percentage of the census row count.
print("Percent of Berkeley:", 100 * (len(elderly) / len(census)))

Percent of Berkeley: 0.007692307692307693


# Mean of every numeric column within each (age, wears_birkenstocks)
# group, with the group keys restored as ordinary columns.
votes_by_barbie = (
    census
    .groupby(["age", "wears_birkenstocks"])
    .agg("mean", numeric_only=True)
    .reset_index()
)
votes_by_barbie


import plotly.express as px

# Plot each group's mean of the Barbie column against age, with one
# color per wears_birkenstocks value.
px.scatter(
    data_frame=votes_by_barbie,
    x="age",
    y="Barbie",
    color="wears_birkenstocks",
    title="Preferences by Demographics",
)


# Simple random sample of n census rows. DataFrame.sample draws without
# replacement unless told otherwise, so `replace` is left at its default.
n = 800
random_sample = census.sample(n=n)

# Proportion of the sample flagged Barbie.
random_sample["Barbie"].mean()

0.55375


# Redisplay the population proportion computed from the full census.
actual_barbie

0.5302792307692308


from IPython.display import Markdown

# Draw a fresh simple random sample of n rows (without replacement,
# the DataFrame.sample default).
n = 800
random_sample = census.sample(n=n)

# Sample proportion and its relative error against the population proportion.
sample_barbie = random_sample["Barbie"].mean()
err = abs(sample_barbie - actual_barbie) / actual_barbie

# Show the comparison as Markdown-formatted output.
Markdown(
    f"**Actual** = {actual_barbie:.4f}, **Sample** = {sample_barbie:.4f}, "
    f"**Err** = {100*err:.2f}%."
)


nrep = 1000   # number of simulations
n = 800       # size of our sample

# Repeat the poll nrep times: each repetition draws n census rows
# without replacement and records that sample's Barbie proportion.
poll_result = [
    census.sample(n, replace=False)["Barbie"].mean()
    for _ in range(nrep)
]


from plotly import graph_objects as go
from scipy import stats

# Density-normalized histogram of the simulated poll results, with a
# dashed orange line at the population proportion.
fig = px.histogram(poll_result, histnorm='probability density', nbins=50)
fig.add_vline(
    x=actual_barbie,
    line_width=3,
    line_dash="dash",
    line_color="orange",
)
fig.update_layout(showlegend=False)

# Overlay a Gaussian kernel density estimate, evaluated on 100 evenly
# spaced points spanning the observed results (bandwidth is left to
# scipy's default selection).
x = np.linspace(min(poll_result), max(poll_result), 100)
fig.add_trace(
    go.Scatter(
        x=x,
        y=stats.gaussian_kde(poll_result)(x),
        mode='lines',
        line={'color': 'red', 'width': 3},
    )
)


# Same distribution drawn with seaborn: density histogram plus KDE curve,
# then a dashed orange line at the population proportion.
sns.histplot(data=poll_result, stat='density', kde=True)
plt.axvline(x=actual_barbie, color='orange', linestyle='dashed', linewidth=2)

<matplotlib.lines.Line2D at 0x7f1008817150>


# Fraction of simulated polls whose Barbie proportion exceeds 0.5.
# Taking the mean of the boolean mask divides by the actual number of
# simulations, so the result stays correct if the simulation count is
# changed (the previous version divided by a hard-coded 1000).
poll_result = pd.Series(poll_result)
(poll_result > 0.5).mean()

0.961


# One multinomial draw: 100 trials over three categories with
# probabilities 0.60, 0.30 and 0.10. Uses the `rng` Generator created
# during notebook setup rather than the legacy global np.random state.
rng.multinomial(100, [0.60, 0.30, 0.10])

array([67, 25,  8])


# Twenty independent multinomial draws (one per output row), each of
# 100 trials with probabilities 0.60, 0.30 and 0.10. Uses the `rng`
# Generator created during notebook setup rather than the legacy global
# np.random state.
rng.multinomial(100, [0.60, 0.30, 0.10], size=20)

array([[64, 30,  6],
       [67, 26,  7],
       [66, 25,  9],
       [58, 28, 14],
       [57, 34,  9],
       [62, 30,  8],
       [71, 18, 11],
       [63, 26, 11],
       [60, 29, 11],
       [51, 36, 13],
       [56, 30, 14],
       [66, 25,  9],
       [58, 30, 12],
       [64, 28,  8],
       [57, 35,  8],
       [52, 39,  9],
       [60, 34,  6],
       [57, 29, 14],
       [63, 27, 10],
       [65, 25, 10]])

Sampling

Barbie v. Oppenheimer

Convenience sample: Undergrads in Prof. Gonzalez OH

Convenience sample: Elderly at a Campus Event

Check for bias

Simple Random Sample

Quantifying chance error

Simulating from a Multinomial Distribution

Marbles

	age	wears_birkenstocks	movie	Barbie
0	35	False	Barbie	True
1	42	True	Oppenheimer	False
2	55	False	Barbie	True
3	77	True	Oppenheimer	False
4	31	False	Barbie	True
...	...	...	...	...
1299995	62	True	Barbie	True
1299996	78	True	Oppenheimer	False
1299997	68	False	Oppenheimer	False
1299998	82	True	Oppenheimer	False
1299999	23	False	Barbie	True

	age	wears_birkenstocks	Barbie
0	18	False	0.819594
1	18	True	0.667001
2	19	False	0.812214
3	19	True	0.661252
4	20	False	0.805281
...	...	...	...
125	80	True	0.259731
126	81	False	0.394946
127	81	True	0.256759
128	82	False	0.398970
129	82	True	0.248060