import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Global seaborn theme: dark grid, enlarged fonts, 7x5 default figure size.
sns.set_theme(
    style="darkgrid",
    font_scale=1.5,
    rc={"figure.figsize": (7, 5)},
)

# NumPy random Generator, created without a seed.
rng = np.random.default_rng()

# Load the census and add a boolean column that is True where movie == "Barbie".
census = pd.read_csv("movie_census.csv")
census["Barbie"] = census["movie"].eq("Barbie")
census

# Fraction of census rows whose Barbie flag is True.
actual_barbie = census["Barbie"].mean()
actual_barbie

np.float64(0.5302792307692308)

# Convenience sample: 10 rows drawn without replacement from ages 18-22 inclusive.
undergrads = census.loc[census["age"].between(18, 22)].sample(n=10, replace=False)
undergrads["Barbie"].mean()

np.float64(0.6)

# Number of rows in the undergrad sample.
undergrads.shape[0]

10

# Size of the undergrad sample as a percentage of the census row count.
print("Percent of Berkeley:", undergrads.shape[0] / census.shape[0] * 100)

Percent of Berkeley: 0.0007692307692307692

# Convenience sample: 100 rows drawn from respondents aged 65 or older.
elderly = census.loc[census["age"].ge(65)].sample(n=100)
elderly["Barbie"].mean()

np.float64(0.29)

# Number of rows in the elderly sample.
elderly.shape[0]

100

# Size of the elderly sample as a percentage of the census row count.
print("Percent of Berkeley:", elderly.shape[0] / census.shape[0] * 100)

Percent of Berkeley: 0.007692307692307693

# Mean of every numeric column within each (age, wears_birkenstocks) group,
# with the group keys moved back into ordinary columns.
votes_by_barbie = (
    census.groupby(["age", "wears_birkenstocks"])
    .mean(numeric_only=True)
    .reset_index()
)

votes_by_barbie

import plotly.express as px
# Group-level Barbie mean against age, coloured by the wears_birkenstocks flag.
px.scatter(
    votes_by_barbie,
    title="Preferences by Demographics",
    x="age",
    y="Barbie",
    color="wears_birkenstocks",
)

# Simple random sample of n census rows. Sampling without replacement is the
# default for DataFrame.sample; replace=False is spelled out for clarity.
n = 2000
random_sample = census.sample(n=n, replace=False)

random_sample["Barbie"].mean()

np.float64(0.538)

# Census-wide Barbie proportion, displayed for comparison with the sample mean.
actual_barbie

np.float64(0.5302792307692308)

# Draw a fresh simple random sample of 800 census rows.
n = 800
random_sample = census.sample(n=n, replace=False)

# Sample proportion, and its error relative to the census proportion.
sample_barbie = random_sample["Barbie"].mean()
err = abs(sample_barbie - actual_barbie) / actual_barbie

# Show the comparison as Markdown-formatted notebook output.
from IPython.display import Markdown
Markdown(
    f"**Actual** = {actual_barbie:.4f}, **Sample** = {sample_barbie:.4f}, "
    f"**Err** = {100*err:.2f}%."
)

nrep = 1000   # number of simulated polls
n = 800       # rows drawn in each simulated poll

# Repeat the poll nrep times: each entry is the Barbie proportion of one
# simple random sample of n census rows, drawn without replacement.
poll_result = [
    census.sample(n, replace=False)["Barbie"].mean()
    for _ in range(nrep)
]

# Density-normalised histogram of the simulated poll results, with a dashed
# vertical line at the census proportion.
fig = px.histogram(poll_result, nbins=50, histnorm='probability density')
fig.add_vline(x=actual_barbie, line_color="orange", line_dash="dash", line_width=3)
fig.update_layout(showlegend=False)

# Overlay a Gaussian kernel density estimate, evaluated on 100 evenly spaced
# points spanning the observed results (bandwidth is selected automatically).
from scipy import stats
from plotly import graph_objects as go
x = np.linspace(np.min(poll_result), np.max(poll_result), num=100)
fig.add_trace(
    go.Scatter(
        x=x,
        y=stats.gaussian_kde(poll_result)(x),
        mode='lines',
        line={'color': 'red', 'width': 3},
    )
)

# Seaborn version of the same plot: density histogram with a KDE curve and a
# dashed line at the census proportion.
sns.histplot(poll_result, kde=True, stat='density')
plt.axvline(actual_barbie, linewidth=2, linestyle='dashed', color='orange')

<matplotlib.lines.Line2D at 0x16bde76d0>

# Fraction of simulated polls whose Barbie proportion exceeds 0.5.
# The boolean mean divides by the actual number of simulations, so the result
# stays correct if the simulation count is changed.
poll_result = pd.Series(poll_result)
(poll_result > 0.5).mean()

np.float64(0.944)

# One multinomial draw: 100 trials over three categories with probabilities
# 0.60, 0.30 and 0.10. Uses the Generator created during setup rather than
# NumPy's legacy global random state.
rng.multinomial(100, [0.60, 0.30, 0.10])

array([61, 25, 14])

# Twenty independent multinomial draws of 100 trials each; every row of the
# result holds the three category counts for one draw. Uses the Generator
# created during setup rather than NumPy's legacy global random state.
rng.multinomial(100, [0.60, 0.30, 0.10], size=20)

array([[56, 35,  9],
       [64, 26, 10],
       [54, 40,  6],
       [61, 27, 12],
       [60, 27, 13],
       [59, 32,  9],
       [55, 34, 11],
       [63, 27, 10],
       [63, 26, 11],
       [55, 36,  9],
       [58, 31, 11],
       [55, 35, 10],
       [65, 26,  9],
       [56, 34, 10],
       [54, 32, 14],
       [59, 36,  5],
       [61, 28, 11],
       [55, 41,  4],
       [62, 30,  8],
       [56, 31, 13]])

Lecture 9 – Data 100, Fall 2024

Barbie v. Oppenheimer

Convenience sample: Undergrads in Prof. Gonzalez OH

Convenience sample: Elderly at a Campus Event

Check for bias

Simple Random Sample

Quantifying chance error

Simulating from a Multinomial Distribution

Marbles

	age	wears_birkenstocks	movie	Barbie
0	35	False	Barbie	True
1	42	True	Oppenheimer	False
2	55	False	Barbie	True
3	77	True	Oppenheimer	False
4	31	False	Barbie	True
...	...	...	...	...
1299995	62	True	Barbie	True
1299996	78	True	Oppenheimer	False
1299997	68	False	Oppenheimer	False
1299998	82	True	Oppenheimer	False
1299999	23	False	Barbie	True

	age	wears_birkenstocks	Barbie
0	18	False	0.819594
1	18	True	0.667001
2	19	False	0.812214
3	19	True	0.661252
4	20	False	0.805281
...	...	...	...
125	80	True	0.259731
126	81	False	0.394946
127	81	True	0.256759
128	82	False	0.398970
129	82	True	0.248060