Lecture 1 – Data 100, Summer 2023¶

by Lisa Yan

  • adapted from Joseph E. Gonzalez, Anthony D. Joseph, Josh Hug, Suraj Rampure.
  • minor updates by Narges Norouzi, Fernando Pérez, and Dominic Liu.

Software Packages¶

We will be using a wide range of Python software packages. To install and manage these packages, we will use the Conda environment manager. The following is a list of packages we will routinely use in lectures and homeworks:

In [2]:
# linear algebra, probability
import numpy as np

# data manipulation
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

## interactive visualization library
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px

We will learn how to use all of the technologies used in this demo.

For now, just sit back and think critically about the data and our guided analysis.

1. Starting with a Question: Who are you (the students of Data 100)?¶

This is a pretty vague question, but let's start with the goal of learning something about the students in the class.

Here are some "simple" questions:

  1. How many students do we have?
  2. What are your majors?
  3. What year are you?
  4. Diversity ...?

2. Data Acquisition and Cleaning¶

In Data 100 we will study various methods to collect data.

To answer this question, I downloaded the course roster and extracted everyone's names and majors.

In [3]:
# pd stands for pandas, which we will start learning tomorrow
# some pandas syntax is shared with data8's datascience package
majors = pd.read_csv("data/majors.csv")
names = pd.read_csv("data/names.csv")

3. Exploratory Data Analysis¶

In Data 100 we will study exploratory data analysis and practice analyzing new datasets.

I didn't tell you the details of the data! Let's check out the data and infer its structure. Then we can start answering the simple questions we posed.

Peeking at the Data¶

In [4]:
majors.head(20)
Out[4]:
Majors Terms in Attendance
0 Environmental Sciences BS 8
1 Letters & Sci Undeclared UG 7
2 Letters & Sci Undeclared UG 3
3 Data Science BA 8
4 Computer Science BA 5
5 Civil Engineering BS 8
6 Data Science BA 8
7 Summer Internatnl Visitor UG U
8 Data Science BA 5
9 Data Science BA 3
10 Cognitive Science BA 8
11 Applied Mathematics BA 7
12 Energy Engineering BS 7
13 Electrical Eng & Comp Sci BS 8
14 Computer Science BA 7
15 Letters & Sci Undeclared UG 8
16 Data Science BA, Sociology BA 8
17 Data Science BA 5
18 Computer Science BA, Mathematics BA 7
19 Civil Engineering BS 8
In [5]:
names.head()
Out[5]:
Name Role
0 Leah Student
1 Mariano Student
2 Jocelyn Student
3 LOGAN Student
4 Shaojie Student

What is one potential issue we may need to address in this data?¶

Answer: The capitalization of the names is inconsistent.

In the sample above, we notice that some of the names are capitalized and some are not. This will be an issue in our later analysis, so let's convert all names to lowercase.

In [6]:
names['Name'] = names['Name'].str.lower()
In [7]:
names.head()
Out[7]:
Name Role
0 leah Student
1 mariano Student
2 jocelyn Student
3 logan Student
4 shaojie Student

How many records do we have?¶

In [8]:
print(len(names))
print(len(majors))
176
176

Based on what we know of our class, each record is most likely a student.

Understanding the structure of data¶

It is important that we understand the meaning of each field and how the data is organized.

In [9]:
names.head()
Out[9]:
Name Role
0 leah Student
1 mariano Student
2 jocelyn Student
3 logan Student
4 shaojie Student

Q: What is the meaning of the Role field?

A: Understanding the meaning of a field can often be achieved by looking at the type of data it contains (in particular, the counts of its unique values).

We use the value_counts() method in pandas:

In [10]:
names['Role'].value_counts().to_frame()  # counts of unique Roles
Out[10]:
Role
Student 175
#REF! 1

It appears that one student has an erroneous role given as "#REF!". What else can we learn about this student? Let's see their name.

In [11]:
# boolean index to find rows where Role is #REF!
names[names['Role'] == "#REF!"]
Out[11]:
Name Role
22 #ref! #REF!

Though this single bad record won't have much of an impact on our analysis, we can clean our data by removing this record.

In [12]:
names = names[names['Role'] != "#REF!"]

Double check: Let's verify that we removed only the single bad record.

In [13]:
names['Role'].value_counts().to_frame()  # again, counts of unique Roles
Out[13]:
Role
Student 175

Remember we loaded in two files. Let's explore the fields of majors and check for bad records:

In [14]:
majors.columns   # get column names
Out[14]:
Index(['Majors', 'Terms in Attendance'], dtype='object')
In [15]:
majors['Terms in Attendance'].value_counts().to_frame()
Out[15]:
Terms in Attendance
5 53
7 46
8 38
3 17
U 14
6 4
G 2
#REF! 1
4 1

It looks like the numbers represent semesters, G represents graduate students, and U might represent something else, perhaps campus visitors. But we do still have a bad record:

In [16]:
majors[majors['Terms in Attendance'] == "#REF!"]
Out[16]:
Majors Terms in Attendance
169 #REF! #REF!
In [17]:
majors = majors[majors['Terms in Attendance'] != "#REF!"]
majors['Terms in Attendance'].value_counts().to_frame()
Out[17]:
Terms in Attendance
5 53
7 46
8 38
3 17
U 14
6 4
G 2
4 1

Detail: The bad majors record sat at a different row index than the bad names record. So while the number of records in each table matches, the row indices don't align, and we'll have to keep these tables separate in order to do our analysis.
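The index mismatch can be seen on a toy example. This is a sketch with synthetic mini rosters (names_demo and majors_demo are made up, not the actual course files):

```python
import pandas as pd

# Hypothetical mini versions of the two rosters (synthetic data, not the
# actual course files) to illustrate the index mismatch.
names_demo = pd.DataFrame({"Name": ["ana", "#ref!", "ben", "cai"]})
majors_demo = pd.DataFrame({"Majors": ["Data Science BA", "Economics BA",
                                       "#REF!", "Computer Science BA"]})

# Drop the bad record from each table; the bad rows sit at different indices
names_demo = names_demo[names_demo["Name"] != "#ref!"]       # drops index 1
majors_demo = majors_demo[majors_demo["Majors"] != "#REF!"]  # drops index 2

print(names_demo.index.tolist())   # [0, 2, 3]
print(majors_demo.index.tolist())  # [0, 1, 3]
```

The two tables now have the same length, but their surviving indices differ, so aligning them row-by-row on index would pair up the wrong students.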

Summarizing the Data¶

We will often want to summarize the data numerically or visually. The describe() method provides a brief, high-level description of our data frame.

In [18]:
names.describe()
Out[18]:
Name Role
count 175 175
unique 164 1
top max Student
freq 2 175
In [19]:
majors.describe()
Out[19]:
Majors Terms in Attendance
count 175 175
unique 45 8
top Letters & Sci Undeclared UG 5
freq 43 53

Q: What do you think top and freq represent?

A: top: most frequent entry, freq: the frequency of that entry
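A minimal sketch of this behavior, using a tiny synthetic Series (not our actual roster data):

```python
import pandas as pd

# A tiny synthetic Series showing what describe() reports for
# non-numeric data: count, unique, top (the modal value), freq (its count)
roles = pd.Series(["Student", "Student", "Student", "TA"])
summary = roles.describe()
print(summary)
# count = 4, unique = 2, top = "Student", freq = 3
```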


What are your majors?¶

What are the top majors?

In [20]:
majors_count = (       # method chaining in pandas
    majors['Majors']
    .value_counts()
    .sort_values(ascending=False) # highest first
    .to_frame()
    .head(20)          # get the top 20
)

# or, uncomment below to parse double majors
# majors_count = (
#     majors['Majors']
#     .str.split(", ") # get double majors
#     .explode()       # one major to every row
#     .value_counts()
#     .sort_values(ascending=True)
#     .to_frame()
#     .tail(20)
# )

majors_count
Out[20]:
Majors
Letters & Sci Undeclared UG 43
Data Science BA 34
Computer Science BA 11
Summer Internatnl Visitor UG 8
Chemical Engineering BS 6
Civil Engineering BS 6
Applied Mathematics BA 5
Electrical Eng & Comp Sci BS 5
Molecular & Cell Biology BA 4
Cognitive Science BA 4
Environ Econ & Policy BS 3
Economics BA 3
Summer Domestic Visitor UG 3
Environmental Sciences BS 2
Computer Science BA, Mathematics BA 2
Summer Internatnl Visitor UG, UCBX Concurrent International 2
Chemical Biology BS, Letters & Sci Undeclared UG 2
Sociology BA 2
Mathematics BA 2
Applied Mathematics BA, Computer Science BA 2

We will often use visualizations to make sense of data¶

In Data 100, we will deal with many different kinds of data (not just numbers) and we will study techniques for describing each type of data.

How can we summarize the Majors field? A good starting point might be to use a bar plot:

In [21]:
# interactive using plotly
fig = px.bar(majors_count, orientation='h')
fig.update_layout(showlegend=False,
                  xaxis_title='Count',
                  yaxis_title='Major')

What year are you?¶

In [22]:
fig = px.histogram(majors['Terms in Attendance'].sort_values(),
                   histnorm='probability')
fig.update_layout(showlegend=False,
                  xaxis_title="Term",
                  yaxis_title="Fraction of Class")

Diversity and Data Science:¶

Unfortunately, surveys of data scientists suggest that there are far fewer women in data science.

To learn more check out the Kaggle Executive Summary or study the Raw Data.


What fraction of the students are female?¶

We often ask this question because we want to improve the data science program here at Berkeley, especially now that it has grown into a new college, the College of Computing, Data Science, and Society (CDSS), Berkeley's first new college in 50 years.

This is actually a fairly complex question. What do we mean by female? Is this a question about the sex or gender identity of the students? They are not the same thing.

  • Sex refers predominantly to biological characteristics.
  • Gender is much more complex with societal and cultural implications and refers to how people identify themselves.

Most likely, CDSS is interested in improving gender diversity by ensuring that our program is inclusive. Let's reword this question:

Reworded: What is the gender diversity of our students?¶

How could we answer this question?¶

In [23]:
print(majors.columns)
print(names.columns)
Index(['Majors', 'Terms in Attendance'], dtype='object')
Index(['Name', 'Role'], dtype='object')

We don't have the data.¶

Where can we get the data?


(1) We could run a survey!¶

In Data 100, we will learn different ways people collect data (e.g. sampling and census) and their limitations.

We actually did, in the Pre-semester Survey. However, it's not due yet, so we don't have the complete data. Here's what we have so far:

Considering that UC Berkeley's enrolled undergraduate students are 54% women, Data 100 is not doing great :(

Two limitations of this result:

  • Not everyone filled out the form. (But is it a representative sample of our students?)
  • The question is optional, so some students chose not to answer it. (What problem will this cause?)

(2) ... or we could try to use the data we have to estimate the _sex_ of the students as a proxy for gender?!?!¶

Please do not attempt option (2) alone. What I am about to do is flawed in so many ways and we will discuss these flaws in a moment and throughout the semester.

However, it will illustrate some very basic inferential modeling and how we might combine multiple data sources to try and reason about something we haven't measured.

The idea is to use first name as a proxy for sex, as a proxy for gender.

$$ \text{Name} \rightarrow \text{Sex} \rightarrow \text{Gender} $$

What potential problems do you see with the method?

To attempt option (2), we will first look at a second data source.


US Social Security Data¶

To study what a name tells about a person we will download data from the United States Social Security office containing the number of registered names broken down by year, sex, and name. This is often called the Baby Names Data as social security numbers (SSNs) are typically given at birth.

1. What does a name tell us about a person?¶

A: In this demo we'll use a person's name to estimate their sex. But a person's name tells us many things (more on this later).

2. Acquire data programmatically¶

Note 1: In the following we download the data programmatically to ensure that the process is reproducible.

Note 2: We also load the data directly into Python without decompressing the zip file.

In Data 100 we will think a bit more about how we can be efficient in our data analysis to support processing large datasets.

In [24]:
import urllib.request
import os.path

# Download data from the web directly
data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "babynames.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

        
# Load data without unzipping the file
import zipfile
babynames = [] 
with zipfile.ZipFile(local_filename, "r") as zf:
    data_files = [f for f in zf.filelist if f.filename[-3:] == "txt"]
    def extract_year_from_filename(fn):
        return int(fn[3:7])
    for f in data_files:
        year = extract_year_from_filename(f.filename)
        with zf.open(f) as fp:
            df = pd.read_csv(fp, names=["Name", "Sex", "Count"])
            df["Year"] = year
            babynames.append(df)
babynames = pd.concat(babynames)


babynames.head() # show the first few rows
Out[24]:
Name Sex Count Year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880

2 (cont). Understanding the Setting¶

In Data 100 you will have to learn about different data sources (and their limitations) on your own.

Reading from the Social Security Administration's description (bolded for readability):

All names are from Social Security card applications for births that occurred in the United States after 1879. Note that many people born before 1937 never applied for a Social Security card, so their names are not included in our data. For others who did apply, our records may not show the place of birth, and again their names are not included in our data.

To safeguard privacy, we exclude from our tabulated lists of names those that would indicate, or would allow the ability to determine, names with fewer than 5 occurrences in any geographic area. If a name has less than 5 occurrences for a year of birth in any state, the sum of the state counts for that year will be less than the national count.

All data are from a 100% sample of our records on Social Security card applications as of March 2022.
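As a hedged illustration of the suppression rule above, with made-up numbers for a hypothetical name (these figures are not from the real dataset):

```python
import pandas as pd

# Synthetic illustration of the suppression rule: state counts under 5 are
# excluded from the published state files, so the sum of the published
# state counts can fall below the national count.
state_counts = pd.Series({"CA": 7, "NY": 4, "TX": 3})  # true counts, hypothetical name

national_count = state_counts.sum()                            # all 14 count nationally
published_state_total = state_counts[state_counts >= 5].sum()  # NY, TX suppressed -> 7

print(national_count, published_state_total)  # 14 7
```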

A little bit of data cleaning¶

Examining the data:

In [25]:
babynames
Out[25]:
Name Sex Count Year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
... ... ... ... ...
31910 Zuberi M 5 2022
31911 Zydn M 5 2022
31912 Zylon M 5 2022
31913 Zymeer M 5 2022
31914 Zymeire M 5 2022

2085158 rows × 4 columns

In our earlier analysis we converted names to lower case. We will do the same again here:

In [26]:
babynames['Name'] = babynames['Name'].str.lower()
babynames.head()
Out[26]:
Name Sex Count Year
0 mary F 7065 1880
1 anna F 2604 1880
2 emma F 2003 1880
3 elizabeth F 1939 1880
4 minnie F 1746 1880

3. Exploratory Data Analysis (and Visualization)¶

How many people does this data represent?

In [27]:
format(babynames['Count'].sum(), ',d') # sum of 'Count' column
Out[27]:
'365,296,191'
In [28]:
format(len(babynames), ',d')       # number of rows
Out[28]:
'2,085,158'

Q: Is this number low or high?

Answer

It seems low (the 2021 US population was 331.9 million). However, the Social Security website states:

All names are from Social Security card applications for births that occurred in the United States after 1879. Note that many people born before 1937 never applied for a Social Security card, so their names are not included in our data. For others who did apply, our records may not show the place of birth, and again their names are not included in our data. All data are from a 100% sample of our records on Social Security card applications as of the end of February 2016.

Trying a simple query:

In [29]:
# how many babies named Nora were born in 2018?
babynames[(babynames['Name'] == 'nora') & (babynames['Year'] == 2018)]
Out[29]:
Name Sex Count Year
29 nora F 5836 2018
28545 nora M 7 2018

Trying a more complex query using query() (to be discussed soon!):

In [30]:
# how many baby names contain the word "data"?
babynames.query('Name.str.contains("data")', engine='python')
Out[30]:
Name Sex Count Year
9766 kidata F 5 1975
24914 datavion M 5 1995
23614 datavious M 7 1997
12103 datavia F 7 2000
27510 datavion M 6 2001
28918 datari M 5 2001
29144 datavian M 5 2002
29145 datavious M 5 2002
30576 datavion M 5 2004
17141 datavia F 5 2005
31035 datavion M 5 2005
31028 datavion M 6 2006
33345 datavious M 5 2007
33346 datavius M 5 2007
33416 datavious M 5 2008
33095 datavion M 5 2009
32510 datavious M 5 2010

Temporal Patterns Conditioned on Male/Female¶

In Data 100 we will study how to visualize and analyze relationships in data.

In this example we construct a pivot table which aggregates the number of babies registered for each year by Sex.

We'll discuss pivot tables in detail in the next few lectures.

In [31]:
# counts number of M and F babies per year
year_sex = pd.pivot_table(
        babynames, 
        index=['Year'], # the row index
        columns=['Sex'], # the column values
        values='Count', # the field(s) to be processed in each group
        aggfunc=np.sum,
    )[["M", "F"]]

year_sex.head()
Out[31]:
Sex M F
Year
1880 110490 90994
1881 100737 91953
1882 113686 107847
1883 104625 112320
1884 114442 129019

We can visualize these descriptive statistics:

In [32]:
# more interactive using plotly
fig = px.line(year_sex)
fig.update_layout(title="Total Babies per Year",
                  yaxis_title="Number of Babies")

How many unique names for each year?¶

In [33]:
# counts number of M and F *names* per year
year_sex_unique = pd.pivot_table(babynames, 
        index=['Year'], 
        columns=['Sex'], 
        values='Name', 
        aggfunc=lambda x: len(np.unique(x)),
    )
fig = px.line(year_sex_unique)
fig.update_layout(title="Unique Names Per Year",
                  yaxis_title="Number of Baby Names")

Some observations:

  1. Registration data seems limited in the early 1900s, because many people did not register before 1937.
  2. You can see the Baby Boomers (born 1940s to 1960s) and the Echo Boomers (aka millennials, born 1980s to 2000).
  3. Female names show slightly greater diversity than male names.

4. Understand the World: Prediction and Inference¶

Let's use the Baby Names dataset to estimate the fraction of female students in the class.

Compute the Proportion of Female Babies For Each Name¶

First, we construct a pivot table to compute the total number of babies registered for each Name, broken down by Sex.

In [34]:
# counts number of M and F babies per name
name_sex = pd.pivot_table(babynames, index='Name', columns='Sex', values='Count',
                            aggfunc='sum', fill_value=0., margins=True)
name_sex.sample(5)
Out[34]:
Sex F M All
Name
shaquaya 96 0 96
zenniyah 5 0 5
sarahanne 262 0 262
kinnick 5 490 495
alberico 0 6 6
In [35]:
name_sex.loc["alex",]
Out[35]:
Sex
F        9558
M      281054
All    290612
Name: alex, dtype: int64

Second, we compute the proportion of female babies for each name. This is our estimated probability that a baby with that name is female:

$$ \hat{\textbf{P}}(\text{Female} \mid \text{Name}) = \frac{\textbf{Count}(\text{Female and Name})}{\textbf{Count}(\text{Name})} $$
In [36]:
prop_female = (name_sex['F'] / name_sex['All']).rename("Prop. Female")
prop_female.to_frame().sample(10)
Out[36]:
Prop. Female
Name
yimi 0.0
donold 0.0
eilam 0.0
kathleene 1.0
xabier 0.0
taeleigh 1.0
nicolaus 0.0
sarabella 1.0
cadee 1.0
sairus 0.0

Test a few names¶

In [37]:
prop_female["bella"]
Out[37]:
0.9994112105511069
In [38]:
prop_female["dominic"]
Out[38]:
0.006654203581782221
In [39]:
prop_female["joey"]
Out[39]:
0.11911693756342644
In [40]:
prop_female["ani"]
Out[40]:
0.9983204568357407
In [41]:
prop_female["minh"]
Out[41]:
0.12516411378555797
In [42]:
prop_female["pat"]
Out[42]:
0.6001286385257426
In [43]:
prop_female["jaspreet"]
Out[43]:
0.6043956043956044

Next, Build a Simple Classifier (Model)¶

We can define a function to return the most likely sex for a name. If there is an exact tie, or the name does not appear in the Social Security dataset, the function returns Unknown.

In [44]:
def sex_from_name(name):
    lower_name = name.lower()
    if lower_name not in prop_female.index or prop_female[lower_name] == 0.5:
        return "Unknown"
    elif prop_female[lower_name] > 0.5:
        return "F"
    else:
        return "M"
In [45]:
sex_from_name("nora")
Out[45]:
'F'
In [46]:
sex_from_name("joey")
Out[46]:
'M'
In [47]:
sex_from_name("pat")
Out[47]:
'F'

4 (cont). Estimating the fraction of female and male students in Data 100¶

Let's try out our simple classifier! We'll use the apply() function to classify each student name:

In [48]:
# apply sex_from_name to each student name
names['Pred. Sex'] = names['Name'].apply(sex_from_name)
px.bar(names['Pred. Sex'].value_counts()/len(names))

Interpreting the unknowns¶

That's a lot of Unknowns.

...But we can still estimate the fraction of female students in the class:

In [49]:
count_by_sex = names['Pred. Sex'].value_counts().to_frame()
count_by_sex
Out[49]:
Pred. Sex
M 84
F 55
Unknown 36
In [50]:
count_by_sex.loc['F']/(count_by_sex.loc['M'] + count_by_sex.loc['F'])
Out[50]:
Pred. Sex    0.395683
dtype: float64

Questions:

  1. How do we feel about this estimate?
  2. Do we trust it?




Q: What fraction of students in Data 100 this semester have names in the SSN dataset?

In [51]:
print("Fraction of names in the babynames data:", 
      names['Name'].isin(prop_female.index).mean())
Fraction of names in the babynames data: 0.7942857142857143

Q: Which names are not in the dataset?

Why might these names not appear?

In [52]:
# the tilde ~ negates the boolean index. More next week.
names[~names['Name'].isin(prop_female.index)].sample(10)
Out[52]:
Name Role Pred. Sex
28 youyansu Student Unknown
161 qingyang Student Unknown
4 shaojie Student Unknown
130 qiuxuan Student Unknown
131 canhui Student Unknown
110 ayeon Student Unknown
125 haorui Student Unknown
152 jasleem Student Unknown
85 yanfu Student Unknown
172 zhuoxuan Student Unknown
In [53]:
"hangxing" in prop_female.index
Out[53]:
False

Using simulation to estimate uncertainty¶

Previously, we treated a name that is given to females 40% of the time as a "Male" name, because the probability was less than 0.5. This doesn't capture our uncertainty.

We can use simulation to provide a better distributional estimate. We'll use 50% for names not in the Baby Names dataset.

In [54]:
# add the computed SSN F proportion to each row. 0.5 for Unknowns.
# merge() effectively "join"s two tables together. to be covered next week.
names['Prop. Female'] = (
    names[['Name']].merge(prop_female, how='left', left_on='Name', 
                          right_index=True)['Prop. Female']
        .fillna(0.5)
)
names.head(10)
Out[54]:
Name Role Pred. Sex Prop. Female
0 leah Student F 0.998094
1 mariano Student M 0.000000
2 jocelyn Student F 0.996825
3 logan Student M 0.073723
4 shaojie Student Unknown 0.500000
5 janmesh Student Unknown 0.500000
6 shaun Student M 0.042693
7 cara Student F 0.998826
8 karl Student M 0.009020
9 haonan Student Unknown 0.500000

Running the simulation¶

In [55]:
# if a randomly picked number from [0.0, 1.0) is under the Female proportion, then F
names['Sim. Female'] = np.random.rand(len(names)) < names['Prop. Female']
names.tail(20)
Out[55]:
Name Role Pred. Sex Prop. Female Sim. Female
156 brett Student M 0.020783 False
157 teddy Student M 0.046772 False
158 yifei Student F 1.000000 True
159 jeffae Student Unknown 0.500000 True
160 sriya Student F 1.000000 True
161 qingyang Student Unknown 0.500000 False
162 tan Student M 0.018425 False
163 xuanning Student Unknown 0.500000 True
164 xuanjie Student Unknown 0.500000 True
165 alexa Student F 0.997828 True
166 bryan Student M 0.004424 False
167 seoin Student Unknown 0.500000 False
168 aaron Student M 0.007205 False
169 xinyi Student F 1.000000 True
170 tisha Student F 1.000000 True
171 zhen Student M 0.197368 False
172 zhuoxuan Student Unknown 0.500000 False
173 suge Student Unknown 0.500000 False
174 king Student M 0.000979 False
175 samantha Student F 0.997949 True

Given such a simulation, we can compute the fraction of the class that is female.

  1. How do we feel about this new estimate?
  2. Do we trust it?
In [56]:
# proportion of Trues in the 'Sim. Female' column
names['Sim. Female'].mean()
Out[56]:
0.4057142857142857

Now that we're performing a simulation, the above proportion is random: it depends on the random numbers we picked to determine whether a student was Female.

Let's run the above simulation several times and see what the distribution of this female proportion is. The cell below may take a few seconds to run.

In [57]:
# simulates one class: returns the simulated fraction of female students
def simulate_class(students):
    is_female = students['Prop. Female'] > np.random.rand(len(students))
    return np.mean(is_female)

sim_frac_female = np.array([simulate_class(names) for _ in range(10000)])
In [58]:
fig = ff.create_distplot([sim_frac_female], ['Fraction Female'], bin_size=0.0025, show_rug=False)
fig.update_layout(xaxis_title='Prop. Female',
                  yaxis_title='Percentage',
                  title='Distribution of Simulated Proportions of Females in the Class')
ax = sns.histplot(sim_frac_female, stat='probability', kde=True, bins=20)
sns.rugplot(sim_frac_female, ax=ax)
ax.set_xlabel("Fraction Female")
ax.set_title('Distribution of Simulated Fractions Female in the Class');

In Data 100 we will learn about kernel density estimates, rug plots, and other visualization techniques.





Limitations of Baby Names dataset¶

UC Berkeley teaches students from around the world.¶

We saw with our Simple Classifier that many student names were classified as "Unknown," often because they weren't in the SSN Baby Names Dataset.

Recall the SSN dataset:

All names are from Social Security card applications for births that occurred in the United States after 1879.

That statement is not reflective of all of our students!!

In [59]:
# students who were not in the SSN Baby Names Dataset
names[~names['Name'].isin(prop_female.index)].sample(10)
Out[59]:
Name Role Pred. Sex Prop. Female Sim. Female
29 yuwen Student Unknown 0.5 False
131 canhui Student Unknown 0.5 True
28 youyansu Student Unknown 0.5 True
144 pujitha Student Unknown 0.5 True
44 juelius Student Unknown 0.5 False
115 zcjanin Student Unknown 0.5 True
81 khalled Student Unknown 0.5 True
9 haonan Student Unknown 0.5 False
20 nathaniel-vivi Student Unknown 0.5 True
71 xiahe Student Unknown 0.5 True

Names change over time.¶

Data reaching back to 1879 (or even 1937) does not reflect the diversity and context of U.S. baby names today.

Here are some choice names that show how the sex distribution of particular names has varied over time:

In [60]:
subset_names = ["edris", "jamie", "jordan", "leslie", "taylor", "willie"]
subset_babynames_year = (pd.pivot_table(
                    babynames[babynames['Name'].isin(subset_names)],
                    index=['Name', 'Year'], columns='Sex', values='Count',
                    aggfunc='sum', fill_value=0, margins=True)
                 .drop(labels='All', level=0, axis=0) # drop cumulative row
                 .rename_axis(None, axis=1) # remove pivot table col name
                 .reset_index() # move (name, year) back into columns
                 .assign(Propf = lambda row: row.F/(row.F + row.M))
                )
ax = sns.lineplot(data=subset_babynames_year,
                  x='Year', y='Propf', hue='Name')
ax.set_title("Ratio of Female Babies over Time for Select Names")
ax.set_ylabel("Proportion of Female Names in a Year")
ax.legend(loc="lower left");

Bonus: How we selected which names to plot¶

Curious as to how we got the above names? We picked out two types of names:

  • names that had high variability in their F/M ratio across years
  • common names whose overall F/M ratio fell within a set range

Check it out:

In [61]:
"""
get a subset of names that:
    have had propf above a threshold, as well as
    have been counted for more than a certain number of years
Note: while we could do our analysis over all names,
    it turns out many names don't matter.
    So to save computation power, we just work
    with a subset of names we know may be candidates.
"""
# these are thresholds we set as data analysts
propf_min = 0.2
propf_max = 0.8
year_thresh = 30

propf_countyear = (babynames
                   .groupby('Name').count()
                   .merge(prop_female.to_frame(), on='Name')
                   .rename(columns={'Prop. Female': 'Propf'})
                   .query("@propf_min < Propf < @propf_max & Year > @year_thresh & Name != 'All'")
                  )[['Propf', 'Year']]
propf_countyear
Out[61]:
Propf Year
Name
aalijah 0.352246 40
aamari 0.380074 35
aaren 0.240409 74
aarin 0.270723 77
aarion 0.221212 59
... ... ...
zephyr 0.229419 76
ziah 0.742308 45
ziyan 0.432911 36
zohar 0.643636 34
zuriel 0.200661 63

1631 rows × 2 columns

In [62]:
# construct a pivot table of (name, year) to count
keep_names = propf_countyear.reset_index()['Name']
name_year_sex = (pd.pivot_table(
                    babynames[babynames['Name'].isin(keep_names)],
                    index=['Name', 'Year'], columns='Sex', values='Count',
                    aggfunc='sum', fill_value=0, margins=True)
                 .drop(labels='All', level=0, axis=0) # drop cumulative row
                 .rename_axis(None, axis=1) # remove pivot table col name
                 .reset_index() # move (name, year) back into columns
                 .assign(Propf = lambda row: row.F/(row.F + row.M))
                )
name_year_sex
Out[62]:
Name Year F M All Propf
0 aalijah 1994 5 0 5 1.000000
1 aalijah 1995 5 0 5 1.000000
2 aalijah 1999 5 0 5 1.000000
3 aalijah 2001 9 0 9 1.000000
4 aalijah 2002 9 6 15 0.600000
... ... ... ... ... ... ...
89419 zuriel 2018 28 44 72 0.388889
89420 zuriel 2019 25 64 89 0.280899
89421 zuriel 2020 31 63 94 0.329787
89422 zuriel 2021 26 59 85 0.305882
89423 zuriel 2022 24 69 93 0.258065

89424 rows × 6 columns

In [63]:
"""
Compute two statistics per name:
- Count of number of babies with name
- Variance of proportion of females
  (i.e., how much the proportion of females varied
  across different years)
"""
names_to_include = 40
group_names =  (name_year_sex
                       .groupby('Name')
                       .agg({'Propf': 'var', 'All': 'sum'})
                       .rename(columns={'Propf': 'Propf Var', 'All': 'Total'})
                       .reset_index()
                      )
In [64]:
# pick some high variance names
high_variance_names = (group_names
                       .sort_values('Propf Var', ascending=False)
                       .head(names_to_include)
                       .sort_values('Total', ascending=False)
                      )

high_variance_names.head(5)
Out[64]:
Name Propf Var Total
19 aime 0.238514 2270
962 linzy 0.229197 1518
979 luan 0.252101 1255
468 edris 0.230026 1050
961 linzie 0.246861 900
In [65]:
# pick some common names
common_names = (group_names
                .sort_values('Total', ascending=False)
                .head(names_to_include)
               )
common_names.head(10)
Out[65]:
Name Propf Var Total
1586 willie 0.026477 595921
729 jordan 0.011999 525151
1484 taylor 0.110481 439450
950 leslie 0.147733 381953
653 jamie 0.021096 357061
70 angel 0.032182 349577
938 lee 0.005389 294727
706 jessie 0.027124 279884
1026 marion 0.017836 261205
358 dana 0.059549 246020