Lecture 1 – Data 100, Spring 2023¶

by Lisa Yan

  • adapted from Joseph E. Gonzalez, Anthony D. Joseph, Josh Hug, Suraj Rampure.
  • minor updates by Narges Norouzi and Fernando Pérez.

Software Packages¶

We will be using a wide range of different Python software packages. To install and manage these packages we will be using the Conda environment manager. The following is a list of packages we will routinely use in lectures and homeworks:

In [65]:
# linear algebra, probability
import numpy as np

# data manipulation
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

## interactive visualization library
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
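To set up a similar environment locally, a minimal Conda workflow might look like the following sketch. The environment name `data100` and the Python version are illustrative assumptions, not the official course setup:

```shell
# create an isolated environment for the course (name is illustrative)
conda create -n data100 python=3.10

# activate it and install the packages imported above
conda activate data100
conda install numpy pandas matplotlib seaborn plotly
```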

We will learn how to use all of the technologies used in this demo.

For now, just sit back and think critically about the data and our guided analysis.

1. Starting with a Question: Who are you (the students of DS100)?¶

This is a pretty vague question but let's start with the goal of learning something about the students in the class.

Here are some "simple" questions:

  1. How many students do we have?
  2. What are your majors?
  3. What year are you?
  4. Diversity ...?

2. Data Acquisition and Cleaning¶

In DS100 we will study various methods to collect data.

To answer this question, I downloaded the course roster and extracted everyone's names and majors.

In [66]:
# pd stands for pandas, which we will learn next week
# some pandas syntax shared with data8's datascience package
majors = pd.read_csv("data/majors.csv")
names = pd.read_csv("data/names.csv")

3. Exploratory Data Analysis¶

In DS100 we will study exploratory data analysis and practice analyzing new datasets.

I didn't tell you the details of the data! Let's check out the data and infer its structure. Then we can start answering the simple questions we posed.

Peeking at the Data¶

In [67]:
majors.head(20)
Out[67]:
Majors Terms in Attendance
0 Environ Health Sciences PhD G
1 Molecular & Cell Biology BA 6
2 Applied Mathematics BA, Computer Science BA 8
3 Electrical Eng & Comp Sci BS 4
4 Public Health MPH G
5 Letters & Sci Undeclared UG 6
6 Computer Science BA, Data Science BA 6
7 Applied Mathematics BA, Computer Science BA 4
8 Chemical Biology BS, Computer Science BA 6
9 Electrical Eng & Comp Sci BS 4
10 Applied Mathematics BA 4
11 Letters & Sci Undeclared UG 4
12 Bioengineering BS 8
13 Letters & Sci Undeclared UG 8
14 Environ Econ & Policy BS, Letters & Sci Undecl... 6
15 Letters & Sci Undeclared UG 4
16 Letters & Sci Undeclared UG 4
17 Letters & Sci Undeclared UG, Nutritional Scien... 4
18 Letters & Sci Undeclared UG 6
19 Letters & Sci Undeclared UG 6
In [68]:
names.head()
Out[68]:
Name Role
0 Yue Student
1 Kevin Student
2 Ryan Student
3 JOHN Student
4 Alex Student

What is one potential issue we may need to address in this data?¶

Answer: Some names appear capitalized.

In the above sample we notice that some of the names are capitalized and some are not. This inconsistency will cause problems in our later analysis, so let's convert all names to lower case.

In [69]:
names['Name'] = names['Name'].str.lower()
In [70]:
names.head()
Out[70]:
Name Role
0 yue Student
1 kevin Student
2 ryan Student
3 john Student
4 alex Student

How many records do we have?¶

In [71]:
print(len(names))
print(len(majors))
1287
1287

Based on what we know of our class, each record is most likely a student.



Q: Is this big data (would you call this a "big class")?

This would not normally constitute big data ... however this is a common data size for a lot of data analysis tasks.

Is this a big class? YES!

Understanding the structure of data¶

It is important that we understand the meaning of each field and how the data is organized.

In [72]:
names.head()
Out[72]:
Name Role
0 yue Student
1 kevin Student
2 ryan Student
3 john Student
4 alex Student

Q: What is the meaning of the Role field?

A: Understanding the meaning of a field can often be achieved by looking at the types of data it contains (in particular, the counts of its unique values).

We use the value_counts() function in pandas:

In [73]:
names['Role'].value_counts().to_frame()  # counts of unique Roles
Out[73]:
Role
Student 1201
Waitlist Student 85
#REF! 1

It appears that one student has an erroneous role given as "#REF!". What else can we learn about this student? Let's see their name.

In [74]:
# boolean index to find rows where Role is #REF!
names[names['Role'] == "#REF!"]
Out[74]:
Name Role
211 #ref! #REF!

Though this single bad record won't have much of an impact on our analysis, we can clean our data by removing this record.

In [75]:
names = names[names['Role'] != "#REF!"]

Double check: Let's double check that our record removal only removed the single bad record.

In [76]:
names['Role'].value_counts().to_frame()  # again, counts of unique Roles
Out[76]:
Role
Student 1201
Waitlist Student 85

Remember we loaded in two files. Let's explore the fields of majors and check for bad records:

In [77]:
majors.columns   # get column names
Out[77]:
Index(['Majors', 'Terms in Attendance'], dtype='object')
In [78]:
majors['Terms in Attendance'].value_counts().to_frame()
Out[78]:
Terms in Attendance
4 540
6 360
8 176
G 134
7 34
2 19
5 17
3 6
#REF! 1

It looks like the numbers represent semesters in attendance and G represents graduate students. But we do still have a bad record:

In [79]:
majors[majors['Terms in Attendance'] == "#REF!"]
Out[79]:
Majors Terms in Attendance
513 #REF! #REF!
In [80]:
majors = majors[majors['Terms in Attendance'] != "#REF!"]
majors['Terms in Attendance'].value_counts().to_frame()
Out[80]:
Terms in Attendance
4 540
6 360
8 176
G 134
7 34
2 19
5 17
3 6

Detail: The index of the deleted majors record (513) is different from the index of the bad names record (211). So while the number of records in each table matches, the row indices no longer line up, and we'll have to keep these tables separate in order to do our analysis.
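A tiny, hypothetical illustration (toy tables, not the real roster) of how two equal-length tables can end up with mismatched row indices after cleaning:

```python
import pandas as pd

# two toy tables of the same length, each with one bad record in a different row
a = pd.DataFrame({'x': ['p', 'bad', 'q']})
b = pd.DataFrame({'y': ['r', 's', 'bad']})

# boolean filtering keeps the original row labels
a = a[a['x'] != 'bad']   # keeps index 0, 2
b = b[b['y'] != 'bad']   # keeps index 0, 1

print(len(a) == len(b))          # True: same number of records
print(a.index.equals(b.index))   # False: row indices no longer line up
```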

Summarizing the Data¶

We will often want to numerically or visually summarize the data. The describe() method provides a brief high level description of our data frame.

In [81]:
names.describe()
Out[81]:
Name Role
count 1286 1286
unique 961 2
top michael Student
freq 13 1201
In [82]:
majors.describe()
Out[82]:
Majors Terms in Attendance
count 1286 1286
unique 155 8
top Letters & Sci Undeclared UG 4
freq 543 540

Q: What do you think top and freq represent?

A: top: most frequent entry, freq: the frequency of that entry
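As a quick sanity check of `top` and `freq` on a hypothetical toy Series (not the course data):

```python
import pandas as pd

# describe() on a non-numeric column reports count, unique, top, and freq
s = pd.Series(['cs', 'cs', 'cs', 'math', 'stat'])
summary = s.describe()

print(summary['top'])    # the most frequent entry: 'cs'
print(summary['freq'])   # how often it appears: 3
```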


What are your majors?¶

What are the top majors:

In [83]:
majors_count = (       # method chaining in pandas
    majors['Majors']
    .value_counts()
    .sort_values(ascending=True) # lowest first
    .to_frame()
    .tail(20)          # get the top 20
)

# or, comment out to parse double majors
# majors_count = (
#     majors['Majors']
#     .str.split(", ") # get double majors
#     .explode()       # one major to every row
#     .value_counts()
#     .sort_values(ascending=True)
#     .to_frame()
#     .tail(20)
# )

majors_count
Out[83]:
Majors
Chemical Biology BS 7
Bioengineering BS 7
Psychology BA 7
Applied Mathematics BA, Computer Science BA 7
Business Administration BS 9
Chemical Engineering BS 10
Info Mgmt & Systems MIMS 10
Business Administration MBA 10
Environ Econ & Policy BS 10
Electrical Eng & Comp Sci MEng 13
Applied Mathematics BA 15
Molecular & Cell Biology BA 18
Public Health MPH 20
Cognitive Science BA 27
Civil Engineering BS 34
Electrical Eng & Comp Sci BS 53
Economics BA 59
Data Science BA 77
Computer Science BA 87
Letters & Sci Undeclared UG 543

We will often use visualizations to make sense of data¶

In DS100 we will deal with many different kinds of data (not just numbers) and we will study techniques to analyze diverse types of data.

How can we summarize the Majors field? A good starting point might be to use a bar plot:

In [84]:
# interactive using plotly
fig = px.bar(majors_count, orientation='h')
fig.update_layout(showlegend=False,
                  xaxis_title='Count',
                  yaxis_title='Major')

What year are you?¶

In [85]:
fig = px.histogram(majors['Terms in Attendance'].sort_values(),
                   histnorm='probability')
fig.update_layout(showlegend=False,
                  xaxis_title="Term",
                  yaxis_title="Fraction of Class")

Diversity and Data Science:¶

Unfortunately, surveys of data scientists suggest that there are far fewer women than men in data science.

To learn more check out the Kaggle Executive Summary or study the Raw Data.


What fraction of the students are female?¶

I actually get asked this question a lot as we try to improve the data science program at Berkeley.

This is actually a fairly complex question. What do we mean by female? Is this a question about the sex or gender identity of the students? They are not the same thing.

  • Sex refers predominantly to biological characteristics.
  • Gender is much more complex with societal and cultural implications and refers to how people identify themselves.

Most likely, my colleagues are interested in improving gender diversity, by ensuring that our program is inclusive. Let's reword this question:

Reworded: What is the gender diversity of our students?¶

How could we answer this question?¶

In [86]:
print(majors.columns)
print(names.columns)
Index(['Majors', 'Terms in Attendance'], dtype='object')
Index(['Name', 'Role'], dtype='object')

We don't have the data.¶

Where can we get the data?


(1) We could run a survey!¶

(2) ... or we could try to use the data we have to estimate the _sex_ of the students as a proxy for gender?!?!¶

Please do not attempt option (2) alone. What I am about to do is flawed in so many ways and we will discuss these flaws in a moment and throughout the semester.

However, it will illustrate some very basic inferential modeling and how we might combine multiple data sources to try and reason about something we haven't measured.

To attempt option (2), we will first look at a second data source.


US Social Security Data¶

To study what a name tells about a person, we will download data from the United States Social Security Administration containing the number of registered names broken down by year, sex, and name. This is often called the Baby Names Data, as Social Security numbers (SSNs) are typically assigned at birth.

1. What does a name tell us about a person?¶

A: In this demo we'll use a person's name to estimate their sex. But a person's name tells us many things (more on this later).

2. Acquire data programmatically¶

Note 1: In the following we download the data programmatically to ensure that the process is reproducible.

Note 2: We also load the data directly into Python without decompressing the zip file.

In DS100 we will think a bit more about how we can be efficient in our data analysis to support processing large datasets.

In [90]:
import urllib.request
import os.path

# Download data from the web directly
data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "babynames.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

        
# Load data without unzipping the file
import zipfile
babynames = [] 
with zipfile.ZipFile(local_filename, "r") as zf:
    data_files = [f for f in zf.filelist if f.filename[-3:] == "txt"]
    def extract_year_from_filename(fn):
        return int(fn[3:7])
    for f in data_files:
        year = extract_year_from_filename(f.filename)
        with zf.open(f) as fp:
            df = pd.read_csv(fp, names=["Name", "Sex", "Count"])
            df["Year"] = year
            babynames.append(df)
babynames = pd.concat(babynames)


babynames.head() # show the first few rows
Out[90]:
Name Sex Count Year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880

2 (cont). Understanding the Setting¶

In Data 100 you will have to learn about different data sources (and their limitations) on your own.

Reading from the Social Security Administration's description of the data:

All names are from Social Security card applications for births that occurred in the United States after 1879. Note that many people born before 1937 never applied for a Social Security card, so their names are not included in our data. For others who did apply, our records may not show the place of birth, and again their names are not included in our data.

To safeguard privacy, we exclude from our tabulated lists of names those that would indicate, or would allow the ability to determine, names with fewer than 5 occurrences in any geographic area. If a name has less than 5 occurrences for a year of birth in any state, the sum of the state counts for that year will be less than the national count.

All data are from a 100% sample of our records on Social Security card applications as of March 2022.

A little bit of data cleaning¶

Examining the data:

In [91]:
babynames
Out[91]:
Name Sex Count Year
0 Mary F 7065 1880
1 Anna F 2604 1880
2 Emma F 2003 1880
3 Elizabeth F 1939 1880
4 Minnie F 1746 1880
... ... ... ... ...
31532 Zyeire M 5 2021
31533 Zyel M 5 2021
31534 Zyian M 5 2021
31535 Zylar M 5 2021
31536 Zyn M 5 2021

2052781 rows × 4 columns

In our earlier analysis we converted names to lower case. We will do the same again here:

In [92]:
babynames['Name'] = babynames['Name'].str.lower()
babynames.head()
Out[92]:
Name Sex Count Year
0 mary F 7065 1880
1 anna F 2604 1880
2 emma F 2003 1880
3 elizabeth F 1939 1880
4 minnie F 1746 1880

3. Exploratory Data Analysis (and Visualization)¶

How many people does this data represent?

In [93]:
format(babynames['Count'].sum(), ',d') # sum of 'Count' column
Out[93]:
'361,888,233'
In [94]:
format(len(babynames), ',d')       # number of rows
Out[94]:
'2,052,781'

Q: Is this number low or high?

Answer

It seems low (the 2021 US population was 331.9 million). However the social security website states:

All names are from Social Security card applications for births that occurred in the United States after 1879. Note that many people born before 1937 never applied for a Social Security card, so their names are not included in our data. For others who did apply, our records may not show the place of birth, and again their names are not included in our data. All data are from a 100% sample of our records on Social Security card applications as of the end of February 2016.

Trying a simple query using a boolean index:

In [95]:
# how many Nora's were born in 2018?
babynames[(babynames['Name'] == 'nora') & (babynames['Year'] == 2018)]
Out[95]:
Name Sex Count Year
29 nora F 5833 2018
28527 nora M 7 2018

Trying a more complex query using query() (to be discussed next week):

In [96]:
# how many baby names contain the word "data"?
babynames.query('Name.str.contains("data")', engine='python')
Out[96]:
Name Sex Count Year
9765 kidata F 5 1975
24914 datavion M 5 1995
23613 datavious M 7 1997
12103 datavia F 7 2000
27509 datavion M 6 2001
28918 datari M 5 2001
29140 datavian M 5 2002
29141 datavious M 5 2002
30573 datavion M 5 2004
17140 datavia F 5 2005
31032 datavion M 5 2005
31023 datavion M 6 2006
33344 datavious M 5 2007
33345 datavius M 5 2007
33408 datavious M 5 2008
33091 datavion M 5 2009
32500 datavious M 5 2010

Temporal Patterns Conditioned on Male/Female¶

In DS100 we will study how to visualize and analyze relationships in data.

In this example we construct a pivot table which aggregates the number of babies registered for each year by Sex.

We'll discuss pivot tables in detail next week.

In [97]:
# counts number of M and F babies per year
year_sex = pd.pivot_table(
        babynames, 
        index=['Year'], # the row index
        columns=['Sex'], # the column values
        values='Count', # the field(s) to be processed in each group
        aggfunc=np.sum,
    )[["M", "F"]]

year_sex.head()
Out[97]:
Sex M F
Year
1880 110490 90994
1881 100737 91953
1882 113686 107847
1883 104625 112319
1884 114442 129019

We can visualize these descriptive statistics:

In [98]:
# more interactive using plotly
fig = px.line(year_sex)
fig.update_layout(title="Total Babies per Year",
                  yaxis_title="Number of Babies")

How many unique names for each year?¶

In [99]:
# counts number of M and F *names* per year
year_sex_unique = pd.pivot_table(babynames, 
        index=['Year'], 
        columns=['Sex'], 
        values='Name', 
        aggfunc=lambda x: len(np.unique(x)),
    )
fig = px.line(year_sex_unique)
fig.update_layout(title="Unique Names Per Year",
                  yaxis_title="Number of Baby Names")

Some observations:

  1. Registration data seems limited in the early 1900s, because many people did not register before 1937.
  2. You can see the Baby Boomers (born 1940s-1960s) and the Echo Boomers (aka millennials, 1980s to 2000).
  3. Female babies have a slightly greater diversity of names.

4. Understand the World: Prediction and Inference¶

Let's use the Baby Names dataset to estimate the fraction of female students in the class.

Compute the Proportion of Female Babies For Each Name¶

First, we construct a pivot table to compute the total number of babies registered for each Name, broken down by Sex.

In [100]:
# counts number of M and F babies per name
name_sex = pd.pivot_table(babynames, index='Name', columns='Sex', values='Count',
                            aggfunc='sum', fill_value=0., margins=True)
name_sex.head()
Out[100]:
Sex F M All
Name
aaban 0 120 120
aabha 51 0 51
aabid 0 16 16
aabidah 5 0 5
aabir 0 10 10

Second, we compute the proportion of female babies for each name. This is our estimated probability that a baby with that name is female:

$$ \hat{P}(\text{Female} \mid \text{Name}) = \frac{\text{Count}(\text{Female and Name})}{\text{Count}(\text{Name})} $$
In [101]:
prop_female = (name_sex['F'] / name_sex['All']).rename("Prop. Female")
prop_female.to_frame().head(10)
Out[101]:
Prop. Female
Name
aaban 0.0
aabha 1.0
aabid 0.0
aabidah 1.0
aabir 0.0
aabriella 1.0
aada 1.0
aadam 0.0
aadan 0.0
aadarsh 0.0

Test a few names¶

In [35]:
prop_female["lisa"]
Out[35]:
0.9971219500105333
In [103]:
prop_female["narges"]
Out[103]:
1.0
In [104]:
prop_female["josh"]
Out[104]:
0.0
In [105]:
prop_female["avery"]
Out[105]:
0.7058240944624468
In [106]:
prop_female["min"]
Out[106]:
0.37598736176935227
In [107]:
prop_female["pat"]
Out[107]:
0.600140600694029
In [108]:
prop_female["jaspreet"]
Out[108]:
0.6043956043956044

Next, Build a Simple Classifier (Model)¶

We can define a function to return the most likely Sex for a name. If there is an exact tie or the name does not appear in the Social Security dataset, the function returns Unknown.

In [109]:
def sex_from_name(name):
    lower_name = name.lower()
    if lower_name not in prop_female.index or prop_female[lower_name] == 0.5:
        return "Unknown"
    elif prop_female[lower_name] > 0.5:
        return "F"
    else:
        return "M"
In [110]:
sex_from_name("nora")
Out[110]:
'F'
In [111]:
sex_from_name("josh")
Out[111]:
'M'
In [112]:
sex_from_name("pat")
Out[112]:
'F'

4 (cont). Estimating the fraction of female and male students in DS100¶

Let's try out our simple classifier! We'll use the apply() function to classify each student name:

In [113]:
# apply sex_from_name to each student name
names['Pred. Sex'] = names['Name'].apply(sex_from_name)
px.bar(names['Pred. Sex'].value_counts()/len(names))

Interpreting the unknowns¶

That's a lot of Unknowns.

...But we can still estimate the fraction of female students in the class:

In [114]:
count_by_sex = names['Pred. Sex'].value_counts().to_frame()
count_by_sex
Out[114]:
Pred. Sex
M 591
F 513
Unknown 182
In [115]:
count_by_sex.loc['F']/(count_by_sex.loc['M'] + count_by_sex.loc['F'])
Out[115]:
Pred. Sex    0.464674
dtype: float64

Questions:

  1. How do we feel about this estimate?
  2. Do we trust it?




Q: What fraction of students in Data 100 this semester have names in the SSN dataset?

In [116]:
print("Fraction of names in the babynames data:", 
      names['Name'].isin(prop_female.index).mean())
Fraction of names in the babynames data: 0.8584758942457231

Q: Which names are not in the dataset?

Why might these names not appear?

In [117]:
# the tilde ~ negates the boolean index. More next week.
names[~names['Name'].isin(prop_female.index)].sample(10)
Out[117]:
Name Role Pred. Sex
468 sauhard Student Unknown
521 zilong Waitlist Student Unknown
1166 zcjanin Student Unknown
640 kamer Student Unknown
1122 limi Student Unknown
324 peiying Student Unknown
884 arunava Student Unknown
938 zeming Student Unknown
1017 xainab Student Unknown
1273 inho Student Unknown

Using simulation to estimate uncertainty¶

Previously we treated a name which is given to females 40% of the time as a "Male" name, because the probability was less than 0.5. This doesn't capture our uncertainty.

We can use simulation to provide a better distributional estimate. We'll use 50% for names not in the Baby Names dataset.

In [118]:
# add the computed SSN F proportion to each row. 0.5 for Unknowns.
# merge() effectively "join"s two tables together. to be covered next week.
names['Prop. Female'] = (
    names[['Name']].merge(prop_female, how='left', left_on='Name', 
                          right_index=True)['Prop. Female']
        .fillna(0.5)
)
names.head(10)
Out[118]:
Name Role Pred. Sex Prop. Female
0 yue Student F 0.532609
1 kevin Student M 0.004508
2 ryan Student M 0.026558
3 john Student M 0.004200
4 alex Student M 0.032599
5 cael Waitlist Student M 0.003687
6 angela Student F 0.996816
7 michael Student M 0.004941
8 sean Student M 0.006971
9 andrew Student M 0.003750

Running the simulation¶

In [119]:
# if a randomly picked number from [0.0, 1.0) is under the Female proportion, then F
names['Sim. Female'] = np.random.rand(len(names)) < names['Prop. Female']
names.tail(20)
Out[119]:
Name Role Pred. Sex Prop. Female Sim. Female
1267 amy Student F 0.997355 True
1268 nithurhan Student Unknown 0.500000 True
1269 harpreet Student F 0.504970 False
1270 jaquelyn Student F 1.000000 True
1271 shujie Student Unknown 0.500000 True
1272 latifa Student F 1.000000 True
1273 inho Student Unknown 0.500000 True
1274 malavika Student F 1.000000 True
1275 mihir Student M 0.000000 False
1276 sanik Student Unknown 0.500000 False
1277 cindy Student F 0.997003 True
1278 raine Student F 0.843631 True
1279 allison Waitlist Student F 0.987817 True
1280 caleb Student M 0.002063 False
1281 robin Waitlist Student F 0.864324 True
1282 sam Student M 0.011800 False
1283 shivani Student F 1.000000 True
1284 harjot Student M 0.245509 False
1285 harshil Student M 0.000000 False
1286 zora Student F 0.997671 True

Given such a simulation, we can compute the fraction of the class that is female.

  1. How do we feel about this new estimate?
  2. Do we trust it?
In [120]:
# proportion of Trues in the 'Sim. Female' column
names['Sim. Female'].mean()
Out[120]:
0.49144634525660963

Now that we're performing a simulation, the above proportion is random: it depends on the random numbers we picked to determine whether a student was Female.

Let's run the above simulation several times and see what the distribution of this Female proportion is. The below cell may take a few seconds to run.

In [121]:
# simulate the class once and return the fraction of female students
def simulate_class(students):
    is_female = students['Prop. Female'] > np.random.rand(len(students))
    return np.mean(is_female)

sim_frac_female = np.array([simulate_class(names) for _ in range(10000)])
In [122]:
fig = ff.create_distplot([sim_frac_female], ['Fraction Female'], bin_size=0.0025, show_rug=False)
fig.update_layout(xaxis_title='Prop. Female',
                  yaxis_title='Percentage',
                  title='Distribution of Simulated Proportions of Females in the Class')
ax = sns.histplot(sim_frac_female, stat='probability', kde=True, bins=20)
sns.rugplot(sim_frac_female, ax=ax)
ax.set_xlabel("Fraction Female")
ax.set_title('Distribution of Simulated Fractions Female in the Class');

In DS100 we will study Kernel Density Estimates, Rug Plots, and other visualization techniques.





Limitations of Baby Names dataset¶

UC Berkeley teaches students from around the world.¶

We saw with our Simple Classifier that many student names were classified as "Unknown," often because they weren't in the SSN Baby Names Dataset.

Recall the SSN dataset:

All names are from Social Security card applications for births that occurred in the United States after 1879.

That statement is not reflective of all of our students!!

In [123]:
# students who were not in the SSN Baby Names Dataset
names[~names['Name'].isin(prop_female.index)].sample(10)
Out[123]:
Name Role Pred. Sex Prop. Female Sim. Female
782 hongbi Student Unknown 0.5 False
1119 sungsoo Waitlist Student Unknown 0.5 False
384 seoyoung Student Unknown 0.5 True
822 zhiqi Student Unknown 0.5 False
533 buyankhuu Student Unknown 0.5 False
965 weihao Student Unknown 0.5 True
1019 huaxiao Student Unknown 0.5 False
259 aswinkarthik Student Unknown 0.5 True
87 saiteja Student Unknown 0.5 True
778 filhaq Student Unknown 0.5 True

Names change over time.¶

Using data from 1879 (or even 1937) does not represent the diversity and context of U.S. baby names today.

Here are some choice names to show you how the distribution of particular names has varied with time:

In [124]:
subset_names = ["edris", "jamie", "jordan", "leslie", "taylor", "willie"]
subset_babynames_year = (pd.pivot_table(
                    babynames[babynames['Name'].isin(subset_names)],
                    index=['Name', 'Year'], columns='Sex', values='Count',
                    aggfunc='sum', fill_value=0, margins=True)
                 .drop(labels='All', level=0, axis=0) # drop cumulative row
                 .rename_axis(None, axis=1) # remove pivot table col name
                 .reset_index() # move (name, year) back into columns
                 .assign(Propf=lambda df: df.F / (df.F + df.M))  # proportion female
                )
ax = sns.lineplot(data=subset_babynames_year,
                  x='Year', y='Propf', hue='Name')
ax.set_title("Ratio of Female Babies over Time for Select Names")
ax.set_ylabel("Proportion of Female Names in a Year")
ax.legend(loc="lower left");

Bonus: How we selected which names to plot¶

Curious as to how we got the above names? We picked out two types of names:

  • names that had a high variability in F/M naming over years
  • common names that had an average F/M ratio over a set threshold

Check it out:

In [125]:
"""
get a subset of names that:
    have had propf above a threshold, as well as
    have been counted for more than a certain number of years
Note: while we could do our analysis over all names,
    it turns out many names don't matter.
    So to save computation power, we just work
    with a subset of names we know may be candidates.
"""
# these are thresholds we set as data analysts
propf_min = 0.2
propf_max = 0.8
year_thresh = 30

propf_countyear = (babynames
                   .groupby('Name').count()
                   .merge(prop_female.to_frame(), on='Name')
                   .rename(columns={'Prop. Female': 'Propf'})
                   .query("@propf_min < Propf < @propf_max & Year > @year_thresh & Name != 'All'")
                  )[['Propf', 'Year']]
propf_countyear
Out[125]:
Propf Year
Name
aalijah 0.366995 39
aamari 0.377953 33
aaren 0.244367 73
aarin 0.283472 76
aarion 0.223926 58
... ... ...
zephyr 0.236254 74
ziah 0.746377 43
ziyan 0.446475 35
zohar 0.649057 32
zyan 0.201788 50

1602 rows × 2 columns

In [126]:
# construct a pivot table of (name, year) to count
keep_names = propf_countyear.reset_index()['Name']
name_year_sex = (pd.pivot_table(
                    babynames[babynames['Name'].isin(keep_names)],
                    index=['Name', 'Year'], columns='Sex', values='Count',
                    aggfunc='sum', fill_value=0, margins=True)
                 .drop(labels='All', level=0, axis=0) # drop cumulative row
                 .rename_axis(None, axis=1) # remove pivot table col name
                 .reset_index() # move (name, year) back into columns
                 .assign(Propf=lambda df: df.F / (df.F + df.M))  # proportion female
                )
name_year_sex
Out[126]:
Name Year F M All Propf
0 aalijah 1994 5 0 5 1.000000
1 aalijah 1995 5 0 5 1.000000
2 aalijah 1999 5 0 5 1.000000
3 aalijah 2001 9 0 9 1.000000
4 aalijah 2002 9 6 15 0.600000
... ... ... ... ... ... ...
87522 zyan 2017 9 78 87 0.103448
87523 zyan 2018 6 88 94 0.063830
87524 zyan 2019 11 87 98 0.112245
87525 zyan 2020 6 86 92 0.065217
87526 zyan 2021 8 104 112 0.071429

87527 rows × 6 columns

In [127]:
"""
Compute two statistics per name:
- Count of number of babies with name
- Variance of proportion of females
  (i.e., how much the proportion of females varied
  across different years)
"""
names_to_include = 40
group_names =  (name_year_sex
                       .groupby('Name')
                       .agg({'Propf': 'var', 'All': 'sum'})
                       .rename(columns={'Propf': 'Propf Var', 'All': 'Total'})
                       .reset_index()
                      )
In [128]:
# pick some high variance names
high_variance_names = (group_names
                       .sort_values('Propf Var', ascending=False)
                       .head(names_to_include)
                       .sort_values('Total', ascending=False)
                      )

high_variance_names.head(5)
Out[128]:
Name Propf Var Total
20 aime 0.239362 2253
946 linzy 0.227941 1509
964 luan 0.252438 1185
460 edris 0.227961 1033
945 linzie 0.246861 900
In [129]:
# pick some common names
common_names = (group_names
                .sort_values('Total', ascending=False)
                .head(names_to_include)
               )
common_names.head(10)
Out[129]:
Name Propf Var Total
1557 willie 0.026535 595724
718 jordan 0.012061 520518
1456 taylor 0.109349 437595
934 leslie 0.147521 381471
643 jamie 0.020710 356210
71 angel 0.032230 343815
921 lee 0.005340 294340
696 jessie 0.027282 279340
1009 marion 0.017952 261017
351 dana 0.059373 245740