import numpy as np
import pandas as pd
import plotly.express as px


import urllib.request
import os.path
import zipfile

# Download the SSA "names by state" archive once, then load the California file.
data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    # Download under a temporary name and rename only after the write finishes.
    # Otherwise an interrupted download would leave a truncated archive that the
    # existence check above would accept as complete on every later run.
    partial_filename = local_filename + ".part"
    with urllib.request.urlopen(data_url) as resp, open(partial_filename, 'wb') as f:
        f.write(resp.read())
    os.replace(partial_filename, local_filename)

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
# Both the archive and the member file are closed as soon as the CSV is parsed.
# `zf` stays bound afterwards but refers to a closed archive.
with zipfile.ZipFile(local_filename, 'r') as zf, zf.open(ca_name) as fh:
    # header=None: the first line is read as data, so column names are supplied here.
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()


# Ask yourself: why is :9 the correct slice to select the first 10 rows?
# (Hint: .loc slices by label, and a label slice includes its end point.)
babynames_first_10_rows = babynames.loc[:9, :]

babynames_first_10_rows


# Notice how we have exactly 10 elements in our boolean array argument
# Rows whose matching entry is True are kept; the rest are dropped
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]


# First, use a logical condition to generate a boolean Series
# The comparison is applied to every row of the "Sex" column
logical_operator = babynames["Sex"] == "F"
logical_operator

0          True
1          True
2          True
3          True
4          True
          ...  
400757    False
400758    False
400759    False
400760    False
400761    False
Name: Sex, Length: 400762, dtype: bool


# Then, use this boolean Series to filter the DataFrame
babynames[logical_operator]


# Notice that we did not have to specify columns to select 
# If no columns are referenced, pandas will automatically select all columns
babynames.loc[babynames["Sex"] == "F"]


# Combine two conditions with & (element-wise AND)
# Each condition needs its own parentheses because & binds more tightly than == and <
babynames[(babynames["Sex"] == "F") & (babynames["Year"] < 2000)]


# Note: The parentheses surrounding the code make it possible to break the code onto multiple lines for readability
# | is element-wise OR: keep rows whose name matches any of the four names

(
    babynames[(babynames["Name"] == "Bella") | 
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Ani") |
              (babynames["Name"] == "Lisa")]
)


# A more concise method to achieve the above: .isin
names = ["Bella", "Alex", "Ani", "Lisa"]
babynames[babynames["Name"].isin(names)]


# What if we only want names that start with "N"?
# The .str accessor applies the string method to every entry of the column
babynames[babynames["Name"].str.startswith("N")]


# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
# Assigning to a column label that does not exist yet creates the column
babynames["name_lengths"] = babyname_lengths

babynames


# Modify the "name_lengths" column to be one less than its original value
# Assigning to an existing column label overwrites that column
babynames["name_lengths"] = babynames["name_lengths"]-1
babynames


# Rename "name_lengths" to "Length"
# The result of rename is assigned back to babynames so the new label is kept
babynames = babynames.rename(columns={"name_lengths":"Length"})
babynames


import numpy as np
# & on two boolean NumPy arrays is element-wise AND:
# the result is True only where both arrays are True
a = np.array([True, True, False])
b = np.array([True, False, False])
a & b

array([ True, False, False])


# Remove our new "Length" column
# The result of drop is assigned back to babynames so the column stays removed
babynames = babynames.drop("Length", axis = "columns")
babynames


# Select the rows for the name "Bella", then keep only their "Count" values
bella_counts = babynames[babynames["Name"] == "Bella"]["Count"]

# Average number of babies named Bella each year
# (more precisely: the mean over the rows for Bella; each row is one year/sex combination)

np.mean(bella_counts)

270.1860465116279


# Max number of babies named Bella born in any single year
# np.max is used instead of the Python builtin max: the builtin walks the Series
# one element at a time, and np.max matches the np.mean call used for the average.

np.max(bella_counts)

902


# Return the shape of the object, in the format (num_rows, num_columns)
babynames.shape

(400762, 5)


# Return the total number of entries in the object, equal to num_rows * num_columns
babynames.size

2003810


# What summary statistics can we describe?
babynames.describe()


# Our statistics are slightly different when working with a Series
babynames["Sex"].describe()

count     400762
unique         2
top            F
freq      235791
Name: Sex, dtype: object


# Randomly sample row(s) from the DataFrame
babynames.sample()


# Rerun this cell a few times – you'll get different results!
# Draws 5 rows; no seed is passed, so the selection changes between runs
babynames.sample(5)


# Sampling with replacement
# Keep only the rows from the year 2000, then draw 4 of them;
# with replace = True the same row may be drawn more than once
babynames[babynames["Year"] == 2000].sample(4, replace = True)


# Count the number of times each unique value occurs in a Series
babynames["Name"].value_counts()

Jean         221
Francis      219
Guadalupe    216
Jessie       215
Marion       213
            ... 
Janin          1
Jilliann       1
Jomayra        1
Karess         1
Zyrus          1
Name: Name, Length: 20239, dtype: int64


# Return an array of all unique values in the Series
babynames["Name"].unique()

array(['Mary', 'Helen', 'Dorothy', ..., 'Zyire', 'Zylo', 'Zyrus'],
      dtype=object)


# Sort a Series
babynames["Name"].sort_values()

380256      Aadan
362255      Aadan
365374      Aadan
394460    Aadarsh
366561      Aaden
           ...   
232144      Zyrah
217415      Zyrah
197519      Zyrah
220674      Zyrah
400761      Zyrus
Name: Name, Length: 400762, dtype: object


# Sort a DataFrame – there are lots of Michaels in California
babynames.sort_values(by = "Count", ascending = False)


# The code below uses the full babynames dataset, which is why some numbers are different relative to the diagram
# "Count" is selected before aggregating: summing the string columns (State, Sex, Name)
# is not meaningful, and recent pandas no longer drops such columns silently.
# Aggregations are named by string ("sum", "min", "max") because passing the Python
# builtins sum/min/max to .agg is deprecated in recent pandas.
babynames.groupby("Year")[["Count"]].agg("sum")


# What is the earliest year in which each name appeared?
babynames.groupby("Name")[["Year"]].agg("min")


# What is the largest single-year count of each name?
babynames.groupby("Name")[["Count"]].agg("max")


# Restrict the case study to the rows recorded with sex "F"
f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames


# We sort the data by year
# Later cells take the last row for a name with .iloc[-1] and rely on this ordering
f_babynames = f_babynames.sort_values("Year")
f_babynames


# We'll talk about how to generate plots in Lecture 7
# Line chart of the yearly count for the name "Jennifer"
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18)


# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(f_babynames[f_babynames["Name"] == "Jennifer"]["Count"])
max_jenn

6065


# Remember that we sorted f_babynames by year. 
# This means that grabbing the final entry gives us the most recent count of Jennifers: 91
# In 2021, the most recent year for which we have data, 91 Jennifers were born
curr_jenn = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
curr_jenn

91


# Compute the RTP
curr_jenn / max_jenn

0.015004122011541632


def ratio_to_peak(series):
    """
    Return the ratio-to-peak (RTP) for one name's yearly counts.

    The RTP is the last value of `series` divided by the largest value of
    `series`. Callers are expected to pass counts ordered by year, so that
    the last value is the most recent one.
    """
    most_recent = series.iloc[-1]
    peak = np.max(series)
    return most_recent / peak


# Construct a Series containing our Jennifer count data
jenn_counts_ser = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]

# Then, find the RTP
ratio_to_peak(jenn_counts_ser)

0.015004122011541632


ratio_to_peak(f_babynames[f_babynames["Name"] == "Dominic"]["Count"])

0.5454545454545454


# Aggregate only the numeric columns. ratio_to_peak divides one value by another,
# which fails on the string columns "State" and "Sex"; recent pandas raises an
# error in that case instead of silently dropping those columns.
# The "Year" RTP is always 1.0: f_babynames is sorted by year, so the last year
# of every name is also its largest year.
rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table

# Note: If this cell crashes, comment out the code and use f_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak) instead

/tmp/ipykernel_719/2995088074.py:1: FutureWarning:

['State', 'Sex'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.


# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak)
rtp_table


# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns = {"Count": "Count RTP"})
rtp_table


# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")


def plot_name(*names):
    """
    Build a line chart of yearly counts for the given names.

    Rows are taken from `f_babynames` where "Name" is one of `names`; the
    lines are colored by name. Returns the result of `px.line`.
    """
    selected_rows = f_babynames[f_babynames["Name"].isin(names)]
    chart_title = f"Popularity for: {names}"
    return px.line(selected_rows, x="Year", y="Count", color="Name",
                   title=chart_title)

plot_name("Debra")


# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10

Index(['Debra', 'Susan', 'Debbie', 'Cheryl', 'Carol', 'Tammy', 'Terri',
       'Shannon', 'Deborah', 'Carolyn'],
      dtype='object', name='Name')


plot_name(*top10)


# Recall that we are only considering name entries marked with sex "F"
plot_name("Bella", "Dominic")


# Select by integer position: the rows at positions 0, 233 and 484,
# and the columns at positions 3 and 4
babynames.iloc[[0, 233, 484], [3, 4]]


# Select by label: the rows labeled 0, 233 and 484, with all columns
babynames.loc[[0, 233, 484]]


# Rows whose Count exceeds 250, keeping only "Name" and "Count"; show the first 3
babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].head(3)


# Same filter, then the first 2 rows by position (an iloc slice excludes its end)
babynames.loc[babynames["Count"]>250, ["Name", "Count"]].iloc[0:2, :]


# Total count for each name, summed over every row with that name
babynames.groupby("Name")[["Count"]].sum()


# Total count for each year, summed over every row with that year
babynames.groupby("Year")[["Count"]].sum()

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134
5	CA	F	1910	Ruth	128
6	CA	F	1910	Evelyn	126
7	CA	F	1910	Alice	118
8	CA	F	1910	Virginia	101
9	CA	F	1910	Elizabeth	93

Symbol	Usage	Meaning
~	~p	Returns negation of p
|	p | q	p OR q
&	p & q	p AND q
^	p ^ q	p XOR q (exclusive or)

	Year	Count
count	400762.000000	400762.000000
mean	1985.131287	79.953781
std	26.821004	295.414618
min	1910.000000	5.000000
25%	1968.000000	7.000000
50%	1991.000000	13.000000
75%	2007.000000	38.000000
max	2021.000000	8262.000000

	State	Sex	Year	Name	Count
240444	CA	M	1922	Randall	8
18310	CA	F	1941	Jacklyn	14
79985	CA	F	1978	China	7
125679	CA	F	1993	June	12
360575	CA	M	2008	Ramiro	113

	Count
Year
1910	9163
1911	9983
1912	17946
1913	22094
1914	26926
...	...
2017	410835
2018	395151
2019	386504
2020	362180
2021	359997

Pandas, Part II

Dataset: California baby names

Conditional Selection

Adding, Removing, and Modifying Columns

Useful Utility Functions

`NumPy`

Built-In `pandas` Methods

Grouping

Case Study: Name "Popularity"

Slido Exercises

	State	Sex	Year	Name	Count
6289	CA	F	1923	Bella	5
7512	CA	F	1925	Bella	8
12368	CA	F	1932	Lisa	5
14741	CA	F	1936	Lisa	8
17084	CA	F	1939	Lisa	5
...	...	...	...	...	...
386576	CA	M	2017	Alex	482
389498	CA	M	2018	Alex	494
392360	CA	M	2019	Alex	436
395230	CA	M	2020	Alex	378
398031	CA	M	2021	Alex	331

	State	Sex	Year	Name	Count
76	CA	F	1910	Norma	23
83	CA	F	1910	Nellie	20
127	CA	F	1910	Nina	11
198	CA	F	1910	Nora	6
310	CA	F	1911	Nellie	23
...	...	...	...	...	...
400648	CA	M	2021	Nirvan	5
400649	CA	M	2021	Nivin	5
400650	CA	M	2021	Nolen	5
400651	CA	M	2021	Nomar	5
400652	CA	M	2021	Nyles	5

	State	Sex	Year	Name	Count
152417	CA	F	2000	Deven	5
151039	CA	F	2000	Rosy	11
151543	CA	F	2000	Kimberlyn	8
152643	CA	F	2000	Mikela	5

	State	Sex	Year	Name	Count
263272	CA	M	1956	Michael	8262
264297	CA	M	1957	Michael	8250
313644	CA	M	1990	Michael	8247
278109	CA	M	1969	Michael	8244
279405	CA	M	1970	Michael	8197
...	...	...	...	...	...
159967	CA	F	2002	Arista	5
159966	CA	F	2002	Arisbeth	5
159965	CA	F	2002	Arisa	5
159964	CA	F	2002	Arionna	5
400761	CA	M	2021	Zyrus	5

	Year
Name
Aadan	2008
Aadarsh	2019
Aaden	2007
Aadhav	2014
Aadhira	2017
...	...
Zymir	2020
Zyon	1999
Zyra	2012
Zyrah	2011
Zyrus	2021

	Year	Count
Name
Aadhira	1.0	0.700000
Aadhya	1.0	0.580000
Aadya	1.0	0.724138
Aahana	1.0	0.192308
Aahna	1.0	1.000000
...	...	...
Zyanya	1.0	0.857143
Zyla	1.0	1.000000
Zylah	1.0	1.000000
Zyra	1.0	1.000000
Zyrah	1.0	0.833333

	Count RTP
Name
Debra	0.001260
Susan	0.002034
Debbie	0.002817
Cheryl	0.003273
Carol	0.003635
...	...
Jovi	1.000000
Neta	1.000000
Doni	1.000000
Dondi	1.000000
Kela	1.000000

Pandas, Part II

Dataset: California baby names

Conditional Selection

Adding, Removing, and Modifying Columns

Useful Utility Functions

NumPy

Built-In pandas Methods

Grouping

Case Study: Name "Popularity"

Slido Exercises

`NumPy`

Built-In `pandas` Methods