import numpy as np
import pandas as pd
import plotly.express as px


import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()


# Ask yourself: why is :9 is the correct slice to select the first 10 rows?
babynames_first_10_rows = babynames.loc[:9, :]

babynames_first_10_rows


# Notice how we have exactly 10 elements in our boolean array argument
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]


# First, use a logical condition to generate a boolean array
logical_operator = (babynames["Sex"] == "F")
logical_operator

0          True
1          True
2          True
3          True
4          True
          ...  
400757    False
400758    False
400759    False
400760    False
400761    False
Name: Sex, Length: 400762, dtype: bool


# Then, use this boolean array to filter the DataFrame
babynames[logical_operator]


babynames.loc[babynames["Sex"] == "F"]


babynames[(babynames["Sex"] == "F") & (babynames["Year"] < 2000)]


(
    babynames[(babynames["Name"] == "Bella") | 
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Ani") |
              (babynames["Name"] == "Lisa")]
)
# Note: The parentheses surrounding the code make it possible to break the code on to multiple lines for readability


names = ["Bella", "Alex", "Ani", "Lisa"]
babynames[babynames["Name"].isin(names)]


babynames[babynames["Name"].str.startswith("N")]


bella_counts = babynames[babynames["Name"] == "Bella"]["Count"]
bella_counts

6289        5
7512        8
35477       5
54487       7
58451       6
68845       6
73387       5
93601       5
96397       5
108054      7
111276      8
114677     10
117991     14
121524     17
125545     13
128946     18
132163     31
136362     15
139366     28
142917     27
146251     39
149607     65
153241     97
156955    122
160707    191
164586    213
168557    310
172646    334
176836    384
181090    439
185287    699
189455    902
193562    777
197554    761
201650    807
205629    704
209653    645
213592    643
217451    719
221207    747
224905    720
228576    566
232200    494
Name: Count, dtype: int64


# Average number of babies named Bella each year

np.mean(bella_counts)

270.1860465116279


# Max number of babies named Bella born on a given year

max(bella_counts)

902


babynames


babynames.shape

(400762, 5)


babynames.size

2003810


babynames.describe()


babynames["Sex"].describe()

count     400762
unique         2
top            F
freq      235791
Name: Sex, dtype: object


babynames.sample()


babynames.sample(5).iloc[:, 2:]


babynames[babynames["Year"] == 2000].sample(4, replace = True).iloc[:, 2:]


babynames["Name"].value_counts()

Jean         221
Francis      219
Guadalupe    216
Jessie       215
Marion       213
            ... 
Janin          1
Jilliann       1
Jomayra        1
Karess         1
Zyrus          1
Name: Name, Length: 20239, dtype: int64


babynames["Name"].unique()

array(['Mary', 'Helen', 'Dorothy', ..., 'Zyire', 'Zylo', 'Zyrus'],
      dtype=object)


babynames["Name"].sort_values()

380256      Aadan
362255      Aadan
365374      Aadan
394460    Aadarsh
366561      Aaden
           ...   
232144      Zyrah
217415      Zyrah
197519      Zyrah
220674      Zyrah
400761      Zyrus
Name: Name, Length: 400762, dtype: object


babynames.sort_values(by = "Count", ascending = False)


# Sort names by count in year 2021
(
    babynames[babynames["Year"] == 2021]
    .sort_values("Count", ascending = False)
    .head()
)


babynames.sort_values("Name", ascending = False)


# Here, a lambda function is applied to find the length of each value, `x`, in the "Name" column

babynames.sort_values("Name", key=lambda x: x.str.len(), ascending = False).head(5)


# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames.head(5)


# Sort by the temporary column
babynames = babynames.sort_values(by = "name_lengths", ascending=False)
babynames.head(5)


# Drop the `name_length` column
babynames = babynames.drop("name_lengths", axis = 'columns')
babynames.head(5)


# First, define a function to count the number of times "dr" or "ea" appear in each name
def dr_ea_count(string):
    return string.count('dr') + string.count('ea')

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames = babynames.sort_values(by = "dr_ea_count", ascending=False)
babynames.head()


# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis = 'columns')
babynames.head(5)


female_babynames = babynames[babynames["Sex"] == "F"]
female_babynames


fig = px.line(female_babynames[female_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18)


female_babynames = female_babynames.sort_values(["Year", "Count"])
female_babynames.head()


fig = px.line(female_babynames[female_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18)


max_jenn = max(female_babynames[female_babynames["Name"] == "Jennifer"]["Count"])
max_jenn

6065


curr_jenn = female_babynames[female_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
curr_jenn

91


curr_jenn / max_jenn

0.015004122011541632


def ratio_to_peak(series):
    return series.iloc[-1] / max(series)


jenn_counts_ser = female_babynames[female_babynames["Name"] == "Jennifer"]["Count"]
ratio_to_peak(jenn_counts_ser)

0.015004122011541632


ratio_to_peak(female_babynames[female_babynames["Name"] == "Jessica"]["Count"])

0.01524741081703107


%%time
# Build dictionary where entry i is the ammd for the given name
# e.g. rtps["jennifer"] should be 0.01500
rtps = {}
for name in female_babynames["Name"].unique()[0:100]:
    counts_of_current_name = female_babynames[female_babynames["Name"] == name]["Count"]
    if counts_of_current_name.size > 0:
        rtps[name] = ratio_to_peak(counts_of_current_name)
    
# Convert to series
rtps = pd.Series(rtps) 
rtps

CPU times: user 1.51 s, sys: 4.14 ms, total: 1.51 s
Wall time: 1.51 s

Althea      0.285714
Adrienne    0.141509
Peggy       0.015674
Mable       0.192308
Lydia       0.567460
              ...   
Matilda     0.736264
Sadie       0.549261
Gloria      0.074777
Helene      0.138889
Minnie      0.121951
Length: 100, dtype: float64


rtps.sort_values()

Carol      0.003635
Carolyn    0.005753
Janet      0.010283
Shirley    0.014567
Peggy      0.015674
             ...   
Faye       0.913793
Nora       0.972973
Goldie     1.000000
Iris       1.000000
Estelle    1.000000
Length: 100, dtype: float64


female_babynames.head()


%%time
rtp_table = female_babynames.groupby("Name").agg(ratio_to_peak)
rtp_table
# Note: If this cell crashes, comment out the code and use the female_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak) instead

CPU times: user 818 ms, sys: 3.76 ms, total: 821 ms
Wall time: 821 ms

<timed exec>:1: FutureWarning:

['State', 'Sex'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.


rtp_table = female_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak)
rtp_table


# This code renames the Count column to RTP. You'll see this syntax in lab
rtp_table = rtp_table.rename(columns = {"Count": "Count RTP"})
rtp_table


rtp_table.sort_values("Count RTP")


def plot_name(*names):
    return px.line(female_babynames[female_babynames["Name"].isin(names)], 
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")

plot_name("Debra")


top10 = rtp_table.sort_values("Count RTP").head(10).index
top10

Index(['Debra', 'Susan', 'Debbie', 'Cheryl', 'Carol', 'Tammy', 'Terri',
       'Shannon', 'Deborah', 'Carolyn'],
      dtype='object', name='Name')


plot_name(*top10)


plot_name("Lucia", "Jennifer")


puzzle1 = female_babynames.groupby("Name")[["Count"]].agg(sum)
puzzle1


puzzle2 = female_babynames.groupby("Year")[["Count"]].agg(sum)
puzzle2


puzzle2 = female_babynames.groupby("Year")[["Count"]].sum()


fig = px.line(puzzle2, y = "Count")
fig.update_layout(font_size = 15)


# Not covered in lecture but we can also compute relative births
relative_births = puzzle2 / max(puzzle2["Count"])


fig = px.line(relative_births, y = "Count")
fig.update_layout(font_size = 15)

	Year	Count
count	400762.000000	400762.000000
mean	1985.131287	79.953781
std	26.821004	295.414618
min	1910.000000	5.000000
25%	1968.000000	7.000000
50%	1991.000000	13.000000
75%	2007.000000	38.000000
max	2021.000000	8262.000000

	Year	Name	Count
319933	1992	Ramses	12
115952	1990	Deeanna	5
121139	1992	Annabel	27
30168	1952	Rosa	170
192668	2010	Cristy	6

	State	Sex	Year	Name	Count
263272	CA	M	1956	Michael	8262
264297	CA	M	1957	Michael	8250
313644	CA	M	1990	Michael	8247
278109	CA	M	1969	Michael	8244
279405	CA	M	1970	Michael	8197
...	...	...	...	...	...
159967	CA	F	2002	Arista	5
159966	CA	F	2002	Arisbeth	5
159965	CA	F	2002	Arisa	5
159964	CA	F	2002	Arionna	5
400761	CA	M	2021	Zyrus	5

	State	Sex	Year	Name	Count
313143	CA	M	1989	Franciscojavier	6
333732	CA	M	1997	Ryanchristopher	5
330421	CA	M	1996	Franciscojavier	8
323615	CA	M	1993	Johnchristopher	5
310235	CA	M	1988	Franciscojavier	10

	State	Sex	Year	Name	Count
313143	CA	M	1989	Franciscojavier	6
333732	CA	M	1997	Ryanchristopher	5
330421	CA	M	1996	Franciscojavier	8
323615	CA	M	1993	Johnchristopher	5
310235	CA	M	1988	Franciscojavier	10

Introduction to Pandas, Part 2¶

Dataset - California baby names¶

Conditional Selection¶

An alternate approach is to create a temporary column corresponding to the length¶

Female Name whose popularity has dropped the most.¶

Approach 1: Naive For Loop¶

Approach 2: Use groupby.agg¶

Some Additional Groupby Puzzles¶

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134

	State	Sex	Year	Name	Count
6289	CA	F	1923	Bella	5
7512	CA	F	1925	Bella	8
12368	CA	F	1932	Lisa	5
14741	CA	F	1936	Lisa	8
17084	CA	F	1939	Lisa	5
...	...	...	...	...	...
386576	CA	M	2017	Alex	482
389498	CA	M	2018	Alex	494
392360	CA	M	2019	Alex	436
395230	CA	M	2020	Alex	378
398031	CA	M	2021	Alex	331

	State	Sex	Year	Name	Count
76	CA	F	1910	Norma	23
83	CA	F	1910	Nellie	20
127	CA	F	1910	Nina	11
198	CA	F	1910	Nora	6
310	CA	F	1911	Nellie	23
...	...	...	...	...	...
400648	CA	M	2021	Nirvan	5
400649	CA	M	2021	Nivin	5
400650	CA	M	2021	Nolen	5
400651	CA	M	2021	Nomar	5
400652	CA	M	2021	Nyles	5

	Year	Name	Count
339038	2000	Kristopher	108
149530	2000	Emilia	82
341134	2000	Patric	5
152125	2000	Lindy	6

	State	Sex	Year	Name	Count
397909	CA	M	2021	Noah	2591
397910	CA	M	2021	Liam	2469
232145	CA	F	2021	Olivia	2395
232146	CA	F	2021	Emma	2171
397911	CA	M	2021	Mateo	2108

	State	Sex	Year	Name	Count	dr_ea_count
101969	CA	F	1986	Deandrea	6	3
304390	CA	M	1985	Deandrea	6	3
131022	CA	F	1994	Leandrea	5	3
115950	CA	F	1990	Deandrea	5	3
108723	CA	F	1988	Deandrea	5	3

	State	Sex	Year	Name	Count
206	CA	F	1910	Althea	5
205	CA	F	1910	Adrienne	5
227	CA	F	1910	Peggy	5
224	CA	F	1910	Mable	5
223	CA	F	1910	Lydia	5

	Year	Count
Name
Aadhira	1.0	0.700000
Aadhya	1.0	0.580000
Aadya	1.0	0.724138
Aahana	1.0	0.192308
Aahna	1.0	1.000000
...	...	...
Zyanya	1.0	0.857143
Zyla	1.0	1.000000
Zylah	1.0	1.000000
Zyra	1.0	1.000000
Zyrah	1.0	0.833333

	Count RTP
Name
Debra	0.001260
Susan	0.002034
Debbie	0.002817
Cheryl	0.003273
Carol	0.003635
...	...
Jovi	1.000000
Neta	1.000000
Doni	1.000000
Dondi	1.000000
Kela	1.000000

	Count
Year
1910	5950
1911	6602
1912	9804
1913	11860
1914	13815
...	...
2017	195466
2018	189066
2019	183982
2020	173428
2021	172693