By Josh Hug and Narges Norouzi
Advanced Pandas syntax and aggregation.
import numpy as np
import pandas as pd
import plotly.express as px
Let's load the California baby names again.
import urllib.request
import os.path
import zipfile
data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
f.write(resp.read())
zf = zipfile.ZipFile(local_filename, 'r')
ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
babynames = pd.read_csv(fh, header=None, names=field_names)
babynames.head()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
# Ask yourself: why is :9 is the correct slice to select the first 10 rows?
babynames_first_10_rows = babynames.loc[:9, :]
babynames_first_10_rows
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
5 | CA | F | 1910 | Ruth | 128 |
6 | CA | F | 1910 | Evelyn | 126 |
7 | CA | F | 1910 | Alice | 118 |
8 | CA | F | 1910 | Virginia | 101 |
9 | CA | F | 1910 | Elizabeth | 93 |
# Notice how we have exactly 10 elements in our boolean array argument
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
2 | CA | F | 1910 | Dorothy | 220 |
4 | CA | F | 1910 | Frances | 134 |
6 | CA | F | 1910 | Evelyn | 126 |
8 | CA | F | 1910 | Virginia | 101 |
# First, use a logical condition to generate a boolean array
logical_operator = (babynames["Sex"] == "F")
logical_operator
0 True 1 True 2 True 3 True 4 True ... 400757 False 400758 False 400759 False 400760 False 400761 False Name: Sex, Length: 400762, dtype: bool
# Then, use this boolean array to filter the DataFrame
babynames[logical_operator]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
235786 | CA | F | 2021 | Zarahi | 5 |
235787 | CA | F | 2021 | Zelia | 5 |
235788 | CA | F | 2021 | Zenobia | 5 |
235789 | CA | F | 2021 | Zeppelin | 5 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
Boolean array selection also works with loc
!
babynames.loc[babynames["Sex"] == "F"]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
235786 | CA | F | 2021 | Zarahi | 5 |
235787 | CA | F | 2021 | Zelia | 5 |
235788 | CA | F | 2021 | Zenobia | 5 |
235789 | CA | F | 2021 | Zeppelin | 5 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
babynames[(babynames["Sex"] == "F") & (babynames["Year"] < 2000)]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
149044 | CA | F | 1999 | Zareen | 5 |
149045 | CA | F | 1999 | Zeinab | 5 |
149046 | CA | F | 1999 | Zhane | 5 |
149047 | CA | F | 1999 | Zoha | 5 |
149048 | CA | F | 1999 | Zoila | 5 |
149049 rows × 5 columns
(
babynames[(babynames["Name"] == "Bella") |
(babynames["Name"] == "Alex") |
(babynames["Name"] == "Ani") |
(babynames["Name"] == "Lisa")]
)
# Note: The parentheses surrounding the code make it possible to break the code on to multiple lines for readability
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
6289 | CA | F | 1923 | Bella | 5 |
7512 | CA | F | 1925 | Bella | 8 |
12368 | CA | F | 1932 | Lisa | 5 |
14741 | CA | F | 1936 | Lisa | 8 |
17084 | CA | F | 1939 | Lisa | 5 |
... | ... | ... | ... | ... | ... |
386576 | CA | M | 2017 | Alex | 482 |
389498 | CA | M | 2018 | Alex | 494 |
392360 | CA | M | 2019 | Alex | 436 |
395230 | CA | M | 2020 | Alex | 378 |
398031 | CA | M | 2021 | Alex | 331 |
359 rows × 5 columns
names = ["Bella", "Alex", "Ani", "Lisa"]
babynames[babynames["Name"].isin(names)]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
6289 | CA | F | 1923 | Bella | 5 |
7512 | CA | F | 1925 | Bella | 8 |
12368 | CA | F | 1932 | Lisa | 5 |
14741 | CA | F | 1936 | Lisa | 8 |
17084 | CA | F | 1939 | Lisa | 5 |
... | ... | ... | ... | ... | ... |
386576 | CA | M | 2017 | Alex | 482 |
389498 | CA | M | 2018 | Alex | 494 |
392360 | CA | M | 2019 | Alex | 436 |
395230 | CA | M | 2020 | Alex | 378 |
398031 | CA | M | 2021 | Alex | 331 |
359 rows × 5 columns
babynames[babynames["Name"].str.startswith("N")]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
76 | CA | F | 1910 | Norma | 23 |
83 | CA | F | 1910 | Nellie | 20 |
127 | CA | F | 1910 | Nina | 11 |
198 | CA | F | 1910 | Nora | 6 |
310 | CA | F | 1911 | Nellie | 23 |
... | ... | ... | ... | ... | ... |
400648 | CA | M | 2021 | Nirvan | 5 |
400649 | CA | M | 2021 | Nivin | 5 |
400650 | CA | M | 2021 | Nolen | 5 |
400651 | CA | M | 2021 | Nomar | 5 |
400652 | CA | M | 2021 | Nyles | 5 |
11994 rows × 5 columns
bella_counts = babynames[babynames["Name"] == "Bella"]["Count"]
bella_counts
6289 5 7512 8 35477 5 54487 7 58451 6 68845 6 73387 5 93601 5 96397 5 108054 7 111276 8 114677 10 117991 14 121524 17 125545 13 128946 18 132163 31 136362 15 139366 28 142917 27 146251 39 149607 65 153241 97 156955 122 160707 191 164586 213 168557 310 172646 334 176836 384 181090 439 185287 699 189455 902 193562 777 197554 761 201650 807 205629 704 209653 645 213592 643 217451 719 221207 747 224905 720 228576 566 232200 494 Name: Count, dtype: int64
# Average number of babies named Bella each year
np.mean(bella_counts)
270.1860465116279
# Max number of babies named Bella born on a given year
max(bella_counts)
902
babynames
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
400757 | CA | M | 2021 | Zyan | 5 |
400758 | CA | M | 2021 | Zyion | 5 |
400759 | CA | M | 2021 | Zyire | 5 |
400760 | CA | M | 2021 | Zylo | 5 |
400761 | CA | M | 2021 | Zyrus | 5 |
400762 rows × 5 columns
babynames.shape
(400762, 5)
babynames.size
2003810
babynames.describe()
Year | Count | |
---|---|---|
count | 400762.000000 | 400762.000000 |
mean | 1985.131287 | 79.953781 |
std | 26.821004 | 295.414618 |
min | 1910.000000 | 5.000000 |
25% | 1968.000000 | 7.000000 |
50% | 1991.000000 | 13.000000 |
75% | 2007.000000 | 38.000000 |
max | 2021.000000 | 8262.000000 |
babynames["Sex"].describe()
count 400762 unique 2 top F freq 235791 Name: Sex, dtype: object
babynames.sample()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
102846 | CA | F | 1987 | Olivia | 240 |
babynames.sample(5).iloc[:, 2:]
Year | Name | Count | |
---|---|---|---|
319933 | 1992 | Ramses | 12 |
115952 | 1990 | Deeanna | 5 |
121139 | 1992 | Annabel | 27 |
30168 | 1952 | Rosa | 170 |
192668 | 2010 | Cristy | 6 |
babynames[babynames["Year"] == 2000].sample(4, replace = True).iloc[:, 2:]
Year | Name | Count | |
---|---|---|---|
339038 | 2000 | Kristopher | 108 |
149530 | 2000 | Emilia | 82 |
341134 | 2000 | Patric | 5 |
152125 | 2000 | Lindy | 6 |
babynames["Name"].value_counts()
Jean 221 Francis 219 Guadalupe 216 Jessie 215 Marion 213 ... Janin 1 Jilliann 1 Jomayra 1 Karess 1 Zyrus 1 Name: Name, Length: 20239, dtype: int64
babynames["Name"].unique()
array(['Mary', 'Helen', 'Dorothy', ..., 'Zyire', 'Zylo', 'Zyrus'], dtype=object)
babynames["Name"].sort_values()
380256 Aadan 362255 Aadan 365374 Aadan 394460 Aadarsh 366561 Aaden ... 232144 Zyrah 217415 Zyrah 197519 Zyrah 220674 Zyrah 400761 Zyrus Name: Name, Length: 400762, dtype: object
babynames.sort_values(by = "Count", ascending = False)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
263272 | CA | M | 1956 | Michael | 8262 |
264297 | CA | M | 1957 | Michael | 8250 |
313644 | CA | M | 1990 | Michael | 8247 |
278109 | CA | M | 1969 | Michael | 8244 |
279405 | CA | M | 1970 | Michael | 8197 |
... | ... | ... | ... | ... | ... |
159967 | CA | F | 2002 | Arista | 5 |
159966 | CA | F | 2002 | Arisbeth | 5 |
159965 | CA | F | 2002 | Arisa | 5 |
159964 | CA | F | 2002 | Arionna | 5 |
400761 | CA | M | 2021 | Zyrus | 5 |
400762 rows × 5 columns
Note: the outer parentheses in the code below aren't strictly necessary, but they make it valid syntax to break the chained method calls in separate lines, which helps readability. The example below finds the top 5 most popular names in California in 2021.
# Sort names by count in year 2021
(
babynames[babynames["Year"] == 2021]
.sort_values("Count", ascending = False)
.head()
)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
397909 | CA | M | 2021 | Noah | 2591 |
397910 | CA | M | 2021 | Liam | 2469 |
232145 | CA | F | 2021 | Olivia | 2395 |
232146 | CA | F | 2021 | Emma | 2171 |
397911 | CA | M | 2021 | Mateo | 2108 |
babynames.sort_values("Name", ascending = False)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
400761 | CA | M | 2021 | Zyrus | 5 |
197519 | CA | F | 2011 | Zyrah | 5 |
232144 | CA | F | 2020 | Zyrah | 5 |
217415 | CA | F | 2016 | Zyrah | 5 |
220674 | CA | F | 2017 | Zyrah | 6 |
... | ... | ... | ... | ... | ... |
360532 | CA | M | 2008 | Aaden | 135 |
394460 | CA | M | 2019 | Aadarsh | 6 |
380256 | CA | M | 2014 | Aadan | 5 |
362255 | CA | M | 2008 | Aadan | 7 |
365374 | CA | M | 2009 | Aadan | 6 |
400762 rows × 5 columns
# Here, a lambda function is applied to find the length of each value, `x`, in the "Name" column
babynames.sort_values("Name", key=lambda x: x.str.len(), ascending = False).head(5)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
313143 | CA | M | 1989 | Franciscojavier | 6 |
333732 | CA | M | 1997 | Ryanchristopher | 5 |
330421 | CA | M | 1996 | Franciscojavier | 8 |
323615 | CA | M | 1993 | Johnchristopher | 5 |
310235 | CA | M | 1988 | Franciscojavier | 10 |
# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()
# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames.head(5)
State | Sex | Year | Name | Count | name_lengths | |
---|---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 | 4 |
1 | CA | F | 1910 | Helen | 239 | 5 |
2 | CA | F | 1910 | Dorothy | 220 | 7 |
3 | CA | F | 1910 | Margaret | 163 | 8 |
4 | CA | F | 1910 | Frances | 134 | 7 |
# Sort by the temporary column
babynames = babynames.sort_values(by = "name_lengths", ascending=False)
babynames.head(5)
State | Sex | Year | Name | Count | name_lengths | |
---|---|---|---|---|---|---|
313143 | CA | M | 1989 | Franciscojavier | 6 | 15 |
333732 | CA | M | 1997 | Ryanchristopher | 5 | 15 |
330421 | CA | M | 1996 | Franciscojavier | 8 | 15 |
323615 | CA | M | 1993 | Johnchristopher | 5 | 15 |
310235 | CA | M | 1988 | Franciscojavier | 10 | 15 |
# Drop the `name_length` column
babynames = babynames.drop("name_lengths", axis = 'columns')
babynames.head(5)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
313143 | CA | M | 1989 | Franciscojavier | 6 |
333732 | CA | M | 1997 | Ryanchristopher | 5 |
330421 | CA | M | 1996 | Franciscojavier | 8 |
323615 | CA | M | 1993 | Johnchristopher | 5 |
310235 | CA | M | 1988 | Franciscojavier | 10 |
We can also use the Python map function if we want to use an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurences of "ea".
# First, define a function to count the number of times "dr" or "ea" appear in each name
def dr_ea_count(string):
return string.count('dr') + string.count('ea')
# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)
# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames = babynames.sort_values(by = "dr_ea_count", ascending=False)
babynames.head()
State | Sex | Year | Name | Count | dr_ea_count | |
---|---|---|---|---|---|---|
101969 | CA | F | 1986 | Deandrea | 6 | 3 |
304390 | CA | M | 1985 | Deandrea | 6 | 3 |
131022 | CA | F | 1994 | Leandrea | 5 | 3 |
115950 | CA | F | 1990 | Deandrea | 5 | 3 |
108723 | CA | F | 1988 | Deandrea | 5 | 3 |
# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis = 'columns')
babynames.head(5)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
101969 | CA | F | 1986 | Deandrea | 6 |
304390 | CA | M | 1985 | Deandrea | 6 |
131022 | CA | F | 1994 | Leandrea | 5 |
115950 | CA | F | 1990 | Deandrea | 5 |
108723 | CA | F | 1988 | Deandrea | 5 |
In this exercise, let's find the female name whose popularity has dropped the most since its peak. As an example of a name that has fallen into disfavor, consider "Jennifer", visualized below.
Note: We won't cover plotly in lecture until after Lisa covers EDA and Regex.
Since we're only working with female names, let's create a DataFrame with only female names to simplify our later code.
female_babynames = babynames[babynames["Sex"] == "F"]
female_babynames
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
101969 | CA | F | 1986 | Deandrea | 6 |
131022 | CA | F | 1994 | Leandrea | 5 |
115950 | CA | F | 1990 | Deandrea | 5 |
108723 | CA | F | 1988 | Deandrea | 5 |
204117 | CA | F | 2013 | Alexandrea | 8 |
... | ... | ... | ... | ... | ... |
211639 | CA | F | 2015 | Aislin | 11 |
211803 | CA | F | 2015 | Daliah | 10 |
211642 | CA | F | 2015 | Alizay | 11 |
211807 | CA | F | 2015 | Divine | 10 |
208703 | CA | F | 2014 | Chloey | 6 |
235791 rows × 5 columns
fig = px.line(female_babynames[female_babynames["Name"] == "Jennifer"],
x = "Year", y = "Count")
fig.update_layout(font_size = 18)