Adapted from Josh Hug and Narges Norouzi
Updated by Bella Crouch
A demonstration of advanced pandas
syntax and aggregation to accompany Lecture 3.
import numpy as np
import pandas as pd
import plotly.express as px
In today's lecture, we'll work with the babynames
dataset, which contains information about the names of infants born in California.
The cell below pulls census data from a government website, then loads it into a usable form. The code shown here is outside of the scope of Data 100, but you're encouraged to dig into it if you are interested!
import urllib.request
import os.path
import zipfile
data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "babynamesbystate.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
f.write(resp.read())
zf = zipfile.ZipFile(local_filename, 'r')
ca_name = 'CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
babynames = pd.read_csv(fh, header=None, names=field_names)
babynames.head()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
# Ask yourself: why is :9 is the correct slice to select the first 10 rows?
babynames_first_10_rows = babynames.loc[:9, :]
babynames_first_10_rows
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
5 | CA | F | 1910 | Ruth | 128 |
6 | CA | F | 1910 | Evelyn | 126 |
7 | CA | F | 1910 | Alice | 118 |
8 | CA | F | 1910 | Virginia | 101 |
9 | CA | F | 1910 | Elizabeth | 93 |
By passing in a sequence (list, array, or Series
) of boolean values, we can extract a subset of the rows in a DataFrame
. We will keep only the rows that correspond to a boolean value of True
.
# Notice how we have exactly 10 elements in our boolean array argument
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
2 | CA | F | 1910 | Dorothy | 220 |
4 | CA | F | 1910 | Frances | 134 |
6 | CA | F | 1910 | Evelyn | 126 |
8 | CA | F | 1910 | Virginia | 101 |
Oftentimes, we'll use boolean selection to check for entries in a DataFrame
that meet a particular condition.
# First, use a logical condition to generate a boolean Series
logical_operator = babynames["Sex"] == "F"
logical_operator
0 True 1 True 2 True 3 True 4 True ... 400757 False 400758 False 400759 False 400760 False 400761 False Name: Sex, Length: 400762, dtype: bool
# Then, use this boolean Series to filter the DataFrame
babynames[logical_operator]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
235786 | CA | F | 2021 | Zarahi | 5 |
235787 | CA | F | 2021 | Zelia | 5 |
235788 | CA | F | 2021 | Zenobia | 5 |
235789 | CA | F | 2021 | Zeppelin | 5 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
Boolean selection also works with loc
!
# Notice that we did not have to specify columns to select
# If no columns are referenced, pandas will automatically select all columns
babynames.loc[babynames["Sex"] == "F"]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
235786 | CA | F | 2021 | Zarahi | 5 |
235787 | CA | F | 2021 | Zelia | 5 |
235788 | CA | F | 2021 | Zenobia | 5 |
235789 | CA | F | 2021 | Zeppelin | 5 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
To filter on multiple conditions, we combine boolean operators using bitwise comparisons.
Symbol | Usage | Meaning |
---|---|---|
~ | ~p | Returns negation of p |
| | p | q | p OR q |
& | p & q | p AND q |
^ | p ^ q | p XOR q (exclusive or) |
babynames[(babynames["Sex"] == "F") & (babynames["Year"] < 2000)]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
149044 | CA | F | 1999 | Zareen | 5 |
149045 | CA | F | 1999 | Zeinab | 5 |
149046 | CA | F | 1999 | Zhane | 5 |
149047 | CA | F | 1999 | Zoha | 5 |
149048 | CA | F | 1999 | Zoila | 5 |
149049 rows × 5 columns
# Note: The parentheses surrounding the code make it possible to break the code on to multiple lines for readability
(
babynames[(babynames["Name"] == "Bella") |
(babynames["Name"] == "Alex") |
(babynames["Name"] == "Ani") |
(babynames["Name"] == "Lisa")]
)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
6289 | CA | F | 1923 | Bella | 5 |
7512 | CA | F | 1925 | Bella | 8 |
12368 | CA | F | 1932 | Lisa | 5 |
14741 | CA | F | 1936 | Lisa | 8 |
17084 | CA | F | 1939 | Lisa | 5 |
... | ... | ... | ... | ... | ... |
386576 | CA | M | 2017 | Alex | 482 |
389498 | CA | M | 2018 | Alex | 494 |
392360 | CA | M | 2019 | Alex | 436 |
395230 | CA | M | 2020 | Alex | 378 |
398031 | CA | M | 2021 | Alex | 331 |
359 rows × 5 columns
# A more concise method to achieve the above: .isin
names = ["Bella", "Alex", "Ani", "Lisa"]
babynames[babynames["Name"].isin(names)]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
6289 | CA | F | 1923 | Bella | 5 |
7512 | CA | F | 1925 | Bella | 8 |
12368 | CA | F | 1932 | Lisa | 5 |
14741 | CA | F | 1936 | Lisa | 8 |
17084 | CA | F | 1939 | Lisa | 5 |
... | ... | ... | ... | ... | ... |
386576 | CA | M | 2017 | Alex | 482 |
389498 | CA | M | 2018 | Alex | 494 |
392360 | CA | M | 2019 | Alex | 436 |
395230 | CA | M | 2020 | Alex | 378 |
398031 | CA | M | 2021 | Alex | 331 |
359 rows × 5 columns
# What if we only want names that start with "N"?
babynames[babynames["Name"].str.startswith("N")]
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
76 | CA | F | 1910 | Norma | 23 |
83 | CA | F | 1910 | Nellie | 20 |
127 | CA | F | 1910 | Nina | 11 |
198 | CA | F | 1910 | Nora | 6 |
310 | CA | F | 1911 | Nellie | 23 |
... | ... | ... | ... | ... | ... |
400648 | CA | M | 2021 | Nirvan | 5 |
400649 | CA | M | 2021 | Nivin | 5 |
400650 | CA | M | 2021 | Nolen | 5 |
400651 | CA | M | 2021 | Nomar | 5 |
400652 | CA | M | 2021 | Nyles | 5 |
11994 rows × 5 columns
To add a column, use []
to reference the desired new column, then assign it to a Series or array of appropriate length.
# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()
# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames
State | Sex | Year | Name | Count | name_lengths | |
---|---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 | 4 |
1 | CA | F | 1910 | Helen | 239 | 5 |
2 | CA | F | 1910 | Dorothy | 220 | 7 |
3 | CA | F | 1910 | Margaret | 163 | 8 |
4 | CA | F | 1910 | Frances | 134 | 7 |
... | ... | ... | ... | ... | ... | ... |
400757 | CA | M | 2021 | Zyan | 5 | 4 |
400758 | CA | M | 2021 | Zyion | 5 | 5 |
400759 | CA | M | 2021 | Zyire | 5 | 5 |
400760 | CA | M | 2021 | Zylo | 5 | 4 |
400761 | CA | M | 2021 | Zyrus | 5 | 5 |
400762 rows × 6 columns
To modify a column, use []
to access the desired column, then re-assign it to a new array or Series.
# Modify the “name_lengths” column to be one less than its original value
babynames["name_lengths"] = babynames["name_lengths"]-1
babynames
State | Sex | Year | Name | Count | name_lengths | |
---|---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 | 3 |
1 | CA | F | 1910 | Helen | 239 | 4 |
2 | CA | F | 1910 | Dorothy | 220 | 6 |
3 | CA | F | 1910 | Margaret | 163 | 7 |
4 | CA | F | 1910 | Frances | 134 | 6 |
... | ... | ... | ... | ... | ... | ... |
400757 | CA | M | 2021 | Zyan | 5 | 3 |
400758 | CA | M | 2021 | Zyion | 5 | 4 |
400759 | CA | M | 2021 | Zyire | 5 | 4 |
400760 | CA | M | 2021 | Zylo | 5 | 3 |
400761 | CA | M | 2021 | Zyrus | 5 | 4 |
400762 rows × 6 columns
Rename a column using the .rename()
method.
# Rename “name_lengths” to “Length”
babynames = babynames.rename(columns={"name_lengths":"Length"})
babynames
State | Sex | Year | Name | Count | Length | |
---|---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 | 3 |
1 | CA | F | 1910 | Helen | 239 | 4 |
2 | CA | F | 1910 | Dorothy | 220 | 6 |
3 | CA | F | 1910 | Margaret | 163 | 7 |
4 | CA | F | 1910 | Frances | 134 | 6 |
... | ... | ... | ... | ... | ... | ... |
400757 | CA | M | 2021 | Zyan | 5 | 3 |
400758 | CA | M | 2021 | Zyion | 5 | 4 |
400759 | CA | M | 2021 | Zyire | 5 | 4 |
400760 | CA | M | 2021 | Zylo | 5 | 3 |
400761 | CA | M | 2021 | Zyrus | 5 | 4 |
400762 rows × 6 columns
Remove a column using .drop()
.
import numpy as np
a = np.array([True, True, False])
b = np.array([True, False, False])
a & b
array([ True, False, False])
# Remove our new "Length" column
babynames = babynames.drop("Length", axis = "columns")
babynames
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
400757 | CA | M | 2021 | Zyan | 5 |
400758 | CA | M | 2021 | Zyion | 5 |
400759 | CA | M | 2021 | Zyire | 5 |
400760 | CA | M | 2021 | Zylo | 5 |
400761 | CA | M | 2021 | Zyrus | 5 |
400762 rows × 5 columns
NumPy
¶The NumPy
functions you encountered in Data 8 are compatible with objects in pandas
.
bella_counts = babynames[babynames["Name"] == "Bella"]["Count"]
# Average number of babies named Bella each year
np.mean(bella_counts)
270.1860465116279
# Max number of babies named Bella born in any single year
max(bella_counts)
902
pandas
Methods¶There are many, many utility functions built into pandas
, far more than we can possibly cover in lecture. You are encouraged to explore all the functionality outlined in the pandas
documentation.
# Return the shape of the object, in the format (num_rows, num_columns)
babynames.shape
(400762, 5)
# Return the total number of entries in the object, equal to num_rows * num_columns
babynames.size
2003810
# What summary statistics can we describe?
babynames.describe()
Year | Count | |
---|---|---|
count | 400762.000000 | 400762.000000 |
mean | 1985.131287 | 79.953781 |
std | 26.821004 | 295.414618 |
min | 1910.000000 | 5.000000 |
25% | 1968.000000 | 7.000000 |
50% | 1991.000000 | 13.000000 |
75% | 2007.000000 | 38.000000 |
max | 2021.000000 | 8262.000000 |
# Our statistics are slightly different when working with a Series
babynames["Sex"].describe()
count 400762 unique 2 top F freq 235791 Name: Sex, dtype: object
# Randomly sample row(s) from the DataFrame
babynames.sample()
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
109970 | CA | F | 1989 | Lia | 32 |
# Rerun this cell a few times – you'll get different results!
babynames.sample(5)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
240444 | CA | M | 1922 | Randall | 8 |
18310 | CA | F | 1941 | Jacklyn | 14 |
79985 | CA | F | 1978 | China | 7 |
125679 | CA | F | 1993 | June | 12 |
360575 | CA | M | 2008 | Ramiro | 113 |
# Sampling with replacement
babynames[babynames["Year"] == 2000].sample(4, replace = True)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
152417 | CA | F | 2000 | Deven | 5 |
151039 | CA | F | 2000 | Rosy | 11 |
151543 | CA | F | 2000 | Kimberlyn | 8 |
152643 | CA | F | 2000 | Mikela | 5 |
# Count the number of times each unique value occurs in a Series
babynames["Name"].value_counts()
Jean 221 Francis 219 Guadalupe 216 Jessie 215 Marion 213 ... Janin 1 Jilliann 1 Jomayra 1 Karess 1 Zyrus 1 Name: Name, Length: 20239, dtype: int64
# Return an array of all unique values in the Series
babynames["Name"].unique()
array(['Mary', 'Helen', 'Dorothy', ..., 'Zyire', 'Zylo', 'Zyrus'], dtype=object)
# Sort a Series
babynames["Name"].sort_values()
380256 Aadan 362255 Aadan 365374 Aadan 394460 Aadarsh 366561 Aaden ... 232144 Zyrah 217415 Zyrah 197519 Zyrah 220674 Zyrah 400761 Zyrus Name: Name, Length: 400762, dtype: object
# Sort a DataFrame – there are lots of Michaels in California
babynames.sort_values(by = "Count", ascending = False)
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
263272 | CA | M | 1956 | Michael | 8262 |
264297 | CA | M | 1957 | Michael | 8250 |
313644 | CA | M | 1990 | Michael | 8247 |
278109 | CA | M | 1969 | Michael | 8244 |
279405 | CA | M | 1970 | Michael | 8197 |
... | ... | ... | ... | ... | ... |
159967 | CA | F | 2002 | Arista | 5 |
159966 | CA | F | 2002 | Arisbeth | 5 |
159965 | CA | F | 2002 | Arisa | 5 |
159964 | CA | F | 2002 | Arionna | 5 |
400761 | CA | M | 2021 | Zyrus | 5 |
400762 rows × 5 columns
Group rows that share a common feature, then aggregate data across the group.
In this example, we count the total number of babies born in each year (considering only a small subset of the data, for simplicity).
# The code below uses the full babynames dataset, which is why some numbers are different relative to the diagram
babynames.groupby("Year").agg(sum)
Count | |
---|---|
Year | |
1910 | 9163 |
1911 | 9983 |
1912 | 17946 |
1913 | 22094 |
1914 | 26926 |
... | ... |
2017 | 410835 |
2018 | 395151 |
2019 | 386504 |
2020 | 362180 |
2021 | 359997 |
112 rows × 1 columns
There are many different aggregation functions we can use, all of which are useful in different applications.
# What is the earliest year in which each name appeared?
babynames.groupby("Name")[["Year"]].agg(min)
Year | |
---|---|
Name | |
Aadan | 2008 |
Aadarsh | 2019 |
Aaden | 2007 |
Aadhav | 2014 |
Aadhira | 2017 |
... | ... |
Zymir | 2020 |
Zyon | 1999 |
Zyra | 2012 |
Zyrah | 2011 |
Zyrus | 2021 |
20239 rows × 1 columns
# What is the largest single-year count of each name?
babynames.groupby("Name")[["Count"]].agg(max)
Count | |
---|---|
Name | |
Aadan | 7 |
Aadarsh | 6 |
Aaden | 158 |
Aadhav | 8 |
Aadhira | 10 |
... | ... |
Zymir | 5 |
Zyon | 15 |
Zyra | 16 |
Zyrah | 6 |
Zyrus | 5 |
20239 rows × 1 columns
In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage. We'll start by filtering babynames
to only include names corresponding to sex "F".
f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
1 | CA | F | 1910 | Helen | 239 |
2 | CA | F | 1910 | Dorothy | 220 |
3 | CA | F | 1910 | Margaret | 163 |
4 | CA | F | 1910 | Frances | 134 |
... | ... | ... | ... | ... | ... |
235786 | CA | F | 2021 | Zarahi | 5 |
235787 | CA | F | 2021 | Zelia | 5 |
235788 | CA | F | 2021 | Zenobia | 5 |
235789 | CA | F | 2021 | Zeppelin | 5 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
# We sort the data by year
f_babynames = f_babynames.sort_values("Year")
f_babynames
State | Sex | Year | Name | Count | |
---|---|---|---|---|---|
0 | CA | F | 1910 | Mary | 295 |
148 | CA | F | 1910 | Merle | 9 |
149 | CA | F | 1910 | Rosalie | 9 |
150 | CA | F | 1910 | Rosie | 9 |
151 | CA | F | 1910 | Teresa | 9 |
... | ... | ... | ... | ... | ... |
233364 | CA | F | 2021 | Mariajose | 22 |
233365 | CA | F | 2021 | Mariela | 22 |
233366 | CA | F | 2021 | May | 22 |
233340 | CA | F | 2021 | Dariana | 22 |
235790 | CA | F | 2021 | Zoraya | 5 |
235791 rows × 5 columns
To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.
# We'll talk about how to generate plots in Lecture 7
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
x = "Year", y = "Count")
fig.update_layout(font_size = 18)
We'll need a mathematical definition for the change in popularity of a name.
Define the metric "ratio to peak" (RTP). We'll calculate this as the count of the name in 2021 (the most recent year for which we have data) divided by the largest count of this name in any year.
A demo calculation for Jennifer:
# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(f_babynames[f_babynames["Name"] == "Jennifer"]["Count"])
max_jenn
6065
# Remember that we sorted f_babynames by year.
# This means that grabbing the final entry gives us the most recent count of Jennifers: 91
# In 2021, the most recent year for which we have data, 91 Jennifers were born
curr_jenn = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
curr_jenn
91
# Compute the RTP
curr_jenn / max_jenn
0.015004122011541632
We can also write a function that produces the ratio_to_peak
for a given Series
. This will allow us to use .groupby
to speed up our computation for all names in the dataset.
def ratio_to_peak(series):
"""
Compute the RTP for a Series containing the counts per year for a single name
"""
return series.iloc[-1] / np.max(series)
# Construct a Series containing our Jennifer count data
jenn_counts_ser = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]
# Then, find the RTP
ratio_to_peak(jenn_counts_ser)
0.015004122011541632
We can try out various names in the cell below. Try finding the RTP for your name!
ratio_to_peak(f_babynames[f_babynames["Name"] == "Dominic"]["Count"])
0.5454545454545454
Now, let's use .groupby
to compute the RTPs for all names in the dataset.
You may see a warning message when running the cell below. As discussed in lecture, pandas
can't apply an aggregation function to non-numeric data (it doens't make sense to divide "CA" by a number). By default, .groupby
will drop any columns that cannot be aggregated.
rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
rtp_table
# Note: If this cell crashes, comment out the code and use the female_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak) instead
/tmp/ipykernel_719/2995088074.py:1: FutureWarning: ['State', 'Sex'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
Year | Count | |
---|---|---|
Name | ||
Aadhira | 1.0 | 0.700000 |
Aadhya | 1.0 | 0.580000 |
Aadya | 1.0 | 0.724138 |
Aahana | 1.0 | 0.192308 |
Aahna | 1.0 | 1.000000 |
... | ... | ... |
Zyanya | 1.0 | 0.857143 |
Zyla | 1.0 | 1.000000 |
Zylah | 1.0 | 1.000000 |
Zyra | 1.0 | 1.000000 |
Zyrah | 1.0 | 0.833333 |
13661 rows × 2 columns
This is the pandas
equivalent of .group
from Data 8. If we wanted to achieve this same result using the datascience
library, we would write:
f_babynames.group("Name", ratio_to_peak)
To avoid the warning message above, we explicitly extract only the columns relevant to our analysis before using .agg
.
# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak)
rtp_table
Count | |
---|---|
Name | |
Aadhira | 0.700000 |
Aadhya | 0.580000 |
Aadya | 0.724138 |
Aahana | 0.192308 |
Aahna | 1.000000 |
... | ... |
Zyanya | 0.857143 |
Zyla | 1.000000 |
Zylah | 1.000000 |
Zyra | 1.000000 |
Zyrah | 0.833333 |
13661 rows × 1 columns
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns = {"Count": "Count RTP"})
rtp_table
Count RTP | |
---|---|
Name | |
Aadhira | 0.700000 |
Aadhya | 0.580000 |
Aadya | 0.724138 |
Aahana | 0.192308 |
Aahna | 1.000000 |
... | ... |
Zyanya | 0.857143 |
Zyla | 1.000000 |
Zylah | 1.000000 |
Zyra | 1.000000 |
Zyrah | 0.833333 |
13661 rows × 1 columns
# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")
Count RTP | |
---|---|
Name | |
Debra | 0.001260 |
Susan | 0.002034 |
Debbie | 0.002817 |
Cheryl | 0.003273 |
Carol | 0.003635 |
... | ... |
Jovi | 1.000000 |
Neta | 1.000000 |
Doni | 1.000000 |
Dondi | 1.000000 |
Kela | 1.000000 |
13661 rows × 1 columns
We can visualize the decrease in the popularity of the name "Debra:"
def plot_name(*names):
return px.line(f_babynames[f_babynames["Name"].isin(names)],
x = "Year", y = "Count", color="Name",
title=f"Popularity for: {names}")
plot_name("Debra")
# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10
Index(['Debra', 'Susan', 'Debbie', 'Cheryl', 'Carol', 'Tammy', 'Terri', 'Shannon', 'Deborah', 'Carolyn'], dtype='object', name='Name')
plot_name(*top10)
For fun, try plotting your name or your friends' names.
# Recall that we are only considering name entries marked with sex "F"
plot_name("Bella", "Dominic")
Which options give us the following result? These rows correspond to the first 3 baby names with "Count" > 250.
babynames.iloc[[0, 233, 484], [3, 4]]
babynames.loc[[0, 233, 484]]
babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].head(3)
babynames.loc[babynames["Count"]>250, ["Name", "Count"]].iloc[0:2, :]
Write code to compute the total number of babies with each name.
babynames.groupby("Name")[["Count"]].sum()
Write code to compute the total number of babies born in each year.
babynames.groupby("Year")[["Count"]].sum()