import numpy as np
import pandas as pd
import plotly.express as px

import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
# Download the state-level baby names archive once and cache it on disk;
# if the cached file already exists, the download is skipped.
if not os.path.exists(local_filename):
    # Create the cache directory first: open(..., 'wb') raises
    # FileNotFoundError when the "data/" directory does not exist yet.
    os.makedirs(os.path.dirname(local_filename), exist_ok=True)
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
# Read the California member straight out of the archive. It is parsed with
# header=None, so the column names are supplied explicitly. Both the archive
# and the member are opened as context managers so their file handles are
# closed as soon as the DataFrame has been loaded.
with zipfile.ZipFile(local_filename, 'r') as zf, zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.tail(10)

babynames.groupby("Year")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7c9731252ad0>

# Grouping by "Year" and aggregating the "Count" column
# to get the total number of babies born each year.
# The aggregation is named with the string "sum": passing the Python builtin
# `sum` raises a pandas FutureWarning, because a future pandas version will
# call the builtin directly instead of dispatching to the groupby's own sum.
babies_by_year = babynames.groupby("Year")[["Count"]].agg("sum")
babies_by_year

/tmp/ipykernel_273/4059455554.py:3: FutureWarning: The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.
  babies_by_year = babynames.groupby("Year")[["Count"]].agg(sum)

# Plotting baby counts per year
fig = px.line(babies_by_year, y="Count")
fig.update_layout(font_size=18, 
                  autosize=False, 
                  width=700, 
                  height=400)

# Small toy table used to show how groupby aggregation treats each column.
df = pd.DataFrame({
    'col1': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'C', 'B'],
    'col2': [3, 1, 4, 1, 5, 9, 2, 5, 6],
    'col3': ['ak', 'tx', 'fl', 'hi', 'mi', 'ak', 'ca', 'sd', 'nc'],
})
df

# When we don't specify the columns, pandas will try to apply the aggregation to all columns. See next cell for proof!
# The aggregation is given as the string "max" rather than the builtin `max`,
# which pandas flags with a FutureWarning.
df.groupby('col1').agg("max")

/tmp/ipykernel_273/3761652406.py:2: FutureWarning:

The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.

# Same aggregation with the columns selected explicitly; the string "max"
# avoids the FutureWarning pandas emits for the builtin `max`.
df.groupby('col1')[['col2', 'col3']].agg("max")

/tmp/ipykernel_273/3540046338.py:1: FutureWarning:

The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.

f_babynames = babynames[babynames["Sex"]=="F"]
f_babynames

jenn_entries = f_babynames[f_babynames["Name"]=="Jennifer"]
jenn_entries

# We'll talk about how to generate plots in a later lecture
fig = px.line(jenn_entries, x="Year", y="Count")

fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=1000, 
                  height=400)

# Construct a Series containing our Jennifer count data
jenn_counts_ser = jenn_entries["Count"]

# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(jenn_counts_ser)
max_jenn

6065

# NOTE: f_babynames is never explicitly sorted in this notebook. Taking the final entry as the
# most recent year relies on the rows already being in ascending "Year" order (as the displayed
# output suggests) — TODO confirm, or sort by "Year" before using .iloc[-1].
# In 2022, the most recent year for which we have data, 114 Jennifers were born
curr_jenn = jenn_counts_ser.iloc[-1]
curr_jenn

114

# Compute the RTP
curr_jenn / max_jenn

0.018796372629843364

def ratio_to_peak(series):
    """
    Return the ratio-to-peak (RTP) of a Series of per-year counts for one name.

    The RTP is the last entry divided by the largest entry, so the Series is
    expected to be ordered by year ascending (the last entry is then the most recent year).
    """
    latest = series.iloc[-1]
    peak = np.max(series)
    return latest / peak

# Then, find the RTP
ratio_to_peak(jenn_counts_ser)

0.018796372629843364

rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table

f_babynames

# Unique values in the Year column
rtp_table["Year"].unique()

array([1.])

f_babynames.sample(frac=1, replace=False).groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)

# Dropping the "Year" column
rtp_table.drop("Year", axis="columns", inplace=True)
rtp_table

# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns={"Count":"Count RTP"})
rtp_table

# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")

def plot_name(*names):
    """
    Build a line chart of "Count" against "Year" for the given names.

    Rows of f_babynames whose "Name" is one of `names` are plotted with one
    colored line per name. Returns the plotly figure.
    """
    selected = f_babynames[f_babynames["Name"].isin(names)]
    chart = px.line(
        selected,
        x="Year",
        y="Count",
        color="Name",
        title=f"Popularity for: {names}",
    )
    # Fixed-size layout so the figure renders at the same size every time.
    chart.update_layout(font_size=18, autosize=False, width=1000, height=400)
    return chart

plot_name("Debra")

# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10

Index(['Debra', 'Debbie', 'Carol', 'Tammy', 'Susan', 'Cheryl', 'Shannon',
       'Tina', 'Michele', 'Terri'],
      dtype='object', name='Name')

plot_name(*top10)

# Toy table with missing values in "num" and "state", used to compare
# groupby size/count behavior.
# np.nan is used for the missing entries: the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df = pd.DataFrame({'letter': ['A', 'A', 'B', 'C', 'C', 'C'],
                   'num': [1, 2, 3, 4, np.nan, 4],
                   'state': [np.nan, 'tx', 'fl', 'hi', np.nan, 'ak']})
df

df.groupby("letter").size()

letter
A    2
B    1
C    3
dtype: int64

df["letter"].value_counts()

letter
C    3
A    2
B    1
Name: count, dtype: int64

df.groupby("letter").count()

babynames.groupby("Name").filter(lambda sf: sf["Year"].min() > 2010)

babynames.groupby("Name").filter(lambda sf: sf["Year"].max() > 2010)

babynames.groupby("Name").filter(lambda sf: sf["Year"] > 2010)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[34], line 1
----> 1 babynames.groupby("Name").filter(lambda sf: sf["Year"] > 2010)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/pandas/core/groupby/generic.py:1932, in DataFrameGroupBy.filter(self, func, dropna, *args, **kwargs)
   1929             indices.append(self._get_index(name))
   1930     else:
   1931         # non scalars aren't allowed
-> 1932         raise TypeError(
   1933             f"filter function returned a {type(res).__name__}, "
   1934             "but expected a scalar bool"
   1935         )
   1937 return self._apply_filter(indices, dropna)

TypeError: filter function returned a Series, but expected a scalar bool

babynames.groupby(["Name", "Year"]).filter(lambda sf: sf["Year"] > 2010)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[35], line 1
----> 1 babynames.groupby(["Name", "Year"]).filter(lambda sf: sf["Year"] > 2010)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/pandas/core/groupby/generic.py:1932, in DataFrameGroupBy.filter(self, func, dropna, *args, **kwargs)
   1929             indices.append(self._get_index(name))
   1930     else:
   1931         # non scalars aren't allowed
-> 1932         raise TypeError(
   1933             f"filter function returned a {type(res).__name__}, "
   1934             "but expected a scalar bool"
   1935         )
   1937 return self._apply_filter(indices, dropna)

TypeError: filter function returned a Series, but expected a scalar bool

# Let's read the elections dataset
elections = pd.read_csv("data/elections.csv")
elections.sample(5)

elections.groupby("Year").filter(lambda sf: sf["%"].max() < 45).head(10)

# Why did we get a DataFrame instead of a Series?
# Notice that "%" is in its own sublist!
# The aggregation is given as the string "max": passing the builtin `max`
# makes pandas emit a FutureWarning.
elections_max_percentage = elections.groupby("Year")[["%"]].agg("max")
elections_max_percentage

/tmp/ipykernel_273/3570771099.py:3: FutureWarning:

The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.

elections_max_percentage.sort_values(by="%").head()

# Column-wise maximum within each party. Each column's max is taken
# independently, so a single result row can combine values that come from
# different elections. The string "max" avoids the FutureWarning pandas emits
# for the builtin `max`.
elections.groupby("Party").agg("max").head(10)

/tmp/ipykernel_273/4278286395.py:1: FutureWarning:

The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.

elections_sorted_by_percent = elections.sort_values("%", ascending=False)
elections_sorted_by_percent.head(8)

elections_sorted_by_percent.groupby("Party").first()

elections.groupby("Party")["%"].idxmax()

Party
American                  22
American Independent     115
Anti-Masonic               6
Anti-Monopoly             38
Citizens                 127
Communist                 89
Constitution             164
Constitutional Union      24
Democratic               114
Democratic-Republican      0
Dixiecrat                103
Farmer–Labor              78
Free Soil                 15
Green                    155
Greenback                 35
Independent              143
Liberal Republican        31
Libertarian              175
Libertarian Party        186
National Democratic       50
National Republican        3
National Union            27
Natural Law              148
New Alliance             136
Northern Democratic       26
Populist                  48
Progressive               68
Prohibition               49
Reform                   150
Republican               120
Socialist                 66
Southern Democratic       25
States' Rights           110
Taxpayers                147
Union                     93
Union Labor               42
Whig                      11
Name: %, dtype: int64

# This is the computational part
best_per_party = elections.loc[elections.groupby("Party")["%"].idxmax()]

# This indexes by Party to match the formatting above
best_per_party.set_index('Party').sort_index().head()

best_per_party2 = elections.sort_values("%").drop_duplicates(["Party"], keep="last")
best_per_party2.set_index("Party").sort_index().head()  # Formatting

grouped_by_party = elections.groupby("Party")
type(grouped_by_party)

pandas.core.groupby.generic.DataFrameGroupBy

grouped_by_party.groups

{'American': [22, 126], 'American Independent': [115, 119, 124], 'Anti-Masonic': [6], 'Anti-Monopoly': [38], 'Citizens': [127], 'Communist': [89], 'Constitution': [160, 164, 172], 'Constitutional Union': [24], 'Democratic': [2, 4, 8, 10, 13, 14, 17, 20, 28, 29, 34, 37, 39, 45, 47, 52, 55, 57, 64, 70, 74, 77, 81, 83, 86, 91, 94, 97, 100, 105, 108, 111, 114, 116, 118, 123, 129, 134, 137, 140, 144, 151, 158, 162, 168, 176, 178, 183], 'Democratic-Republican': [0, 1], 'Dixiecrat': [103], 'Farmer–Labor': [78], 'Free Soil': [15, 18], 'Green': [149, 155, 156, 165, 170, 177, 181, 184], 'Greenback': [35], 'Independent': [121, 130, 143, 161, 167, 174, 185], 'Liberal Republican': [31], 'Libertarian': [125, 128, 132, 138, 139, 146, 153, 159, 163, 169, 175, 180], 'Libertarian Party': [186], 'National Democratic': [50], 'National Republican': [3, 5], 'National Union': [27], 'Natural Law': [148], 'New Alliance': [136], 'Northern Democratic': [26], 'Populist': [48, 61, 141], 'Progressive': [68, 82, 101, 107], 'Prohibition': [41, 44, 49, 51, 54, 59, 63, 67, 73, 75, 99], 'Reform': [150, 154], 'Republican': [21, 23, 30, 32, 33, 36, 40, 43, 46, 53, 56, 60, 65, 69, 72, 79, 80, 84, 87, 90, 96, 98, 104, 106, 109, 112, 113, 117, 120, 122, 131, 133, 135, 142, 145, 152, 157, 166, 171, 173, 179, 182], 'Socialist': [58, 62, 66, 71, 76, 85, 88, 92, 95, 102], 'Southern Democratic': [25], 'States' Rights': [110], 'Taxpayers': [147], 'Union': [93], 'Union Labor': [42], 'Whig': [7, 9, 11, 12, 16, 19]}

grouped_by_party.get_group("Socialist")

babynames.groupby(["Year", "Sex"])[["Count"]].sum().head(6)

# Total "Count" per year, with one column per value of "Sex".
# aggfunc is given as the string "sum": passing np.sum makes pandas emit a
# FutureWarning because it currently substitutes its own groupby sum.
babynames.pivot_table(
    index="Year",
    columns="Sex",
    values="Count",
    aggfunc="sum").head(6)

/tmp/ipykernel_273/3588306476.py:1: FutureWarning:

The provided callable <function sum at 0x7c975ce83740> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.

# Maximum of both "Count" and "Name" per year, split by "Sex".
# aggfunc is given as the string "max": passing np.max makes pandas emit a
# FutureWarning because it currently substitutes its own groupby max.
babynames.pivot_table(
    index="Year",
    columns="Sex",
    values=["Count", "Name"],
    aggfunc="max").head(6)

/tmp/ipykernel_273/3274469386.py:1: FutureWarning:

The provided callable <function max at 0x7c975ce83e20> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.

elections.head(10)

babynames_2022 = babynames[babynames["Year"]==2022]
babynames_2022.head(10)

elections["First Name"] = elections["Candidate"].str.split(" ").str[0]
elections

merged = pd.merge(left=elections, right=babynames_2022, 
                  left_on="First Name", right_on="Name")
merged

merged.sort_values("Count", ascending=False)

	Count
Year
1910	9163
1911	9983
1912	17946
1913	22094
1914	26926
...	...
2018	395436
2019	386996
2020	362882
2021	362582
2022	360023

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134
...	...	...	...	...	...
239532	CA	F	2022	Zemira	5
239533	CA	F	2022	Ziggy	5
239534	CA	F	2022	Zimal	5
239535	CA	F	2022	Zosia	5
239536	CA	F	2022	Zulay	5

	Year	Count
Name
Aadhini	1.0	1.000000
Aadhira	1.0	0.500000
Aadhya	1.0	0.660000
Aadya	1.0	0.586207
Aahana	1.0	0.269231
...	...	...
Zyanya	1.0	0.466667
Zyla	1.0	1.000000
Zylah	1.0	1.000000
Zyra	1.0	1.000000
Zyrah	1.0	0.833333

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134
...	...	...	...	...	...
239532	CA	F	2022	Zemira	5
239533	CA	F	2022	Ziggy	5
239534	CA	F	2022	Zimal	5
239535	CA	F	2022	Zosia	5
239536	CA	F	2022	Zulay	5

	Year	Count
Name
Aadhini	1.000000	1.000000
Aadhira	0.999505	0.700000
Aadhya	0.994560	0.160000
Aadya	0.998516	0.793103
Aahana	0.995054	0.653846
...	...	...
Zyanya	0.993076	0.666667
Zyla	1.000000	1.000000
Zylah	0.996044	0.642857
Zyra	0.996538	0.437500
Zyrah	1.000000	0.833333

Lecture 4 – Data 100, Summer 2025¶

Loading `babynames` Dataset¶

Grouping¶

Slido Exercise¶

Case Study: Name "Popularity"¶

Slido Exercise¶

`groupby.size` and `groupby.count()`¶

Filtering by Group¶

Slido Exercise¶

`groupby` Puzzle¶

Attempt #1¶

Attempt #2¶

Alternative Solutions¶

`DataFrameGroupBy` Objects¶

Pivot Tables¶

`Groupby` with multiple columns¶

`pivot_table`¶

`pivot_table` with Multiple values¶

Join Tables¶

	State	Sex	Year	Name	Count
407418	CA	M	2022	Zach	5
407419	CA	M	2022	Zadkiel	5
407420	CA	M	2022	Zae	5
407421	CA	M	2022	Zai	5
407422	CA	M	2022	Zay	5
407423	CA	M	2022	Zayvier	5
407424	CA	M	2022	Zia	5
407425	CA	M	2022	Zora	5
407426	CA	M	2022	Zuriel	5
407427	CA	M	2022	Zylo	5

	State	Sex	Year	Name	Count
13610	CA	F	1934	Jennifer	5
16325	CA	F	1938	Jennifer	5
16993	CA	F	1939	Jennifer	6
17533	CA	F	1940	Jennifer	13
18210	CA	F	1941	Jennifer	24
...	...	...	...	...	...
221406	CA	F	2018	Jennifer	167
225149	CA	F	2019	Jennifer	145
228787	CA	F	2020	Jennifer	141
232561	CA	F	2021	Jennifer	91
236136	CA	F	2022	Jennifer	114

	Count RTP
Name
Debra	0.001260
Debbie	0.002815
Carol	0.003180
Tammy	0.003249
Susan	0.003305
...	...
Zyla	1.000000
Zylah	1.000000
Zyra	1.000000
Aahna	1.000000
Aadhini	1.000000

	State	Sex	Year	Name	Count
195266	CA	F	2011	Mileidy	15
195280	CA	F	2011	Solara	15
195370	CA	F	2011	Yorley	14
195705	CA	F	2011	Mileydi	11
195832	CA	F	2011	Kensie	10
...	...	...	...	...	...
407420	CA	M	2022	Zae	5
407421	CA	M	2022	Zai	5
407422	CA	M	2022	Zay	5
407423	CA	M	2022	Zayvier	5
407427	CA	M	2022	Zylo	5

	Year	Candidate	Party	Popular vote	Result	%
171	2012	Mitt Romney	Republican	60933504	loss	47.384076
118	1972	George McGovern	Democratic	29173222	loss	37.670670
125	1976	Roger MacBride	Libertarian	172557	loss	0.212451
55	1900	William Jennings Bryan	Democratic	6370932	loss	46.130540
142	1992	George H. W. Bush	Republican	39104550	loss	37.544784

	Year	Candidate	Party	Popular vote	Result	%
23	1860	Abraham Lincoln	Republican	1855993	win	39.699408
24	1860	John Bell	Constitutional Union	590901	loss	12.639283
25	1860	John C. Breckinridge	Southern Democratic	848019	loss	18.138998
26	1860	Stephen A. Douglas	Northern Democratic	1380202	loss	29.522311
66	1912	Eugene V. Debs	Socialist	901551	loss	6.004354
67	1912	Eugene W. Chafin	Prohibition	208156	loss	1.386325
68	1912	Theodore Roosevelt	Progressive	4122721	loss	27.457433
69	1912	William Taft	Republican	3486242	loss	23.218466
70	1912	Woodrow Wilson	Democratic	6296284	win	41.933422
115	1968	George Wallace	American Independent	9901118	loss	13.571218

	%
Year
1824	57.210122
1828	56.203927
1832	54.574789
1836	52.272472
1840	53.051213
1844	50.749477
1848	47.309296
1852	51.013168
1856	45.306080
1860	39.699408
1864	54.951512
1868	52.665305
1872	55.928594
1876	51.528376
1880	48.369234
1884	48.884933
1888	48.656799
1892	46.121393
1896	51.213817
1900	52.342640
1904	56.562787
1908	52.013300
1912	41.933422
1916	49.367987
1920	60.574501
1924	54.329113
1928	58.368524
1932	57.672125
1936	60.978107
1940	54.871202
1944	53.773801
1948	49.601536
1952	55.325173
1956	57.650654
1960	50.082561
1964	61.344703
1968	43.565246
1972	60.907806
1976	50.271900
1980	50.897944
1984	59.023326
1988	53.518845
1992	43.118485
1996	49.296938
2000	48.491813
2004	50.771824
2008	53.023510
2012	51.258484
2016	48.521539
2020	51.311515
2024	49.808629

	Year	Candidate	Popular vote	Result	%
Party
American	1976	Thomas J. Anderson	873053	loss	21.554001
American Independent	1976	Lester Maddox	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2016	Michael Peroutka	203091	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	2024	Woodrow Wilson	81268924	win	61.344703
Democratic-Republican	1824	John Quincy Adams	151271	win	57.210122

	Year	Candidate	Party	Popular vote	Result	%
114	1964	Lyndon Johnson	Democratic	43127041	win	61.344703
91	1936	Franklin Roosevelt	Democratic	27752648	win	60.978107
120	1972	Richard Nixon	Republican	47168710	win	60.907806
79	1920	Warren Harding	Republican	16144093	win	60.574501
133	1984	Ronald Reagan	Republican	54455472	win	59.023326
84	1928	Herbert Hoover	Republican	21427123	win	58.368524
86	1932	Franklin Roosevelt	Democratic	22821277	win	57.672125
109	1956	Dwight Eisenhower	Republican	35579180	win	57.650654

		Count
Year	Sex
1910	F	5950
1910	M	3213
1911	F	6602
1911	M	3381
1912	F	9804
1912	M	8142

	col1	col2	col3
0	A	3	ak
1	B	1	tx
2	C	4	fl
3	A	1	hi
4	B	5	mi
5	C	9	ak
6	A	2	ca
7	C	5	sd
8	B	6	nc

	col2	col3
col1
A	3	hi
B	6	tx
C	9	sd

	col2	col3
col1
A	3	hi
B	6	tx
C	9	sd

	letter	num	state
0	A	1.0	NaN
1	A	2.0	tx
2	B	3.0	fl
3	C	4.0	hi
4	C	NaN	NaN
5	C	4.0	ak

	Count		Name
Sex	F	M	F	M
Year
1910	295	237	Yvonne	William
1911	390	214	Zelma	Willis
1912	534	501	Yvonne	Woodrow
1913	584	614	Zelma	Yoshio
1914	773	769	Zelma	Yoshio
1915	998	1033	Zita	Yukio

	Year	Candidate	Party	Popular vote	Result	%
0	1824	Andrew Jackson	Democratic-Republican	151271	loss	57.210122
1	1824	John Quincy Adams	Democratic-Republican	113142	win	42.789878
2	1828	Andrew Jackson	Democratic	642806	win	56.203927
3	1828	John Quincy Adams	National Republican	500897	loss	43.796073
4	1832	Andrew Jackson	Democratic	702735	win	54.574789
5	1832	Henry Clay	National Republican	484205	loss	37.603628
6	1832	William Wirt	Anti-Masonic	100715	loss	7.821583
7	1836	Hugh Lawson White	Whig	146109	loss	10.005985
8	1836	Martin Van Buren	Democratic	763291	win	52.272472
9	1836	William Henry Harrison	Whig	550816	loss	37.721543

	State	Sex	Year	Name	Count
235835	CA	F	2022	Olivia	2178
235836	CA	F	2022	Emma	2080
235837	CA	F	2022	Camila	2046
235838	CA	F	2022	Mia	1882
235839	CA	F	2022	Sophia	1762
235840	CA	F	2022	Isabella	1733
235841	CA	F	2022	Luna	1516
235842	CA	F	2022	Sofia	1307
235843	CA	F	2022	Amelia	1289
235844	CA	F	2022	Gianna	1107

Lecture 4 – Data 100, Summer 2025¶

Loading babynames Dataset¶

Grouping¶

Slido Exercise¶

Case Study: Name "Popularity"¶

Slido Exercise¶

groupby.size and groupby.count()¶

Filtering by Group¶

Slido Exercise¶

groupby Puzzle¶

Attempt #1¶

Attempt #2¶

Alternative Solutions¶

DataFrameGroupBy Objects¶

Pivot Tables¶

Groupby with multiple columns¶

pivot_table¶

pivot_table with Multiple values¶

Join Tables¶

Loading `babynames` Dataset¶

`groupby.size` and `groupby.count()`¶

`groupby` Puzzle¶

`DataFrameGroupBy` Objects¶

`Groupby` with multiple columns¶

`pivot_table`¶

`pivot_table` with Multiple values¶