Lecture 4 – Data 100, Spring 2025¶


A demonstration of advanced pandas syntax to accompany Lecture 4.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

Loading babynames Dataset¶

In [2]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data already exists, don't download it again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.tail(10)
Out[2]:
State Sex Year Name Count
407418 CA M 2022 Zach 5
407419 CA M 2022 Zadkiel 5
407420 CA M 2022 Zae 5
407421 CA M 2022 Zai 5
407422 CA M 2022 Zay 5
407423 CA M 2022 Zayvier 5
407424 CA M 2022 Zia 5
407425 CA M 2022 Zora 5
407426 CA M 2022 Zuriel 5
407427 CA M 2022 Zylo 5

Custom sorting¶

Approach 1: Create a temporary column¶

In [3]:
# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames.head(5)
Out[3]:
State Sex Year Name Count name_lengths
0 CA F 1910 Mary 295 4
1 CA F 1910 Helen 239 5
2 CA F 1910 Dorothy 220 7
3 CA F 1910 Margaret 163 8
4 CA F 1910 Frances 134 7
In [4]:
# Sort by the temporary column
babynames = babynames.sort_values(by="name_lengths", ascending=False)
babynames.head(5)
Out[4]:
State Sex Year Name Count name_lengths
334166 CA M 1996 Franciscojavier 8 15
337301 CA M 1997 Franciscojavier 5 15
339472 CA M 1998 Franciscojavier 6 15
321792 CA M 1991 Ryanchristopher 7 15
327358 CA M 1993 Johnchristopher 5 15
In [5]:
# Drop the 'name_lengths' column
babynames = babynames.drop("name_lengths", axis="columns")
babynames.head(5)
Out[5]:
State Sex Year Name Count
334166 CA M 1996 Franciscojavier 8
337301 CA M 1997 Franciscojavier 5
339472 CA M 1998 Franciscojavier 6
321792 CA M 1991 Ryanchristopher 7
327358 CA M 1993 Johnchristopher 5

Approach 2: Sorting using the key argument¶

Apply the key function to the values before sorting. The key function should be vectorized: it should expect a Series and return a Series with the same shape as the input. It will be applied independently to each column listed in by.

In [6]:
babynames.sort_values(by="Name", key=lambda x:x.str.len(), ascending=False).head()
Out[6]:
State Sex Year Name Count
334166 CA M 1996 Franciscojavier 8
327472 CA M 1993 Ryanchristopher 5
337301 CA M 1997 Franciscojavier 5
337477 CA M 1997 Ryanchristopher 5
312543 CA M 1987 Franciscojavier 5
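Because the key function is called once for each column listed in by, it must be able to handle every column you sort on. Here is a sketch (not from the lecture) that sorts by name length and breaks ties with the raw Count; it relies on the passed Series keeping its column name:

# Sketch: when `by` lists multiple columns, the key function receives each
# column as its own Series; checking the Series' name lets us treat the
# string "Name" column and the numeric "Count" column differently.
babynames.sort_values(
    by=["Name", "Count"],
    key=lambda col: col.str.len() if col.name == "Name" else col,
    ascending=False
).head()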

Approach 3: Sorting Using the map Function¶

We can also use the Series map method if we want to apply an arbitrarily defined function. Suppose we want to sort by the number of occurrences of "dr" plus the number of occurrences of "ea" in each name.

In [7]:
# First, define a function to count the number of times "dr" or "ea" appear in each name
def dr_ea_count(string):
    return string.count('dr') + string.count('ea')

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames = babynames.sort_values(by="dr_ea_count", ascending=False)
babynames.head()
Out[7]:
State Sex Year Name Count dr_ea_count
115957 CA F 1990 Deandrea 5 3
101976 CA F 1986 Deandrea 6 3
131029 CA F 1994 Leandrea 5 3
108731 CA F 1988 Deandrea 5 3
308131 CA M 1985 Deandrea 6 3
In [8]:
# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis="columns")
babynames.head(5)
Out[8]:
State Sex Year Name Count
115957 CA F 1990 Deandrea 5
101976 CA F 1986 Deandrea 6
131029 CA F 1994 Leandrea 5
108731 CA F 1988 Deandrea 5
308131 CA M 1985 Deandrea 6
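As an aside, the key argument from Approach 2 can reuse this same function, avoiding the temporary column altogether. A quick sketch (not part of the original notebook):

# Sketch: pass dr_ea_count through the key argument instead of creating
# and then dropping a helper column.
babynames.sort_values(
    by="Name",
    key=lambda s: s.map(dr_ea_count),
    ascending=False
).head()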

Grouping¶

Group rows that share a common feature, then aggregate data across the group.

In this example, we count the total number of babies born each year (considering only a small subset of the data for simplicity).

In [9]:
babynames.groupby("Year")
Out[9]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7c77cd79b690>
In [10]:
# Select only the numerical columns of interest, then group by "Year"
babies_by_year = babynames[["Year", "Count"]].groupby("Year").agg(sum)
babies_by_year
Out[10]:
Count
Year
1910 9163
1911 9983
1912 17946
1913 22094
1914 26926
... ...
2018 395436
2019 386996
2020 362882
2021 362582
2022 360023

113 rows × 1 columns

In [11]:
# Plotting baby counts per year
fig = px.line(babies_by_year, y="Count")
fig.update_layout(font_size=18, 
                  autosize=False, 
                  width=700, 
                  height=400)

Slido Exercise¶

Try to predict the results of the groupby operation shown. The answer is below the image.

[Image: groupby puzzle]

The first ?? will be "hi", the second ?? will be "tx", and the third ?? will be "sd". Note that max is applied to each column independently, so a group's x and y maxima can come from different rows.

In [12]:
ds = pd.DataFrame(dict(x=[3, 1, 4, 1, 5, 9, 2, 5, 6], 
                      y=['ak', 'tx', 'fl', 'hi', 'mi', 'ak', 'ca', 'sd', 'nc']), 
                      index=list('ABCABCACB') )
ds
Out[12]:
x y
A 3 ak
B 1 tx
C 4 fl
A 1 hi
B 5 mi
C 9 ak
A 2 ca
C 5 sd
B 6 nc
In [13]:
# Performing groupby on the first column with max aggregation function
ds.groupby(ds.index).agg(max)
Out[13]:
x y
A 3 hi
B 6 tx
C 9 sd

Case Study: Name "Popularity"¶

In this exercise, let's find the name with sex "F" that has dropped most in popularity since its peak usage in California. We'll start by filtering babynames to only include names corresponding to sex "F".

In [14]:
f_babynames = babynames[babynames["Sex"]=="F"]
f_babynames
Out[14]:
State Sex Year Name Count
115957 CA F 1990 Deandrea 5
101976 CA F 1986 Deandrea 6
131029 CA F 1994 Leandrea 5
108731 CA F 1988 Deandrea 5
193104 CA F 2010 Deandra 5
... ... ... ... ... ...
192320 CA F 2010 Annaly 7
212793 CA F 2015 Iyanna 6
202350 CA F 2013 Lailah 50
212784 CA F 2015 Helene 6
212787 CA F 2015 Holley 6

239537 rows × 5 columns

In [15]:
# We sort the data by "Year"
f_babynames = f_babynames.sort_values("Year")
f_babynames
Out[15]:
State Sex Year Name Count
23 CA F 1910 Bernice 59
219 CA F 1910 Katharine 5
170 CA F 1910 Clarice 7
26 CA F 1910 Doris 56
30 CA F 1910 Ethel 52
... ... ... ... ... ...
236551 CA F 2022 Dani 47
235914 CA F 2022 Iris 361
235918 CA F 2022 Lucy 345
235973 CA F 2022 Sage 226
239452 CA F 2022 Rylin 5

239537 rows × 5 columns

To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [16]:
jenn_entries = f_babynames[f_babynames["Name"]=="Jennifer"]
jenn_entries
Out[16]:
State Sex Year Name Count
13610 CA F 1934 Jennifer 5
16325 CA F 1938 Jennifer 5
16993 CA F 1939 Jennifer 6
17533 CA F 1940 Jennifer 13
18210 CA F 1941 Jennifer 24
... ... ... ... ... ...
221406 CA F 2018 Jennifer 167
225149 CA F 2019 Jennifer 145
228787 CA F 2020 Jennifer 141
232561 CA F 2021 Jennifer 91
236136 CA F 2022 Jennifer 114

86 rows × 5 columns

In [17]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(jenn_entries, x="Year", y="Count")

fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=1000, 
                  height=400)

We'll need a mathematical definition for the change in popularity of a name in California.

Define the metric "Ratio to Peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in any year.

A demo calculation for Jennifer:

In [18]:
# Construct a Series containing our Jennifer count data
jenn_counts_ser = jenn_entries["Count"]
In [19]:
# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(jenn_counts_ser)
max_jenn
Out[19]:
6065
In [20]:
# Remember that we sorted f_babynames by "Year". 
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
curr_jenn = jenn_counts_ser.iloc[-1]
curr_jenn
Out[20]:
114
In [21]:
# Compute the RTP
curr_jenn / max_jenn
Out[21]:
0.018796372629843364

We can also write a function that produces the ratio_to_peak for a given Series. This will allow us to use .groupby to compute the RTP for every name in the dataset at once.

In [22]:
def ratio_to_peak(series):
    """
    Compute the RTP for a Series containing the counts per year for a single name.
    Assumes the counts are sorted by year in ascending order.
    """
    return series.iloc[-1] / np.max(series)
In [23]:
# Then, find the RTP
ratio_to_peak(jenn_counts_ser)
Out[23]:
0.018796372629843364

Now, let's use .groupby to compute the RTPs for all names in the dataset.

You may see a warning message when running the cell below. As discussed in lecture, pandas can't apply an aggregation function like this to non-numeric data (it doesn't make sense to divide "CA" by a number). Instead, we can select the numerical columns of interest directly.

In [24]:
rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table
Out[24]:
Year Count
Name
Aadhini 1.0 1.000000
Aadhira 1.0 0.500000
Aadhya 1.0 0.660000
Aadya 1.0 0.586207
Aahana 1.0 0.269231
... ... ...
Zyanya 1.0 0.466667
Zyla 1.0 1.000000
Zylah 1.0 1.000000
Zyra 1.0 1.000000
Zyrah 1.0 0.833333

13782 rows × 2 columns

In [25]:
# Results in a TypeError
# rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
# rtp_table

This is the pandas equivalent of .group from Data 8. If we wanted to achieve this same result using the datascience library, we would write:

f_babynames.group("Name", ratio_to_peak)

Slido Exercise¶

Is there a row where Year is not equal to 1?

In [26]:
# Unique values in the Year column
rtp_table["Year"].unique()
Out[26]:
array([1.])
In [27]:
# Dropping the "Year" column
rtp_table.drop("Year", axis="columns", inplace=True)
rtp_table
Out[27]:
Count
Name
Aadhini 1.000000
Aadhira 0.500000
Aadhya 0.660000
Aadya 0.586207
Aahana 0.269231
... ...
Zyanya 0.466667
Zyla 1.000000
Zylah 1.000000
Zyra 1.000000
Zyrah 0.833333

13782 rows × 1 columns

In [28]:
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns={"Count":"Count RTP"})
rtp_table
Out[28]:
Count RTP
Name
Aadhini 1.000000
Aadhira 0.500000
Aadhya 0.660000
Aadya 0.586207
Aahana 0.269231
... ...
Zyanya 0.466667
Zyla 1.000000
Zylah 1.000000
Zyra 1.000000
Zyrah 0.833333

13782 rows × 1 columns

In [29]:
# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")
Out[29]:
Count RTP
Name
Debra 0.001260
Debbie 0.002815
Carol 0.003180
Tammy 0.003249
Susan 0.003305
... ...
Fidelia 1.000000
Naveyah 1.000000
Finlee 1.000000
Roseline 1.000000
Aadhini 1.000000

13782 rows × 1 columns

We can visualize the decrease in the popularity of the name "Debra":

In [30]:
def plot_name(*names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)], 
                  x="Year", y="Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size=18, 
                  autosize=False, 
                  width=1000, 
                  height=400)
    return fig

plot_name("Debra")
In [31]:
# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10
Out[31]:
Index(['Debra', 'Debbie', 'Carol', 'Tammy', 'Susan', 'Cheryl', 'Shannon',
       'Tina', 'Michele', 'Terri'],
      dtype='object', name='Name')
In [32]:
plot_name(*top10)

For fun, try plotting your name or your friends' names.

groupby.size() and groupby.count()¶

In [33]:
df = pd.DataFrame({'letter':['A', 'A', 'B', 'C', 'C', 'C'], 
                   'num':[1, 2, 3, 4, np.nan, 4], 
                   'state':[np.nan, 'tx', 'fl', 'hi', np.nan, 'ak']})
df
Out[33]:
letter num state
0 A 1.0 NaN
1 A 2.0 tx
2 B 3.0 fl
3 C 4.0 hi
4 C NaN NaN
5 C 4.0 ak

groupby.size() returns a Series, indexed by the letters that we grouped by, with values denoting the number of rows in each group/sub-DataFrame. It does not care about missing (NaN) values.

In [34]:
df.groupby("letter").size()
Out[34]:
letter
A    2
B    1
C    3
dtype: int64

groupby.count() returns a DataFrame, indexed by the letters that we grouped by. Each column represents the number of non-missing values for that letter.

In [35]:
df.groupby("letter").count()
Out[35]:
num state
letter
A 2 1
B 1 1
C 2 2

You might recall the value_counts() function we talked about last week. What's the difference?

In [36]:
df["letter"].value_counts()
Out[36]:
letter
C    3
A    2
B    1
Name: count, dtype: int64

It turns out that value_counts() does something similar to groupby.size(), except that it also sorts the resulting Series in descending order.
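As a quick check (not in the original notebook), sorting the grouped result reproduces the value_counts() output:

# groupby.size() sorted in descending order mirrors value_counts()
df.groupby("letter").size().sort_values(ascending=False)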

Filtering by Group¶

In [37]:
# Let's read the elections dataset
elections = pd.read_csv("data/elections.csv")
elections.sample(5)
Out[37]:
Year Candidate Party Popular vote Result %
163 2008 Bob Barr Libertarian 523715 loss 0.399565
58 1904 Eugene V. Debs Socialist 402810 loss 2.985897
150 1996 Ross Perot Reform 8085294 loss 8.408844
48 1892 James B. Weaver Populist 1041028 loss 8.645038
118 1972 George McGovern Democratic 29173222 loss 37.670670

Let's keep only the election years where the maximum vote share (%) is less than 45%.

In [38]:
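# .filter keeps every row of each group for which the given function returns True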
elections.groupby("Year").filter(lambda sf: sf["%"].max() < 45).head(10)
Out[38]:
Year Candidate Party Popular vote Result %
23 1860 Abraham Lincoln Republican 1855993 win 39.699408
24 1860 John Bell Constitutional Union 590901 loss 12.639283
25 1860 John C. Breckinridge Southern Democratic 848019 loss 18.138998
26 1860 Stephen A. Douglas Northern Democratic 1380202 loss 29.522311
66 1912 Eugene V. Debs Socialist 901551 loss 6.004354
67 1912 Eugene W. Chafin Prohibition 208156 loss 1.386325
68 1912 Theodore Roosevelt Progressive 4122721 loss 27.457433
69 1912 William Taft Republican 3486242 loss 23.218466
70 1912 Woodrow Wilson Democratic 6296284 win 41.933422
115 1968 George Wallace American Independent 9901118 loss 13.571218
In [39]:
elections_max_percentage = elections.groupby("Year")[["%"]].agg(max)
elections_max_percentage
Out[39]:
%
Year
1824 57.210122
1828 56.203927
1832 54.574789
1836 52.272472
1840 53.051213
1844 50.749477
1848 47.309296
1852 51.013168
1856 45.306080
1860 39.699408
1864 54.951512
1868 52.665305
1872 55.928594
1876 51.528376
1880 48.369234
1884 48.884933
1888 48.656799
1892 46.121393
1896 51.213817
1900 52.342640
1904 56.562787
1908 52.013300
1912 41.933422
1916 49.367987
1920 60.574501
1924 54.329113
1928 58.368524
1932 57.672125
1936 60.978107
1940 54.871202
1944 53.773801
1948 49.601536
1952 55.325173
1956 57.650654
1960 50.082561
1964 61.344703
1968 43.565246
1972 60.907806
1976 50.271900
1980 50.897944
1984 59.023326
1988 53.518845
1992 43.118485
1996 49.296938
2000 48.491813
2004 50.771824
2008 53.023510
2012 51.258484
2016 48.521539
2020 51.311515
2024 49.808629
In [40]:
elections_max_percentage.sort_values(by="%").head()
Out[40]:
%
Year
1860 39.699408
1912 41.933422
1992 43.118485
1968 43.565246
1856 45.306080

groupby Puzzle¶

Suppose we want to know each party's best election result.

Attempt #1¶

We have to be careful when using aggregation functions. For example, the code below might be misinterpreted to say that Woodrow Wilson successfully ran for election in 2024. Why is this happening?

In [41]:
elections.groupby("Party").agg(max).head(10)
Out[41]:
Year Candidate Popular vote Result %
Party
American 1976 Thomas J. Anderson 873053 loss 21.554001
American Independent 1976 Lester Maddox 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182
Communist 1932 William Z. Foster 103307 loss 0.261069
Constitution 2016 Michael Peroutka 203091 loss 0.152398
Constitutional Union 1860 John Bell 590901 loss 12.639283
Democratic 2024 Woodrow Wilson 81268924 win 61.344703
Democratic-Republican 1824 John Quincy Adams 151271 win 57.210122

Attempt #2¶

Next, we'll write code that properly returns the best result by each party. That is, each row should show the Year, Candidate, Popular Vote, Result, and % for the election in which that party saw its best results (rather than mixing them as in the example above). Here's what the first rows of the correct output should look like:

[Image: parties.png, the first rows of the expected output]

In [42]:
elections_sorted_by_percent = elections.sort_values("%", ascending=False)
elections_sorted_by_percent.head(8)
Out[42]:
Year Candidate Party Popular vote Result %
114 1964 Lyndon Johnson Democratic 43127041 win 61.344703
91 1936 Franklin Roosevelt Democratic 27752648 win 60.978107
120 1972 Richard Nixon Republican 47168710 win 60.907806
79 1920 Warren Harding Republican 16144093 win 60.574501
133 1984 Ronald Reagan Republican 54455472 win 59.023326
84 1928 Herbert Hoover Republican 21427123 win 58.368524
86 1932 Franklin Roosevelt Democratic 22821277 win 57.672125
109 1956 Dwight Eisenhower Republican 35579180 win 57.650654
In [43]:
elections_sorted_by_percent.groupby("Party").first()
Out[43]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182
Communist 1932 William Z. Foster 103307 loss 0.261069
Constitution 2008 Chuck Baldwin 199750 loss 0.152398
Constitutional Union 1860 John Bell 590901 loss 12.639283
Democratic 1964 Lyndon Johnson 43127041 win 61.344703
Democratic-Republican 1824 Andrew Jackson 151271 loss 57.210122
Dixiecrat 1948 Strom Thurmond 1175930 loss 2.412304
Farmer–Labor 1920 Parley P. Christensen 265398 loss 0.995804
Free Soil 1848 Martin Van Buren 291501 loss 10.138474
Green 2000 Ralph Nader 2882955 loss 2.741176
Greenback 1880 James B. Weaver 308649 loss 3.352344
Independent 1992 Ross Perot 19743821 loss 18.956298
Liberal Republican 1872 Horace Greeley 2834761 loss 44.071406
Libertarian 2016 Gary Johnson 4489235 loss 3.307714
Libertarian Party 2024 Chase Oliver 650130 loss 0.418895
National Democratic 1896 John M. Palmer 134645 loss 0.969566
National Republican 1828 John Quincy Adams 500897 loss 43.796073
National Union 1864 Abraham Lincoln 2211317 win 54.951512
Natural Law 1996 John Hagelin 113670 loss 0.118219
New Alliance 1988 Lenora Fulani 217221 loss 0.237804
Northern Democratic 1860 Stephen A. Douglas 1380202 loss 29.522311
Populist 1892 James B. Weaver 1041028 loss 8.645038
Progressive 1912 Theodore Roosevelt 4122721 loss 27.457433
Prohibition 1892 John Bidwell 270879 loss 2.249468
Reform 1996 Ross Perot 8085294 loss 8.408844
Republican 1972 Richard Nixon 47168710 win 60.907806
Socialist 1912 Eugene V. Debs 901551 loss 6.004354
Southern Democratic 1860 John C. Breckinridge 848019 loss 18.138998
States' Rights 1956 T. Coleman Andrews 107929 loss 0.174883
Taxpayers 1996 Howard Phillips 184656 loss 0.192045
Union 1936 William Lemke 892378 loss 1.960733
Union Labor 1888 Alson Streeter 146602 loss 1.288861
Whig 1840 William Henry Harrison 1275583 win 53.051213

Alternative Solutions¶

You'll soon discover that with pandas' rich tool set, there's typically more than one way to get to the same answer. Each approach has different tradeoffs in terms of readability, performance, memory consumption, complexity, and more. It will take some experience for you to develop a sense of which approach is better for each problem, but you should, in general, try to consider whether you can envision a different solution to a given problem, especially if you find your current solution particularly convoluted or hard to read.

Here are a couple of other ways of obtaining the same result (in each case, we only show the top part with head()). The first approach uses groupby but finds the location of the maximum value via the idxmax() method (look up its documentation!). We then index and sort by Party to match the requested formatting:

In [44]:
elections.groupby("Party")["%"].idxmax()
Out[44]:
Party
American                  22
American Independent     115
Anti-Masonic               6
Anti-Monopoly             38
Citizens                 127
Communist                 89
Constitution             164
Constitutional Union      24
Democratic               114
Democratic-Republican      0
Dixiecrat                103
Farmer–Labor              78
Free Soil                 15
Green                    155
Greenback                 35
Independent              143
Liberal Republican        31
Libertarian              175
Libertarian Party        186
National Democratic       50
National Republican        3
National Union            27
Natural Law              148
New Alliance             136
Northern Democratic       26
Populist                  48
Progressive               68
Prohibition               49
Reform                   150
Republican               120
Socialist                 66
Southern Democratic       25
States' Rights           110
Taxpayers                147
Union                     93
Union Labor               42
Whig                      11
Name: %, dtype: int64
In [45]:
# This is the computational part
best_per_party = elections.loc[elections.groupby("Party")["%"].idxmax()]

# This indexes by Party to match the formatting above
best_per_party.set_index('Party').sort_index().head() 
Out[45]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182

And this one doesn't even use groupby! This approach instead uses the drop_duplicates method to keep only the last occurrence of each party after having sorted by "%", which corresponds to that party's best performance. Again, the second line is purely formatting:

In [46]:
best_per_party2 = elections.sort_values("%").drop_duplicates(["Party"], keep="last")
best_per_party2.set_index("Party").sort_index().head()  # Formatting
Out[46]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182

Challenge: See if you can find yet another approach that still gives the same answer.
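If you get stuck, here is one possible sketch (there are others, and this is not from the lecture):

# Sketch: after sorting by "%" in descending order, groupby(...).head(1)
# keeps the top row of each party's group, i.e. that party's best result.
best_per_party3 = (
    elections.sort_values("%", ascending=False)
             .groupby("Party")
             .head(1)
)
best_per_party3.set_index("Party").sort_index().head()  # Formatting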

DataFrameGroupBy Objects¶

The result of groupby is not a DataFrame or a list of DataFrames. It is instead a special type called a DataFrameGroupBy.

In [47]:
grouped_by_party = elections.groupby("Party")
type(grouped_by_party)
Out[47]:
pandas.core.groupby.generic.DataFrameGroupBy

GroupBy objects are structured like dictionaries. In fact, we can see the underlying dictionary with the following code:

In [48]:
grouped_by_party.groups
Out[48]:
{'American': [22, 126], 'American Independent': [115, 119, 124], 'Anti-Masonic': [6], 'Anti-Monopoly': [38], 'Citizens': [127], 'Communist': [89], 'Constitution': [160, 164, 172], 'Constitutional Union': [24], 'Democratic': [2, 4, 8, 10, 13, 14, 17, 20, 28, 29, 34, 37, 39, 45, 47, 52, 55, 57, 64, 70, 74, 77, 81, 83, 86, 91, 94, 97, 100, 105, 108, 111, 114, 116, 118, 123, 129, 134, 137, 140, 144, 151, 158, 162, 168, 176, 178, 183], 'Democratic-Republican': [0, 1], 'Dixiecrat': [103], 'Farmer–Labor': [78], 'Free Soil': [15, 18], 'Green': [149, 155, 156, 165, 170, 177, 181, 184], 'Greenback': [35], 'Independent': [121, 130, 143, 161, 167, 174, 185], 'Liberal Republican': [31], 'Libertarian': [125, 128, 132, 138, 139, 146, 153, 159, 163, 169, 175, 180], 'Libertarian Party': [186], 'National Democratic': [50], 'National Republican': [3, 5], 'National Union': [27], 'Natural Law': [148], 'New Alliance': [136], 'Northern Democratic': [26], 'Populist': [48, 61, 141], 'Progressive': [68, 82, 101, 107], 'Prohibition': [41, 44, 49, 51, 54, 59, 63, 67, 73, 75, 99], 'Reform': [150, 154], 'Republican': [21, 23, 30, 32, 33, 36, 40, 43, 46, 53, 56, 60, 65, 69, 72, 79, 80, 84, 87, 90, 96, 98, 104, 106, 109, 112, 113, 117, 120, 122, 131, 133, 135, 142, 145, 152, 157, 166, 171, 173, 179, 182], 'Socialist': [58, 62, 66, 71, 76, 85, 88, 92, 95, 102], 'Southern Democratic': [25], 'States' Rights': [110], 'Taxpayers': [147], 'Union': [93], 'Union Labor': [42], 'Whig': [7, 9, 11, 12, 16, 19]}

The keys of the dictionary are the groups (in this case, Party), and the values are the indices of rows belonging to that group. We can access a particular sub-DataFrame using get_group:

In [49]:
grouped_by_party.get_group("Socialist")
Out[49]:
Year Candidate Party Popular vote Result %
58 1904 Eugene V. Debs Socialist 402810 loss 2.985897
62 1908 Eugene V. Debs Socialist 420852 loss 2.850866
66 1912 Eugene V. Debs Socialist 901551 loss 6.004354
71 1916 Allan L. Benson Socialist 590524 loss 3.194193
76 1920 Eugene V. Debs Socialist 913693 loss 3.428282
85 1928 Norman Thomas Socialist 267478 loss 0.728623
88 1932 Norman Thomas Socialist 884885 loss 2.236211
92 1936 Norman Thomas Socialist 187910 loss 0.412876
95 1940 Norman Thomas Socialist 116599 loss 0.234237
102 1948 Norman Thomas Socialist 139569 loss 0.286312

Pivot Tables¶

Groupby with multiple columns¶

We want to build a table showing the total number of babies born of each sex in each year. One way is to group by both columns of interest:

In [50]:
babynames.groupby(["Year", "Sex"])[["Count"]].sum().head(6)
Out[50]:
Count
Year Sex
1910 F 5950
M 3213
1911 F 6602
M 3381
1912 F 9804
M 8142

pivot_table¶

In [51]:
babynames.pivot_table(
    index="Year", 
    columns="Sex", 
    values="Count", 
    aggfunc=np.sum).head(6)
Out[51]:
Sex F M
Year
1910 5950 3213
1911 6602 3381
1912 9804 8142
1913 11860 10234
1914 13815 13111
1915 18643 17192
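The pivot table above contains the same information as the earlier groupby on ["Year", "Sex"]; the Sex level has simply been moved into the columns. A sketch (not from the lecture) showing the equivalence with unstack:

# Sketch: unstacking the "Sex" level of the grouped result reproduces
# the pivot table layout shown above.
babynames.groupby(["Year", "Sex"])[["Count"]].sum().unstack().head(6)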

pivot_table with Multiple values¶

In [52]:
babynames.pivot_table(
    index="Year", 
    columns="Sex", 
    values=["Count", "Name"], 
    aggfunc=np.max).head(6)
Out[52]:
Count Name
Sex F M F M
Year
1910 295 237 Yvonne William
1911 390 214 Zelma Willis
1912 534 501 Yvonne Woodrow
1913 584 614 Zelma Yoshio
1914 773 769 Zelma Yoshio
1915 998 1033 Zita Yukio

Join Tables¶

What if we want to know the popularity of presidential candidates' first names in California in 2022? What can we do?

In [53]:
elections.head(10)
Out[53]:
Year Candidate Party Popular vote Result %
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122
1 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878
2 1828 Andrew Jackson Democratic 642806 win 56.203927
3 1828 John Quincy Adams National Republican 500897 loss 43.796073
4 1832 Andrew Jackson Democratic 702735 win 54.574789
5 1832 Henry Clay National Republican 484205 loss 37.603628
6 1832 William Wirt Anti-Masonic 100715 loss 7.821583
7 1836 Hugh Lawson White Whig 146109 loss 10.005985
8 1836 Martin Van Buren Democratic 763291 win 52.272472
9 1836 William Henry Harrison Whig 550816 loss 37.721543
In [54]:
babynames_2022 = babynames[babynames["Year"]==2022]
babynames_2022.head(10)
Out[54]:
State Sex Year Name Count
237964 CA F 2022 Leandra 10
404916 CA M 2022 Leandro 99
405892 CA M 2022 Andreas 14
235927 CA F 2022 Andrea 322
405695 CA M 2022 Deandre 18
237588 CA F 2022 Andromeda 13
238547 CA F 2022 Jeannette 7
405620 CA M 2022 Rudra 20
406178 CA M 2022 Alexandre 10
239282 CA F 2022 Kahealani 5
In [55]:
elections["First Name"] = elections["Candidate"].str.split(" ").str[0]
elections
Out[55]:
Year Candidate Party Popular vote Result % First Name
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122 Andrew
1 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878 John
2 1828 Andrew Jackson Democratic 642806 win 56.203927 Andrew
3 1828 John Quincy Adams National Republican 500897 loss 43.796073 John
4 1832 Andrew Jackson Democratic 702735 win 54.574789 Andrew
... ... ... ... ... ... ... ...
182 2024 Donald Trump Republican 77303568 win 49.808629 Donald
183 2024 Kamala Harris Democratic 75019230 loss 48.336772 Kamala
184 2024 Jill Stein Green 861155 loss 0.554864 Jill
185 2024 Robert Kennedy Independent 756383 loss 0.487357 Robert
186 2024 Chase Oliver Libertarian Party 650130 loss 0.418895 Chase

187 rows × 7 columns

Unlike in Data 8, the join operation is called merge in pandas. pandas also has a join method, but it does something slightly different; we won't talk about it in this class.

In [56]:
merged = pd.merge(left=elections, right=babynames_2022, 
                  left_on="First Name", right_on="Name")
merged
Out[56]:
Year_x Candidate Party Popular vote Result % First Name State Sex Year_y Name Count
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122 Andrew CA M 2022 Andrew 741
1 1828 Andrew Jackson Democratic 642806 win 56.203927 Andrew CA M 2022 Andrew 741
2 1832 Andrew Jackson Democratic 702735 win 54.574789 Andrew CA M 2022 Andrew 741
3 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878 John CA M 2022 John 490
4 1828 John Quincy Adams National Republican 500897 loss 43.796073 John CA M 2022 John 490
... ... ... ... ... ... ... ... ... ... ... ... ...
151 2016 Evan McMullin Independent 732273 loss 0.539546 Evan CA M 2022 Evan 488
152 2016 Hillary Clinton Democratic 65853514 loss 48.521539 Hillary CA F 2022 Hillary 10
153 2020 Joseph Biden Democratic 81268924 win 51.311515 Joseph CA M 2022 Joseph 785
154 2024 Chase Oliver Libertarian Party 650130 loss 0.418895 Chase CA M 2022 Chase 203
155 2024 Chase Oliver Libertarian Party 650130 loss 0.418895 Chase CA F 2022 Chase 6

156 rows × 12 columns
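Note that pd.merge performs an inner join by default, so candidates whose first names don't appear in the 2022 California data are dropped. A hedged sketch of keeping every candidate with a left join instead:

# Sketch (not from the lecture): how="left" keeps all candidates, filling
# NaN where a first name has no match in the 2022 baby names data.
merged_left = pd.merge(left=elections, right=babynames_2022,
                       how="left", left_on="First Name", right_on="Name")
merged_left.head()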

In [57]:
merged.sort_values("Count", ascending=False)
Out[57]:
Year_x Candidate Party Popular vote Result % First Name State Sex Year_y Name Count
73 1884 Benjamin Butler Anti-Monopoly 134294 loss 1.335838 Benjamin CA M 2022 Benjamin 1524
74 1888 Benjamin Harrison Republican 5443633 win 47.858041 Benjamin CA M 2022 Benjamin 1524
75 1892 Benjamin Harrison Republican 5176108 loss 42.984101 Benjamin CA M 2022 Benjamin 1524
51 1920 James M. Cox Democratic 9139661 loss 34.293063 James CA M 2022 James 1086
39 1844 James Polk Democratic 1339570 win 50.749477 James CA M 2022 James 1086
... ... ... ... ... ... ... ... ... ... ... ... ...
92 1912 Woodrow Wilson Democratic 6296284 win 41.933422 Woodrow CA M 2022 Woodrow 6
93 1916 Woodrow Wilson Democratic 9126868 win 49.367987 Woodrow CA M 2022 Woodrow 6
116 1964 Lyndon Johnson Democratic 43127041 win 61.344703 Lyndon CA M 2022 Lyndon 6
155 2024 Chase Oliver Libertarian Party 650130 loss 0.418895 Chase CA F 2022 Chase 6
146 2016 Darrell Castle Constitution 203091 loss 0.149640 Darrell CA M 2022 Darrell 5

156 rows × 12 columns
