Lecture 4 – Data 100, Fall 2023¶


A demonstration of advanced pandas syntax to accompany Lecture 4.

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

More on Groupby¶

Slido Exercise¶

Try to predict the results of the groupby operation shown. The answer is below the image.

Image

The top ?? will be "hi", the second ?? will be "tx", and the third ?? will be "sd".

In [2]:
ds = pd.DataFrame(dict(x=[3, 1, 4, 1, 5, 9, 2, 5, 6], 
                      y=['ak', 'tx', 'fl', 'hi', 'mi', 'ak', 'ca', 'sd', 'nc']), 
                      index=list('ABCABCACB') )
ds
Out[2]:
x y
A 3 ak
B 1 tx
C 4 fl
A 1 hi
B 5 mi
C 9 ak
A 2 ca
C 5 sd
B 6 nc
In [3]:
ds.groupby(ds.index).agg(max)
Out[3]:
x y
A 3 hi
B 6 tx
C 9 sd
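
Why those strings in particular? agg(max) is applied to each column separately, and for a column of strings the maximum is the lexicographically (i.e., alphabetically) last value. A quick check with plain Python on each group's y values:

max(['ak', 'hi', 'ca'])   # 'hi' -- group A
max(['tx', 'mi', 'nc'])   # 'tx' -- group B
max(['fl', 'ak', 'sd'])   # 'sd' -- group C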

Loading babynames Dataset¶

In [4]:
import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.tail(10)
Out[4]:
State Sex Year Name Count
407418 CA M 2022 Zach 5
407419 CA M 2022 Zadkiel 5
407420 CA M 2022 Zae 5
407421 CA M 2022 Zai 5
407422 CA M 2022 Zay 5
407423 CA M 2022 Zayvier 5
407424 CA M 2022 Zia 5
407425 CA M 2022 Zora 5
407426 CA M 2022 Zuriel 5
407427 CA M 2022 Zylo 5

Case Study: Name "Popularity"¶

In this exercise, let's find the name with sex "F" that has dropped the most in popularity since its peak usage in California. We'll start by filtering babynames to only include names corresponding to sex "F".

In [5]:
f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames
Out[5]:
State Sex Year Name Count
0 CA F 1910 Mary 295
1 CA F 1910 Helen 239
2 CA F 1910 Dorothy 220
3 CA F 1910 Margaret 163
4 CA F 1910 Frances 134
... ... ... ... ... ...
239532 CA F 2022 Zemira 5
239533 CA F 2022 Ziggy 5
239534 CA F 2022 Zimal 5
239535 CA F 2022 Zosia 5
239536 CA F 2022 Zulay 5

239537 rows × 5 columns

In [6]:
# We sort the data by year
f_babynames = f_babynames.sort_values("Year")
f_babynames
Out[6]:
State Sex Year Name Count
0 CA F 1910 Mary 295
148 CA F 1910 Merle 9
149 CA F 1910 Rosalie 9
150 CA F 1910 Rosie 9
151 CA F 1910 Teresa 9
... ... ... ... ... ...
237072 CA F 2022 Johana 22
237073 CA F 2022 Kit 22
237074 CA F 2022 Korra 22
237076 CA F 2022 Lavender 22
239536 CA F 2022 Zulay 5

239537 rows × 5 columns

To build our intuition on how to answer our research question, let's visualize the prevalence of the name "Jennifer" over time.

In [7]:
# We'll talk about how to generate plots in a later lecture
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=1000, 
                  height=400)

We'll need a mathematical definition for the change in popularity of a name in California.

Define the metric "Ratio to Peak" (RTP). We'll calculate this as the count of the name in 2022 (the most recent year for which we have data) divided by the largest count of this name in any year.
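
In other words, for a given name:

RTP = (count in 2022) / (maximum count in any single year)

An RTP of 1 means the name is at its peak in 2022, while an RTP close to 0 means its usage has fallen far below its peak.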

A demo calculation for Jennifer:

In [8]:
# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(f_babynames[f_babynames["Name"] == "Jennifer"]["Count"])
max_jenn
Out[8]:
6065
In [9]:
# Remember that we sorted f_babynames by year. 
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
curr_jenn = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
curr_jenn
Out[9]:
114
In [10]:
# Compute the RTP
curr_jenn / max_jenn
Out[10]:
0.018796372629843364

We can also write a function that produces the ratio_to_peak for a given Series. This will allow us to use .groupby to speed up our computation for all names in the dataset.

In [11]:
def ratio_to_peak(series):
    """
    Compute the RTP for a Series containing the counts per year for a single name
    """
    return series.iloc[-1] / np.max(series)
In [12]:
# Construct a Series containing our Jennifer count data
jenn_counts_ser = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]

# Then, find the RTP
ratio_to_peak(jenn_counts_ser)
Out[12]:
0.018796372629843364

Now, let's use .groupby to compute the RTPs for all names in the dataset.

You may see a warning message when running the cell below. As discussed in the lecture, pandas can't apply an aggregation function to non-numeric data (it doesn't make sense to divide "CA" by a number). We can select the numerical columns of interest directly.

In [13]:
rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table
Out[13]:
Year Count
Name
Aadhini 1.0 1.000000
Aadhira 1.0 0.500000
Aadhya 1.0 0.660000
Aadya 1.0 0.586207
Aahana 1.0 0.269231
... ... ...
Zyanya 1.0 0.466667
Zyla 1.0 1.000000
Zylah 1.0 1.000000
Zyra 1.0 1.000000
Zyrah 1.0 0.833333

13782 rows × 2 columns

In [14]:
# Results in a TypeError
# rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
# rtp_table

This is the pandas equivalent of .group from Data 8. If we wanted to achieve this same result using the datascience library, we would write:

f_babynames.group("Name", ratio_to_peak)

Slido Exercise¶

Is there a row where Year is not equal to 1?

In [15]:
# Unique values in the Year column
rtp_table["Year"].unique()
Out[15]:
array([1.])
In [16]:
# Dropping the Year column
rtp_table.drop("Year", axis="columns", inplace=True)
rtp_table
Out[16]:
Count
Name
Aadhini 1.000000
Aadhira 0.500000
Aadhya 0.660000
Aadya 0.586207
Aahana 0.269231
... ...
Zyanya 0.466667
Zyla 1.000000
Zylah 1.000000
Zyra 1.000000
Zyrah 0.833333

13782 rows × 1 columns

In [17]:
# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns = {"Count": "Count RTP"})
rtp_table
Out[17]:
Count RTP
Name
Aadhini 1.000000
Aadhira 0.500000
Aadhya 0.660000
Aadya 0.586207
Aahana 0.269231
... ...
Zyanya 0.466667
Zyla 1.000000
Zylah 1.000000
Zyra 1.000000
Zyrah 0.833333

13782 rows × 1 columns

In [18]:
# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")
Out[18]:
Count RTP
Name
Debra 0.001260
Debbie 0.002815
Carol 0.003180
Tammy 0.003249
Susan 0.003305
... ...
Fidelia 1.000000
Naveyah 1.000000
Finlee 1.000000
Roseline 1.000000
Aadhini 1.000000

13782 rows × 1 columns

We can visualize the decrease in the popularity of the name "Debra":

In [19]:
def plot_name(*names):
    fig = px.line(f_babynames[f_babynames["Name"].isin(names)], 
                  x = "Year", y = "Count", color="Name",
                  title=f"Popularity for: {names}")
    fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=1000, 
                  height=400)
    return fig

plot_name("Debra")
In [20]:
# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10
Out[20]:
Index(['Debra', 'Debbie', 'Carol', 'Tammy', 'Susan', 'Cheryl', 'Shannon',
       'Tina', 'Michele', 'Terri'],
      dtype='object', name='Name')
In [21]:
plot_name(*top10)

For fun, try plotting your name or your friends' names.
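
For example, reusing two names that already appear in this notebook:

plot_name("Jennifer", "Debra")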

Slido Exercise¶

Given the babynames dataset, write code to compute the total number of babies with each name in California.

In [22]:
puzzle1 = babynames.groupby("Name")[["Count"]].sum()
# OR
# puzzle1 = babynames.groupby("Name")[["Count"]].agg(sum)

puzzle1
Out[22]:
Count
Name
Aadan 18
Aadarsh 6
Aaden 647
Aadhav 27
Aadhini 6
... ...
Zymir 5
Zyon 133
Zyra 103
Zyrah 21
Zyrus 5

20437 rows × 1 columns

Slido Exercise¶

Write code to compute the total number of babies born each year in California.

In [23]:
# puzzle2 = babynames.groupby("Year")[["Count"]].sum()
# OR
puzzle2 = babynames.groupby("Year").sum(numeric_only=True)

puzzle2
Out[23]:
Count
Year
1910 9163
1911 9983
1912 17946
1913 22094
1914 26926
... ...
2018 395436
2019 386996
2020 362882
2021 362582
2022 360023

113 rows × 1 columns

In [24]:
# Plotting baby counts per year
fig = px.line(puzzle2, y = "Count")
fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=700, 
                  height=400)

groupby.size and groupby.count()¶

In [25]:
df = pd.DataFrame({'letter':['A', 'A', 'B', 'C', 'C', 'C'], 
                   'num':[1, 2, 3, 4, np.NaN, 4], 
                   'state':[np.NaN, 'tx', 'fl', 'hi', np.NaN, 'ak']})
df
Out[25]:
letter num state
0 A 1.0 NaN
1 A 2.0 tx
2 B 3.0 fl
3 C 4.0 hi
4 C NaN NaN
5 C 4.0 ak

groupby.size() returns a Series, indexed by the letters that we grouped by, with values denoting the number of rows in each group/sub-DataFrame. It does not care about missing (NaN) values.

In [26]:
df.groupby("letter").size()
Out[26]:
letter
A    2
B    1
C    3
dtype: int64

groupby.count() returns a DataFrame, indexed by the letters that we grouped by. Each column represents the number of non-missing values for that letter.

In [27]:
df.groupby("letter").count()
Out[27]:
num state
letter
A 2 1
B 1 1
C 2 2

You might recall the value_counts() function we talked about last week. What's the difference?

In [28]:
df["letter"].value_counts()
Out[28]:
letter
C    3
A    2
B    1
Name: count, dtype: int64

It turns out that value_counts() does something similar to groupby.size(), except that it also sorts the resulting Series in descending order.
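
A quick way to confirm this with the df above: sorting the groupby.size() result in descending order reproduces the same counts as value_counts() (up to the name attached to the resulting Series):

df.groupby("letter").size().sort_values(ascending=False)  # C: 3, A: 2, B: 1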

Filtering by Group¶

In [29]:
# Let's read the elections dataset
elections = pd.read_csv("data/elections.csv")
elections.sample(5)
Out[29]:
Year Candidate Party Popular vote Result %
129 1980 Jimmy Carter Democratic 35480115 loss 41.132848
100 1948 Harry Truman Democratic 24179347 win 49.601536
102 1948 Norman Thomas Socialist 139569 loss 0.286312
99 1948 Claude A. Watson Prohibition 103708 loss 0.212747
42 1888 Alson Streeter Union Labor 146602 loss 1.288861

Let's keep only the election years where the maximum vote share (%) is less than 45%.

In [30]:
elections.groupby("Year").filter(lambda sf: sf["%"].max() < 45).head(10)
Out[30]:
Year Candidate Party Popular vote Result %
23 1860 Abraham Lincoln Republican 1855993 win 39.699408
24 1860 John Bell Constitutional Union 590901 loss 12.639283
25 1860 John C. Breckinridge Southern Democratic 848019 loss 18.138998
26 1860 Stephen A. Douglas Northern Democratic 1380202 loss 29.522311
66 1912 Eugene V. Debs Socialist 901551 loss 6.004354
67 1912 Eugene W. Chafin Prohibition 208156 loss 1.386325
68 1912 Theodore Roosevelt Progressive 4122721 loss 27.457433
69 1912 William Taft Republican 3486242 loss 23.218466
70 1912 Woodrow Wilson Democratic 6296284 win 41.933422
115 1968 George Wallace American Independent 9901118 loss 13.571218

groupby Puzzle¶

Assume that we want to know the best election by each party.

Attempt #1¶

We have to be careful when using aggregation functions. For example, the code below might be misinterpreted to say that Woodrow Wilson successfully ran for election in 2020. Why is this happening? Because agg(max) computes the maximum of each column independently within each group, the values in a single row of the output can come from different elections.

In [31]:
elections.groupby("Party").agg(max).head(10)
Out[31]:
Year Candidate Popular vote Result %
Party
American 1976 Thomas J. Anderson 873053 loss 21.554001
American Independent 1976 Lester Maddox 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182
Communist 1932 William Z. Foster 103307 loss 0.261069
Constitution 2016 Michael Peroutka 203091 loss 0.152398
Constitutional Union 1860 John Bell 590901 loss 12.639283
Democratic 2020 Woodrow Wilson 81268924 win 61.344703
Democratic-Republican 1824 John Quincy Adams 151271 win 57.210122

Attempt #2¶

Next, we'll write code that properly returns the best result by each party. That is, each row should show the Year, Candidate, Popular Vote, Result, and % for the election in which that party saw its best results (rather than mixing them as in the example above). Here's what the first rows of the correct output should look like:

parties.png

In [32]:
elections_sorted_by_percent = elections.sort_values("%", ascending=False)
elections_sorted_by_percent.head(8)
Out[32]:
Year Candidate Party Popular vote Result %
114 1964 Lyndon Johnson Democratic 43127041 win 61.344703
91 1936 Franklin Roosevelt Democratic 27752648 win 60.978107
120 1972 Richard Nixon Republican 47168710 win 60.907806
79 1920 Warren Harding Republican 16144093 win 60.574501
133 1984 Ronald Reagan Republican 54455472 win 59.023326
84 1928 Herbert Hoover Republican 21427123 win 58.368524
86 1932 Franklin Roosevelt Democratic 22821277 win 57.672125
109 1956 Dwight Eisenhower Republican 35579180 win 57.650654
In [33]:
elections_sorted_by_percent.groupby("Party").first()
Out[33]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182
Communist 1932 William Z. Foster 103307 loss 0.261069
Constitution 2008 Chuck Baldwin 199750 loss 0.152398
Constitutional Union 1860 John Bell 590901 loss 12.639283
Democratic 1964 Lyndon Johnson 43127041 win 61.344703
Democratic-Republican 1824 Andrew Jackson 151271 loss 57.210122
Dixiecrat 1948 Strom Thurmond 1175930 loss 2.412304
Farmer–Labor 1920 Parley P. Christensen 265398 loss 0.995804
Free Soil 1848 Martin Van Buren 291501 loss 10.138474
Green 2000 Ralph Nader 2882955 loss 2.741176
Greenback 1880 James B. Weaver 308649 loss 3.352344
Independent 1992 Ross Perot 19743821 loss 18.956298
Liberal Republican 1872 Horace Greeley 2834761 loss 44.071406
Libertarian 2016 Gary Johnson 4489235 loss 3.307714
National Democratic 1896 John M. Palmer 134645 loss 0.969566
National Republican 1828 John Quincy Adams 500897 loss 43.796073
National Union 1864 Abraham Lincoln 2211317 win 54.951512
Natural Law 1996 John Hagelin 113670 loss 0.118219
New Alliance 1988 Lenora Fulani 217221 loss 0.237804
Northern Democratic 1860 Stephen A. Douglas 1380202 loss 29.522311
Populist 1892 James B. Weaver 1041028 loss 8.645038
Progressive 1912 Theodore Roosevelt 4122721 loss 27.457433
Prohibition 1892 John Bidwell 270879 loss 2.249468
Reform 1996 Ross Perot 8085294 loss 8.408844
Republican 1972 Richard Nixon 47168710 win 60.907806
Socialist 1912 Eugene V. Debs 901551 loss 6.004354
Southern Democratic 1860 John C. Breckinridge 848019 loss 18.138998
States' Rights 1956 T. Coleman Andrews 107929 loss 0.174883
Taxpayers 1996 Howard Phillips 184656 loss 0.192045
Union 1936 William Lemke 892378 loss 1.960733
Union Labor 1888 Alson Streeter 146602 loss 1.288861
Whig 1840 William Henry Harrison 1275583 win 53.051213

Alternative Solutions¶

You'll soon discover that with pandas' rich tool set, there's typically more than one way to get to the same answer. Each approach has different tradeoffs in terms of readability, performance, memory consumption, complexity, and more. It will take some experience for you to develop a sense of which approach is better for each problem, but in general you should try to envision at least one alternative solution to a given problem, especially if you find your current solution particularly convoluted or hard to read.

Here are a couple of other ways of obtaining the same result (in each case, we only show the top part with head()). The first approach uses groupby but finds the location of the maximum value via the idxmax() method (look up its documentation!). We then index and sort by Party to match the requested formatting:

In [34]:
elections.groupby("Party")["%"].idxmax()
Out[34]:
Party
American                  22
American Independent     115
Anti-Masonic               6
Anti-Monopoly             38
Citizens                 127
Communist                 89
Constitution             164
Constitutional Union      24
Democratic               114
Democratic-Republican      0
Dixiecrat                103
Farmer–Labor              78
Free Soil                 15
Green                    155
Greenback                 35
Independent              143
Liberal Republican        31
Libertarian              175
National Democratic       50
National Republican        3
National Union            27
Natural Law              148
New Alliance             136
Northern Democratic       26
Populist                  48
Progressive               68
Prohibition               49
Reform                   150
Republican               120
Socialist                 66
Southern Democratic       25
States' Rights           110
Taxpayers                147
Union                     93
Union Labor               42
Whig                      11
Name: %, dtype: int64
In [35]:
# This is the computational part
best_per_party = elections.loc[elections.groupby("Party")["%"].idxmax()]

# This indexes by Party to match the formatting above
best_per_party.set_index('Party').sort_index().head() 
Out[35]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182

And this one doesn't even use groupby! This approach instead uses the drop_duplicates method to keep only the last occurrence of each party after having sorted by "%"; since the sort is in ascending order, that last occurrence is the party's best performance. Again, the second line is purely formatting:

In [36]:
best_per_party2 = elections.sort_values("%").drop_duplicates(["Party"], keep="last")
best_per_party2.set_index("Party").sort_index().head()  # Formatting
Out[36]:
Year Candidate Popular vote Result %
Party
American 1856 Millard Fillmore 873053 loss 21.554001
American Independent 1968 George Wallace 9901118 loss 13.571218
Anti-Masonic 1832 William Wirt 100715 loss 7.821583
Anti-Monopoly 1884 Benjamin Butler 134294 loss 1.335838
Citizens 1980 Barry Commoner 233052 loss 0.270182

Challenge: See if you can find yet another approach that still gives the same answer.
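
One possible sketch (just one option among several): use transform to broadcast each party's maximum "%" back onto all of its rows, then keep the rows that attain that maximum. Unlike the idxmax approach, this would keep multiple rows for a party in the event of an exact tie:

# Keep the rows whose "%" equals their party's best "%"
best_per_party3 = elections[elections["%"] == elections.groupby("Party")["%"].transform("max")]
best_per_party3.set_index("Party").sort_index().head()  # Formatting, as before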

DataFrameGroupBy Objects¶

The result of groupby is not a DataFrame or a list of DataFrames. It is instead a special type called a DataFrameGroupBy.

In [37]:
grouped_by_party = elections.groupby("Party")
type(grouped_by_party)
Out[37]:
pandas.core.groupby.generic.DataFrameGroupBy

GroupBy objects are structured like dictionaries. In fact, we can inspect the underlying dictionary with the following code:

In [38]:
grouped_by_party.groups
Out[38]:
{'American': [22, 126], 'American Independent': [115, 119, 124], 'Anti-Masonic': [6], 'Anti-Monopoly': [38], 'Citizens': [127], 'Communist': [89], 'Constitution': [160, 164, 172], 'Constitutional Union': [24], 'Democratic': [2, 4, 8, 10, 13, 14, 17, 20, 28, 29, 34, 37, 39, 45, 47, 52, 55, 57, 64, 70, 74, 77, 81, 83, 86, 91, 94, 97, 100, 105, 108, 111, 114, 116, 118, 123, 129, 134, 137, 140, 144, 151, 158, 162, 168, 176, 178], 'Democratic-Republican': [0, 1], 'Dixiecrat': [103], 'Farmer–Labor': [78], 'Free Soil': [15, 18], 'Green': [149, 155, 156, 165, 170, 177, 181], 'Greenback': [35], 'Independent': [121, 130, 143, 161, 167, 174], 'Liberal Republican': [31], 'Libertarian': [125, 128, 132, 138, 139, 146, 153, 159, 163, 169, 175, 180], 'National Democratic': [50], 'National Republican': [3, 5], 'National Union': [27], 'Natural Law': [148], 'New Alliance': [136], 'Northern Democratic': [26], 'Populist': [48, 61, 141], 'Progressive': [68, 82, 101, 107], 'Prohibition': [41, 44, 49, 51, 54, 59, 63, 67, 73, 75, 99], 'Reform': [150, 154], 'Republican': [21, 23, 30, 32, 33, 36, 40, 43, 46, 53, 56, 60, 65, 69, 72, 79, 80, 84, 87, 90, 96, 98, 104, 106, 109, 112, 113, 117, 120, 122, 131, 133, 135, 142, 145, 152, 157, 166, 171, 173, 179], 'Socialist': [58, 62, 66, 71, 76, 85, 88, 92, 95, 102], 'Southern Democratic': [25], 'States' Rights': [110], 'Taxpayers': [147], 'Union': [93], 'Union Labor': [42], 'Whig': [7, 9, 11, 12, 16, 19]}

The keys of the dictionary are the groups (in this case, Party), and the values are the indices of rows belonging to that group. We can access a particular sub-DataFrame using get_group:

In [39]:
grouped_by_party.get_group("Socialist")
Out[39]:
Year Candidate Party Popular vote Result %
58 1904 Eugene V. Debs Socialist 402810 loss 2.985897
62 1908 Eugene V. Debs Socialist 420852 loss 2.850866
66 1912 Eugene V. Debs Socialist 901551 loss 6.004354
71 1916 Allan L. Benson Socialist 590524 loss 3.194193
76 1920 Eugene V. Debs Socialist 913693 loss 3.428282
85 1928 Norman Thomas Socialist 267478 loss 0.728623
88 1932 Norman Thomas Socialist 884885 loss 2.236211
92 1936 Norman Thomas Socialist 187910 loss 0.412876
95 1940 Norman Thomas Socialist 116599 loss 0.234237
102 1948 Norman Thomas Socialist 139569 loss 0.286312

Pivot Tables¶

Groupby with multiple columns¶

We want to build a table showing the total number of babies born of each sex in each year. One way is to groupby using both columns of interest:

In [40]:
babynames.groupby(["Year", "Sex"])[["Count"]].sum().head(6)
Out[40]:
Count
Year Sex
1910 F 5950
M 3213
1911 F 6602
M 3381
1912 F 9804
M 8142

pivot_table¶

In [41]:
babynames.pivot_table(
    index = "Year", 
    columns = "Sex", 
    values = "Count", 
    aggfunc = np.sum).head(6)
Out[41]:
Sex F M
Year
1910 5950 3213
1911 6602 3381
1912 9804 8142
1913 11860 10234
1914 13815 13111
1915 18643 17192

pivot_picture.png

pivot_table with Multiple values¶

In [42]:
babynames.pivot_table(
    index = "Year", 
    columns = "Sex", 
    values = ["Count", "Name"], 
    aggfunc = np.max).head(6)
Out[42]:
Count Name
Sex F M F M
Year
1910 295 237 Yvonne William
1911 390 214 Zelma Willis
1912 534 501 Yvonne Woodrow
1913 584 614 Zelma Yoshio
1914 773 769 Zelma Yoshio
1915 998 1033 Zita Yukio

Join Tables¶

What if we want to know the popularity of presidential candidates' first names in California in 2022? What can we do?

In [43]:
elections.head(10)
Out[43]:
Year Candidate Party Popular vote Result %
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122
1 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878
2 1828 Andrew Jackson Democratic 642806 win 56.203927
3 1828 John Quincy Adams National Republican 500897 loss 43.796073
4 1832 Andrew Jackson Democratic 702735 win 54.574789
5 1832 Henry Clay National Republican 484205 loss 37.603628
6 1832 William Wirt Anti-Masonic 100715 loss 7.821583
7 1836 Hugh Lawson White Whig 146109 loss 10.005985
8 1836 Martin Van Buren Democratic 763291 win 52.272472
9 1836 William Henry Harrison Whig 550816 loss 37.721543
In [44]:
babynames_2022 = babynames[babynames["Year"] == 2022]
babynames_2022.head(10)
Out[44]:
State Sex Year Name Count
235835 CA F 2022 Olivia 2178
235836 CA F 2022 Emma 2080
235837 CA F 2022 Camila 2046
235838 CA F 2022 Mia 1882
235839 CA F 2022 Sophia 1762
235840 CA F 2022 Isabella 1733
235841 CA F 2022 Luna 1516
235842 CA F 2022 Sofia 1307
235843 CA F 2022 Amelia 1289
235844 CA F 2022 Gianna 1107
In [45]:
elections["First Name"] = elections["Candidate"].str.split(" ").str[0]
elections
Out[45]:
Year Candidate Party Popular vote Result % First Name
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122 Andrew
1 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878 John
2 1828 Andrew Jackson Democratic 642806 win 56.203927 Andrew
3 1828 John Quincy Adams National Republican 500897 loss 43.796073 John
4 1832 Andrew Jackson Democratic 702735 win 54.574789 Andrew
... ... ... ... ... ... ... ...
177 2016 Jill Stein Green 1457226 loss 1.073699 Jill
178 2020 Joseph Biden Democratic 81268924 win 51.311515 Joseph
179 2020 Donald Trump Republican 74216154 loss 46.858542 Donald
180 2020 Jo Jorgensen Libertarian 1865724 loss 1.177979 Jo
181 2020 Howard Hawkins Green 405035 loss 0.255731 Howard

182 rows × 7 columns

Unlike in Data 8, the join operation is called merge in pandas. (pandas does have a join method, but it does something slightly different; we won't talk about it in this class.)

In [46]:
merged = pd.merge(left = elections, right = babynames_2022, 
                  left_on = "First Name", right_on = "Name")
merged
Out[46]:
Year_x Candidate Party Popular vote Result % First Name State Sex Year_y Name Count
0 1824 Andrew Jackson Democratic-Republican 151271 loss 57.210122 Andrew CA M 2022 Andrew 741
1 1828 Andrew Jackson Democratic 642806 win 56.203927 Andrew CA M 2022 Andrew 741
2 1832 Andrew Jackson Democratic 702735 win 54.574789 Andrew CA M 2022 Andrew 741
3 1824 John Quincy Adams Democratic-Republican 113142 win 42.789878 John CA M 2022 John 490
4 1828 John Quincy Adams National Republican 500897 loss 43.796073 John CA M 2022 John 490
... ... ... ... ... ... ... ... ... ... ... ... ...
147 2020 Donald Trump Republican 74216154 loss 46.858542 Donald CA M 2022 Donald 33
148 2016 Evan McMullin Independent 732273 loss 0.539546 Evan CA F 2022 Evan 11
149 2016 Evan McMullin Independent 732273 loss 0.539546 Evan CA M 2022 Evan 488
150 2016 Hillary Clinton Democratic 65853514 loss 48.521539 Hillary CA F 2022 Hillary 10
151 2020 Joseph Biden Democratic 81268924 win 51.311515 Joseph CA M 2022 Joseph 785

152 rows × 12 columns

In [47]:
merged.sort_values("Count", ascending=False)
Out[47]:
Year_x Candidate Party Popular vote Result % First Name State Sex Year_y Name Count
75 1892 Benjamin Harrison Republican 5176108 loss 42.984101 Benjamin CA M 2022 Benjamin 1524
73 1884 Benjamin Butler Anti-Monopoly 134294 loss 1.335838 Benjamin CA M 2022 Benjamin 1524
74 1888 Benjamin Harrison Republican 5443633 win 47.858041 Benjamin CA M 2022 Benjamin 1524
46 1880 James Garfield Republican 4453337 win 48.369234 James CA M 2022 James 1086
44 1880 James B. Weaver Greenback 308649 loss 3.352344 James CA M 2022 James 1086
... ... ... ... ... ... ... ... ... ... ... ... ...
115 1964 Lyndon Johnson Democratic 43127041 win 61.344703 Lyndon CA M 2022 Lyndon 6
92 1912 Woodrow Wilson Democratic 6296284 win 41.933422 Woodrow CA M 2022 Woodrow 6
93 1916 Woodrow Wilson Democratic 9126868 win 49.367987 Woodrow CA M 2022 Woodrow 6
76 1888 Clinton B. Fisk Prohibition 249819 loss 2.196299 Clinton CA M 2022 Clinton 6
145 2016 Darrell Castle Constitution 203091 loss 0.149640 Darrell CA M 2022 Darrell 5

152 rows × 12 columns