import numpy as np
import pandas as pd
import plotly.express as px


# Read the elections dataset and preview five randomly chosen rows.
elections = pd.read_csv("data/elections.csv")
elections.sample(5)


# Naive first attempt: take the maximum of every column within each party.
# Each column is maximized independently, so one row of the result can combine
# values that come from different candidates.
# The string alias "max" is used instead of the builtin `max`, whose use as an
# aggregation argument is deprecated in recent pandas releases.
elections.groupby("Party").agg("max").head(10)


# Small toy DataFrame whose index labels repeat, used to illustrate grouping
# by the index.
ds = pd.DataFrame(
    {
        "x": [3, 1, 4, 1, 5, 9, 2, 5, 6],
        "y": ["ak", "tx", "fl", "hi", "mi", "ak", "ca", "sd", "nc"],
    },
    index=list("ABCABCACB"),
)
ds


# Group the rows that share an index label and take the per-column maximum.
# The string alias "max" replaces the builtin `max`, whose use as an
# aggregation argument is deprecated in recent pandas releases.
ds.groupby(ds.index).agg("max")


# Order the rows from the highest to the lowest value of "%".
elections_sorted_by_percent = elections.sort_values(by="%", ascending=False)
elections_sorted_by_percent.head(n=5)


# With the rows sorted by descending "%", the first row inside each party
# group is that party's best showing. Here it is picked out column by column
# with an explicit positional lookup.
elections_sorted_by_percent = elections.sort_values(by="%", ascending=False)
(
    elections_sorted_by_percent
    .groupby("Party")
    .agg(lambda column: column.iloc[0])
    .head(n=9)
)


# Same idea, using the built-in first() aggregation instead of a lambda.
elections_sorted_by_percent = elections.sort_values(by="%", ascending=False)
(
    elections_sorted_by_percent
    .groupby("Party")
    .first()
    .head(n=9)
)


elections.head()


# For every party, the row label at which "%" reaches its maximum.
elections.groupby("Party")["%"].idxmax()

Party
American                  22
American Independent     115
Anti-Masonic               6
Anti-Monopoly             38
Citizens                 127
Communist                 89
Constitution             164
Constitutional Union      24
Democratic               114
Democratic-Republican      0
Dixiecrat                103
Farmer–Labor              78
Free Soil                 15
Green                    155
Greenback                 35
Independent              143
Liberal Republican        31
Libertarian              175
National Democratic       50
National Republican        3
National Union            27
Natural Law              148
New Alliance             136
Northern Democratic       26
Populist                  48
Progressive               68
Prohibition               49
Reform                   150
Republican               120
Socialist                 66
Southern Democratic       25
States' Rights           110
Taxpayers                147
Union                     93
Union Labor               42
Whig                      11
Name: %, dtype: int64


# Computation: idxmax() gives, for each party, the row label of its largest
# "%" value; .loc then pulls those complete rows out of the original table.
best_per_party = elections.loc[
    elections.groupby("Party")["%"].idxmax()
]

# Presentation only: index by Party and sort so the layout matches the
# earlier per-party tables.
(
    best_per_party
    .set_index('Party')
    .sort_index()
    .head()
)


# Alternative approach: sort ascending by "%" and keep, for each party, only
# the last (i.e. largest-"%") row.
best_per_party2 = (
    elections
    .sort_values(by="%")
    .drop_duplicates(subset=["Party"], keep="last")
)
best_per_party2.set_index("Party").sort_index().head()  # Formatting


elections.head()


# Keep the result of groupby("Year") in a variable so it can be inspected;
# type() shows that it is a DataFrameGroupBy object rather than a DataFrame.
grouped_by_year = elections.groupby("Year")
type(grouped_by_year)

pandas.core.groupby.generic.DataFrameGroupBy


# .groups maps each group key (here, a Year) to the labels of the rows in
# that group; its keys are therefore the distinct years.
grouped_by_year.groups.keys()

dict_keys([1824, 1828, 1832, 1836, 1840, 1844, 1848, 1852, 1856, 1860, 1864, 1868, 1872, 1876, 1880, 1884, 1888, 1892, 1896, 1900, 1904, 1908, 1912, 1916, 1920, 1924, 1928, 1932, 1936, 1940, 1944, 1948, 1952, 1956, 1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020])


# Row labels of the rows that belong to the 2020 group.
grouped_by_year.groups[2020]

Int64Index([178, 179, 180, 181], dtype='int64')


# get_group returns the rows of one group (here Year == 2020) as a DataFrame.
grouped_by_year.get_group(2020)


# Evaluating a groupby on its own only displays the DataFrameGroupBy object's
# repr, not the grouped rows.
elections.groupby("Party")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff1b381ceb0>


# size returns a Series, indexed by group key (Party), giving the number of
# rows in each group; only the first 15 parties are shown.
elections.groupby("Party").size().head(15)

Party
American                  2
American Independent      3
Anti-Masonic              1
Anti-Monopoly             1
Citizens                  1
Communist                 1
Constitution              3
Constitutional Union      1
Democratic               47
Democratic-Republican     2
Dixiecrat                 1
Farmer–Labor              1
Free Soil                 2
Green                     7
Greenback                 1
dtype: int64


# filter calls the given function once per group and returns a copy of the
# original DataFrame containing every row of the groups for which it is True.
#
# Note: the decision is made per GROUP, not per ROW. Here a year is kept only
# if the largest "%" among its rows is below 45.
(
    elections
    .groupby("Year")
    .filter(lambda year_rows: year_rows["%"].max() < 45)
    .set_index('Year')
    .sort_index()
)


# Iterating over a groupby yields (group key, sub-DataFrame) pairs, which lets
# us peek inside individual groups. Note that this loop groups by Party.
# `display` is not defined in this file -- presumably the notebook/IPython
# display helper; confirm when running outside a notebook.
for i, (n, g) in enumerate(elections.groupby("Party")):
    # n is the group key (party name); g holds the rows of that group
    print(n)
    display(g)
    # Stop once five groups (i = 0..4) have been shown
    if i>3: break

American

American Independent

Anti-Masonic

Anti-Monopoly

Citizens


import urllib.request
import os.path  # also binds the top-level `os` module used below
import shutil

# Download data from the web directly
data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "data/babynames.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    # Make sure the destination directory exists before writing into it.
    target_dir = os.path.dirname(local_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Download under a temporary name and rename only once the transfer has
    # finished. Writing straight to local_filename would leave a truncated zip
    # behind if the download were interrupted, and the existence check above
    # would then reuse that broken file on every later run.
    partial_filename = local_filename + ".part"
    with urllib.request.urlopen(data_url) as resp, open(partial_filename, 'wb') as f:
        # Stream in chunks instead of holding the whole archive in memory.
        shutil.copyfileobj(resp, f)
    os.replace(partial_filename, local_filename)

        
# Load data without unzipping the file
import zipfile


def extract_year_from_filename(fn):
    # The year is read from characters 3..6 of the member name
    # (presumably names look like "yobYYYY.txt" -- confirm against the archive).
    return int(fn[3:7])


babynames = []  # one DataFrame per data file; concatenated after the loop
with zipfile.ZipFile(local_filename, "r") as zf:
    # Only archive members whose name ends in "txt" are treated as data files.
    data_files = [f for f in zf.filelist if f.filename.endswith("txt")]
    for f in data_files:
        with zf.open(f) as fp:
            # The files carry no header row, so the column names are supplied here.
            df = pd.read_csv(fp, names=["Name", "Sex", "Count"])
        year = extract_year_from_filename(f.filename)
        df["Year"] = year
        babynames.append(df)
babynames = pd.concat(babynames)


babynames.head() # show the first few rows


# Total Count recorded in each year.
# "Count" is selected explicitly: since pandas 2.0, groupby(...).sum() no
# longer drops non-numeric columns by default, so summing the whole frame
# would also try to "sum" (concatenate) the Name and Sex strings.
babynames.groupby("Year")[["Count"]].sum()


babynames.head(10)


# Total Count for every (Year, Sex) pair.
# "Count" is selected explicitly and the built-in groupby sum() is used:
# passing the Python builtin `sum` to agg is deprecated in recent pandas, and
# aggregating the whole frame would also try to combine the Name strings.
babynames.groupby(["Year", "Sex"])[["Count"]].sum().head(6)


# Same totals with the grouping keys swapped, so Sex becomes the outer level
# of the resulting index and Year the inner one.
babynames.groupby(["Sex", "Year"])[["Count"]].sum().head(6)


babynames.head(5)


# Pivot so that each Year is a row and each Sex a column, holding summed Counts.
# aggfunc uses the string alias "sum": passing the np.sum callable is
# deprecated in recent pandas releases.
babynames_pivot = babynames.pivot_table(
    index="Year",     # the rows (turned into index)
    columns="Sex",    # the column values
    values=["Count"], # the field(s) to be processed in each group
    aggfunc="sum",    # group operation
)
babynames_pivot.head(6)


# Pivot on two value fields at once: for each Year and Sex, the largest Count
# and the largest Name (string comparison).
# aggfunc uses the string alias "max": passing the np.max callable is
# deprecated in recent pandas releases.
babynames_pivot = babynames.pivot_table(
    index="Year",     # the rows (turned into index)
    columns="Sex",    # the column values
    values=["Count", "Name"],  # the fields aggregated in each group
    aggfunc="max",    # group operation
)
babynames_pivot.head(6)


elections


# Keep only the baby-name rows whose Year is 2020.
is_2020 = babynames["Year"] == 2020
babynames_2020 = babynames.loc[is_2020]
babynames_2020


# A candidate's first name is taken to be the first whitespace-separated
# token of the "Candidate" string.
elections["First Name"] = (
    elections["Candidate"]
    .str.split()
    .str.get(0)
)


elections


# Join the two tables: each election row is paired with every 2020 baby-name
# row whose "Name" equals the candidate's "First Name" (merge's default is an
# inner join, so rows without a match are dropped).
merged = elections.merge(
    babynames_2020,
    left_on="First Name",
    right_on="Name",
)
merged


# Order the joined rows by the 2020 name count.
merged.sort_values(by="Count")

	Year	Candidate	Party	Popular vote	Result	%
144	1996	Bill Clinton	Democratic	47400125	win	49.296938
152	2000	George W. Bush	Republican	50456002	win	47.974666
9	1836	William Henry Harrison	Whig	550816	loss	37.721543
35	1880	James B. Weaver	Greenback	308649	loss	3.352344
172	2016	Darrell Castle	Constitution	203091	loss	0.149640

	Year	Candidate	Popular vote	Result	%
Party
American	1976	Thomas J. Anderson	873053	loss	21.554001
American Independent	1976	Lester Maddox	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2016	Michael Peroutka	203091	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	2020	Woodrow Wilson	81268924	win	61.344703
Democratic-Republican	1824	John Quincy Adams	151271	win	57.210122

	Year	Candidate	Party	Popular vote	Result	%
114	1964	Lyndon Johnson	Democratic	43127041	win	61.344703
91	1936	Franklin Roosevelt	Democratic	27752648	win	60.978107
120	1972	Richard Nixon	Republican	47168710	win	60.907806
79	1920	Warren Harding	Republican	16144093	win	60.574501
133	1984	Ronald Reagan	Republican	54455472	win	59.023326

	Year	Candidate	Popular vote	Result	%
Party
American	1856	Millard Fillmore	873053	loss	21.554001
American Independent	1968	George Wallace	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2008	Chuck Baldwin	199750	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	1964	Lyndon Johnson	43127041	win	61.344703

	Year	Candidate	Popular vote	Result	%
Party
American	1856	Millard Fillmore	873053	loss	21.554001
American Independent	1968	George Wallace	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2008	Chuck Baldwin	199750	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	1964	Lyndon Johnson	43127041	win	61.344703

Introduction to Pandas, Part 3¶

Some Additional Groupby Puzzles¶

groupby Puzzle #1¶

Quick Subpuzzle¶

Completing groupby puzzle #1¶

Other DataFrameGroupBy Features¶

groupby.size()¶

groupby.filter()¶

Puzzle 2: Finding the number of babies born in each year of each sex.¶

Dataset - US Babynames¶

With `groupby`¶

With Pivot Tables¶

Merging Tables¶

	Year	Candidate	Party	Popular vote	Result	%
0	1824	Andrew Jackson	Democratic-Republican	151271	loss	57.210122
1	1824	John Quincy Adams	Democratic-Republican	113142	win	42.789878
2	1828	Andrew Jackson	Democratic	642806	win	56.203927
3	1828	John Quincy Adams	National Republican	500897	loss	43.796073
4	1832	Andrew Jackson	Democratic	702735	win	54.574789

	Year	Candidate	Party	Popular vote	Result	%
178	2020	Joseph Biden	Democratic	81268924	win	51.311515
179	2020	Donald Trump	Republican	74216154	loss	46.858542
180	2020	Jo Jorgensen	Libertarian	1865724	loss	1.177979
181	2020	Howard Hawkins	Green	405035	loss	0.255731

	Candidate	Party	Popular vote	Result	%
Year
1860	Abraham Lincoln	Republican	1855993	win	39.699408
1860	John Bell	Constitutional Union	590901	loss	12.639283
1860	John C. Breckinridge	Southern Democratic	848019	loss	18.138998
1860	Stephen A. Douglas	Northern Democratic	1380202	loss	29.522311
1912	Eugene V. Debs	Socialist	901551	loss	6.004354
1912	Eugene W. Chafin	Prohibition	208156	loss	1.386325
1912	Theodore Roosevelt	Progressive	4122721	loss	27.457433
1912	William Taft	Republican	3486242	loss	23.218466
1912	Woodrow Wilson	Democratic	6296284	win	41.933422
1968	George Wallace	American Independent	9901118	loss	13.571218
1968	Hubert Humphrey	Democratic	31271839	loss	42.863537
1968	Richard Nixon	Republican	31783783	win	43.565246
1992	Andre Marrou	Libertarian	290087	loss	0.278516
1992	Bill Clinton	Democratic	44909806	win	43.118485
1992	Bo Gritz	Populist	106152	loss	0.101918
1992	George H. W. Bush	Republican	39104550	loss	37.544784
1992	Ross Perot	Independent	19743821	loss	18.956298

	Name	Sex	Count	Year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

	Count
Year
1880	201484
1881	192690
1882	221533
1883	216944
1884	243461
...	...
2017	3570234
2018	3508655
2019	3461444
2020	3327419
2021	3361375

		Count
Year	Sex
1880	F	90994
1880	M	110490
1881	F	91953
1881	M	100737
1882	F	107847
1882	M	113686

	Name	Sex	Count	Year
0	Olivia	F	17641	2020
1	Emma	F	15656	2020
2	Ava	F	13160	2020
3	Charlotte	F	13065	2020
4	Sophia	F	13036	2020
...	...	...	...	...
31448	Zykell	M	5	2020
31449	Zylus	M	5	2020
31450	Zymari	M	5	2020
31451	Zyn	M	5	2020
31452	Zyran	M	5	2020

	Count		Name
Sex	F	M	F	M
Year
1880	7065	9655	Zula	Zeke
1881	6919	8769	Zula	Zeb
1882	8148	9557	Zula	Zed
1883	8012	8894	Zula	Zeno
1884	9217	9388	Zula	Zollie
1885	9128	8756	Zula	Zollie

Introduction to Pandas, Part 3¶

Some Additional Groupby Puzzles¶

groupby Puzzle #1¶

Quick Subpuzzle¶

Completing groupby puzzle #1¶

Other DataFrameGroupBy Features¶

groupby.size()¶

groupby.filter()¶

Puzzle 2: Finding the number of babies born in each year of each sex.¶

Dataset - US Babynames¶

With groupby¶

With Pivot Tables¶

Merging Tables¶

With `groupby`¶