import numpy as np
import pandas as pd
import plotly.express as px


# Small demo frame whose index labels (A/B/C) repeat, so rows can be grouped by index.
ds = pd.DataFrame(
    {
        "x": [3, 1, 4, 1, 5, 9, 2, 5, 6],
        "y": ["ak", "tx", "fl", "hi", "mi", "ak", "ca", "sd", "nc"],
    },
    index=list("ABCABCACB"),
)
ds


# Largest value of every column within each index label.
# The aggregation is named by string: handing the builtin `max` to agg() is
# deprecated in recent pandas and will eventually call the builtin directly.
ds.groupby(ds.index).agg("max")


# Demo frame with missing entries in both `num` and `state`, used to contrast
# groupby size() and count().
# `np.nan` is the canonical spelling; the `np.NaN` alias no longer exists in NumPy 2.
df = pd.DataFrame({'letter': ['A', 'A', 'B', 'C', 'C', 'C'],
                   'num': [1, 2, 3, 4, np.nan, 4],
                   'state': [np.nan, 'tx', 'fl', 'hi', np.nan, 'ak']})
df


# Number of rows in each `letter` group; rows holding missing values still count.
df.groupby("letter").size()

letter
A    2
B    1
C    3
dtype: int64


# Per-column number of non-missing values within each `letter` group.
df.groupby("letter").count()


# Tally how often each letter occurs, most frequent first.
df["letter"].value_counts()

C    3
A    2
B    1
Name: letter, dtype: int64


# Load the elections table from the local CSV file, then preview five
# randomly chosen rows (the preview differs from run to run).
elections = pd.read_csv("data/elections.csv")
elections.sample(5)


# Keep only the rows of those years in which the highest "%" stayed below 45.
elections.groupby("Year").filter(lambda year_rows: year_rows["%"].max() < 45).head(10)


# Column-wise maximum per party. Each column is maximised independently, so a
# result row mixes values coming from different original rows.
# The aggregation is named by string: handing the builtin `max` to agg() is
# deprecated in recent pandas and will eventually call the builtin directly.
elections.groupby("Party").agg("max").head(10)


# Order every row from the largest "%" down to the smallest.
elections_sorted_by_percent = elections.sort_values(by="%", ascending=False)
elections_sorted_by_percent.head(8)


# first() returns, per party, the first non-null value of each column in the
# sorted order established above.
elections_sorted_by_percent.groupby("Party").first()


# For each party, the row label at which its "%" value is largest.
elections.groupby("Party")["%"].idxmax()

Party
American                  22
American Independent     115
Anti-Masonic               6
Anti-Monopoly             38
Citizens                 127
Communist                 89
Constitution             164
Constitutional Union      24
Democratic               114
Democratic-Republican      0
Dixiecrat                103
Farmer–Labor              78
Free Soil                 15
Green                    155
Greenback                 35
Independent              143
Liberal Republican        31
Libertarian              175
National Democratic       50
National Republican        3
National Union            27
Natural Law              148
New Alliance             136
Northern Democratic       26
Populist                  48
Progressive               68
Prohibition               49
Reform                   150
Republican               120
Socialist                 66
Southern Democratic       25
States' Rights           110
Taxpayers                147
Union                     93
Union Labor               42
Whig                      11
Name: %, dtype: int64


# Computation: find, per party, the row label of its largest "%" value and
# select exactly those rows from the full table.
top_row_labels = elections.groupby("Party")["%"].idxmax()
best_per_party = elections.loc[top_row_labels]

# Presentation only: index by Party so the layout matches the formatting above.
best_per_party.set_index("Party").sort_index().head()


# Alternative: sort ascending by "%" and keep only the last row seen for each
# party, i.e. the one with the largest "%".
best_per_party2 = elections.sort_values(by="%").drop_duplicates(subset=["Party"], keep="last")
best_per_party2.set_index("Party").sort_index().head()  # presentation only


# Keep the GroupBy object itself (no aggregation yet) so it can be inspected below.
grouped_by_party = elections.groupby("Party")
type(grouped_by_party)

pandas.core.groupby.generic.DataFrameGroupBy


# Mapping from each party name to the row labels that belong to its group.
grouped_by_party.groups

{'American': [22, 126], 'American Independent': [115, 119, 124], 'Anti-Masonic': [6], 'Anti-Monopoly': [38], 'Citizens': [127], 'Communist': [89], 'Constitution': [160, 164, 172], 'Constitutional Union': [24], 'Democratic': [2, 4, 8, 10, 13, 14, 17, 20, 28, 29, 34, 37, 39, 45, 47, 52, 55, 57, 64, 70, 74, 77, 81, 83, 86, 91, 94, 97, 100, 105, 108, 111, 114, 116, 118, 123, 129, 134, 137, 140, 144, 151, 158, 162, 168, 176, 178], 'Democratic-Republican': [0, 1], 'Dixiecrat': [103], 'Farmer–Labor': [78], 'Free Soil': [15, 18], 'Green': [149, 155, 156, 165, 170, 177, 181], 'Greenback': [35], 'Independent': [121, 130, 143, 161, 167, 174], 'Liberal Republican': [31], 'Libertarian': [125, 128, 132, 138, 139, 146, 153, 159, 163, 169, 175, 180], 'National Democratic': [50], 'National Republican': [3, 5], 'National Union': [27], 'Natural Law': [148], 'New Alliance': [136], 'Northern Democratic': [26], 'Populist': [48, 61, 141], 'Progressive': [68, 82, 101, 107], 'Prohibition': [41, 44, 49, 51, 54, 59, 63, 67, 73, 75, 99], 'Reform': [150, 154], 'Republican': [21, 23, 30, 32, 33, 36, 40, 43, 46, 53, 56, 60, 65, 69, 72, 79, 80, 84, 87, 90, 96, 98, 104, 106, 109, 112, 113, 117, 120, 122, 131, 133, 135, 142, 145, 152, 157, 166, 171, 173, 179], 'Socialist': [58, 62, 66, 71, 76, 85, 88, 92, 95, 102], 'Southern Democratic': [25], 'States' Rights': [110], 'Taxpayers': [147], 'Union': [93], 'Union Labor': [42], 'Whig': [7, 9, 11, 12, 16, 19]}


# Pull out the rows of one single group by its key.
grouped_by_party.get_group("Socialist")


import urllib.request
import os.path
import shutil

# Download data from the web directly, caching the archive on disk so that
# later runs do not hit the network again.
data_url = "https://www.ssa.gov/oact/babynames/names.zip"
local_filename = "data/babynames.zip"
if not os.path.exists(local_filename): # if the data exists don't download again
    # The target directory may not exist yet; open(..., 'wb') would fail without it.
    target_dir = os.path.dirname(local_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Write to a temporary name first and rename only on success, so that an
    # interrupted download is never mistaken for a complete cached archive.
    tmp_filename = local_filename + ".part"
    with urllib.request.urlopen(data_url) as resp, open(tmp_filename, 'wb') as f:
        # Stream in chunks instead of holding the whole archive in memory.
        shutil.copyfileobj(resp, f)
    os.replace(tmp_filename, local_filename)

        
# Load data without unzipping the file
import zipfile


def extract_year_from_filename(fn):
    """Return the year encoded in an archive member name.

    Characters 3 through 6 of ``fn`` are converted to ``int``.
    NOTE(review): assumes member names look like ``yobYYYY.txt`` -- confirm
    against the archive contents.

    Raises:
        ValueError: if those four characters are not an integer.
    """
    return int(fn[3:7])


yearly_frames = []
with zipfile.ZipFile(local_filename, "r") as zf:
    # Only members with a ".txt" suffix are read; endswith() does not match
    # names that merely end in the letters "txt".
    data_files = [member for member in zf.infolist() if member.filename.endswith(".txt")]
    for member in data_files:
        with zf.open(member) as fp:
            # Uses its own name so the demo frame `df` built earlier is not overwritten.
            year_frame = pd.read_csv(fp, names=["Name", "Sex", "Count"])
        year_frame["Year"] = extract_year_from_filename(member.filename)
        yearly_frames.append(year_frame)

# One long table; each yearly frame keeps its own 0-based row labels.
babynames = pd.concat(yearly_frames)


babynames.head()  # show the first few rows


# Total "Count" per (Year, Sex) pair.
# "Count" is selected explicitly: newer pandas no longer drops non-numeric
# columns in sum(), so the "Name" strings would otherwise be concatenated
# within every group.
babynames.groupby(["Year", "Sex"])[["Count"]].sum().head(6)


# Same totals as a pivot table: one row per Year, one column per Sex.
# The aggregation is named by string; passing the NumPy callable (np.sum) is
# deprecated in recent pandas.
babynames.pivot_table(
    index="Year",
    columns="Sex",
    values="Count",
    aggfunc="sum",
).head(6)


# Pivot on two value columns at once: per Year and Sex, the largest "Count"
# and the maximum "Name" (string comparison, so the alphabetically last name).
# The aggregation is named by string; passing the NumPy callable (np.max) is
# deprecated in recent pandas.
babynames.pivot_table(
    index="Year",
    columns="Sex",
    values=["Count", "Name"],
    aggfunc="max",
).head(6)


elections.head(10)


# Restrict the baby-name table to the rows whose Year is 2020.
is_2020 = babynames["Year"] == 2020
babynames_2020 = babynames[is_2020]
babynames_2020.head(10)


# Everything before the first space of "Candidate" becomes the join key "First Name".
name_parts = elections["Candidate"].str.split(" ")
elections["First Name"] = name_parts.str[0]
elections


# Join each candidate row to the 2020 baby-name rows carrying the same first
# name; candidates without a matching name drop out of the result.
merged = pd.merge(
    elections,
    babynames_2020,
    left_on="First Name",
    right_on="Name",
)
merged


# Order the joined rows from the largest "Count" to the smallest.
merged.sort_values("Count", ascending = False)

	Year	Candidate	Party	Popular vote	Result	%
64	1908	William Jennings Bryan	Democratic	6408979	loss	43.414640
90	1936	Alf Landon	Republican	16679543	loss	36.648285
111	1960	John Kennedy	Democratic	34220984	win	50.082561
18	1852	John P. Hale	Free Soil	155210	loss	4.930283
42	1888	Alson Streeter	Union Labor	146602	loss	1.288861

	Year	Candidate	Party	Popular vote	Result	%
23	1860	Abraham Lincoln	Republican	1855993	win	39.699408
24	1860	John Bell	Constitutional Union	590901	loss	12.639283
25	1860	John C. Breckinridge	Southern Democratic	848019	loss	18.138998
26	1860	Stephen A. Douglas	Northern Democratic	1380202	loss	29.522311
66	1912	Eugene V. Debs	Socialist	901551	loss	6.004354
67	1912	Eugene W. Chafin	Prohibition	208156	loss	1.386325
68	1912	Theodore Roosevelt	Progressive	4122721	loss	27.457433
69	1912	William Taft	Republican	3486242	loss	23.218466
70	1912	Woodrow Wilson	Democratic	6296284	win	41.933422
115	1968	George Wallace	American Independent	9901118	loss	13.571218

	Year	Candidate	Popular vote	Result	%
Party
American	1976	Thomas J. Anderson	873053	loss	21.554001
American Independent	1976	Lester Maddox	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2016	Michael Peroutka	203091	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	2020	Woodrow Wilson	81268924	win	61.344703
Democratic-Republican	1824	John Quincy Adams	151271	win	57.210122

	Year	Candidate	Party	Popular vote	Result	%
114	1964	Lyndon Johnson	Democratic	43127041	win	61.344703
91	1936	Franklin Roosevelt	Democratic	27752648	win	60.978107
120	1972	Richard Nixon	Republican	47168710	win	60.907806
79	1920	Warren Harding	Republican	16144093	win	60.574501
133	1984	Ronald Reagan	Republican	54455472	win	59.023326
84	1928	Herbert Hoover	Republican	21427123	win	58.368524
86	1932	Franklin Roosevelt	Democratic	22821277	win	57.672125
109	1956	Dwight Eisenhower	Republican	35579180	win	57.650654

	Year	Candidate	Popular vote	Result	%
Party
American	1856	Millard Fillmore	873053	loss	21.554001
American Independent	1968	George Wallace	9901118	loss	13.571218
Anti-Masonic	1832	William Wirt	100715	loss	7.821583
Anti-Monopoly	1884	Benjamin Butler	134294	loss	1.335838
Citizens	1980	Barry Commoner	233052	loss	0.270182
Communist	1932	William Z. Foster	103307	loss	0.261069
Constitution	2008	Chuck Baldwin	199750	loss	0.152398
Constitutional Union	1860	John Bell	590901	loss	12.639283
Democratic	1964	Lyndon Johnson	43127041	win	61.344703
Democratic-Republican	1824	Andrew Jackson	151271	loss	57.210122
Dixiecrat	1948	Strom Thurmond	1175930	loss	2.412304
Farmer–Labor	1920	Parley P. Christensen	265398	loss	0.995804
Free Soil	1848	Martin Van Buren	291501	loss	10.138474
Green	2000	Ralph Nader	2882955	loss	2.741176
Greenback	1880	James B. Weaver	308649	loss	3.352344
Independent	1992	Ross Perot	19743821	loss	18.956298
Liberal Republican	1872	Horace Greeley	2834761	loss	44.071406
Libertarian	2016	Gary Johnson	4489235	loss	3.307714
National Democratic	1896	John M. Palmer	134645	loss	0.969566
National Republican	1828	John Quincy Adams	500897	loss	43.796073
National Union	1864	Abraham Lincoln	2211317	win	54.951512
Natural Law	1996	John Hagelin	113670	loss	0.118219
New Alliance	1988	Lenora Fulani	217221	loss	0.237804
Northern Democratic	1860	Stephen A. Douglas	1380202	loss	29.522311
Populist	1892	James B. Weaver	1041028	loss	8.645038
Progressive	1912	Theodore Roosevelt	4122721	loss	27.457433
Prohibition	1892	John Bidwell	270879	loss	2.249468
Reform	1996	Ross Perot	8085294	loss	8.408844
Republican	1972	Richard Nixon	47168710	win	60.907806
Socialist	1912	Eugene V. Debs	901551	loss	6.004354
Southern Democratic	1860	John C. Breckinridge	848019	loss	18.138998
States' Rights	1956	T. Coleman Andrews	107929	loss	0.174883
Taxpayers	1996	Howard Phillips	184656	loss	0.192045
Union	1936	William Lemke	892378	loss	1.960733
Union Labor	1888	Alson Streeter	146602	loss	1.288861
Whig	1840	William Henry Harrison	1275583	win	53.051213

Pandas, Part III¶

More on `GroupBy`¶

Quick review question¶

`groupby.size` and `groupby.count()`¶

Filtering by Group¶

`groupby` Puzzle¶

Attempt #1¶

Attempt #2¶

Alternative solutions¶

`DataFrameGroupBy` Objects¶

Pivot Tables¶

`groupby` with multiple columns¶

`pivot_table`¶

`pivot_table` with multiple values¶

Join Tables¶

	Year	Candidate	Party	Popular vote	Result	%
58	1904	Eugene V. Debs	Socialist	402810	loss	2.985897
62	1908	Eugene V. Debs	Socialist	420852	loss	2.850866
66	1912	Eugene V. Debs	Socialist	901551	loss	6.004354
71	1916	Allan L. Benson	Socialist	590524	loss	3.194193
76	1920	Eugene V. Debs	Socialist	913693	loss	3.428282
85	1928	Norman Thomas	Socialist	267478	loss	0.728623
88	1932	Norman Thomas	Socialist	884885	loss	2.236211
92	1936	Norman Thomas	Socialist	187910	loss	0.412876
95	1940	Norman Thomas	Socialist	116599	loss	0.234237
102	1948	Norman Thomas	Socialist	139569	loss	0.286312

	Name	Sex	Count	Year
0	Mary	F	7065	1880
1	Anna	F	2604	1880
2	Emma	F	2003	1880
3	Elizabeth	F	1939	1880
4	Minnie	F	1746	1880

		Count
Year	Sex
1880	F	90994
1880	M	110490
1881	F	91953
1881	M	100737
1882	F	107847
1882	M	113686

	Name	Sex	Count	Year
0	Olivia	F	17664	2020
1	Emma	F	15680	2020
2	Ava	F	13179	2020
3	Charlotte	F	13083	2020
4	Sophia	F	13070	2020
5	Amelia	F	12780	2020
6	Isabella	F	12182	2020
7	Mia	F	11243	2020
8	Evelyn	F	9503	2020
9	Harper	F	8834	2020

	letter	num	state
0	A	1.0	NaN
1	A	2.0	tx
2	B	3.0	fl
3	C	4.0	hi
4	C	NaN	NaN
5	C	4.0	ak

	num	state
letter
A	2	1
B	1	1
C	2	2

	Count		Name
Sex	F	M	F	M
Year
1880	7065	9655	Zula	Zeke
1881	6919	8769	Zula	Zeb
1882	8148	9557	Zula	Zed
1883	8012	8894	Zula	Zeno
1884	9217	9388	Zula	Zollie
1885	9128	8756	Zula	Zollie

Pandas, Part III¶

More on GroupBy¶

Quick review question¶

groupby.size and groupby.count()¶

Filtering by Group¶

groupby Puzzle¶

Attempt #1¶

Attempt #2¶

Alternative solutions¶

DataFrameGroupBy Objects¶

Pivot Tables¶

Groupby with multiple columns¶

pivot_table¶

pivot_table with multiple values¶

Join Tables¶

More on `GroupBy`¶

`groupby.size` and `groupby.count()`¶

`groupby` Puzzle¶

`DataFrameGroupBy` Objects¶

`groupby` with multiple columns¶

`pivot_table`¶

`pivot_table` with multiple values¶