import numpy as np
import pandas as pd
import plotly.express as px

# Loading the elections DataFrame
elections = pd.read_csv("data/elections.csv")

elections.head()

# Select the rows at positions 1, 2, and 3.
# Select the columns at positions 0, 1, and 2.
# Remember that Python indexing begins at position 0!
elections.iloc[[1, 2, 3], [0, 1, 2]]

# Index-based extraction using a list of rows and a slice of column indices
elections.iloc[[1, 2, 3], 0:3]

# Selecting all rows using a colon
elections.iloc[:, 0:3]

elections.iloc[[1, 2, 3], 1]

1    John Quincy Adams
2       Andrew Jackson
3    John Quincy Adams
Name: Candidate, dtype: object

# Extracting the value at row 0 and the second column
elections.iloc[0,1]

'Andrew Jackson'

elections[3:7]

elections[["Year", "Candidate", "Result"]]

elections["Candidate"]

0         Andrew Jackson
1      John Quincy Adams
2         Andrew Jackson
3      John Quincy Adams
4         Andrew Jackson
             ...        
177           Jill Stein
178         Joseph Biden
179         Donald Trump
180         Jo Jorgensen
181       Howard Hawkins
Name: Candidate, Length: 182, dtype: object

weird = pd.DataFrame({
    1:["topdog","botdog"], 
    "1":["topcat","botcat"]
})
weird

weird[1]

0    topdog
1    botdog
Name: 1, dtype: object

weird["1"]

0    topcat
1    botcat
Name: 1, dtype: object

weird[1:]

import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    # Write to a temporary name and rename only once the full response has been
    # written. Otherwise a failed write would leave a truncated file at
    # local_filename, and later runs would skip the download because the path exists.
    partial_filename = local_filename + ".part"
    with urllib.request.urlopen(data_url) as resp, open(partial_filename, 'wb') as f:
        f.write(resp.read())
    os.replace(partial_filename, local_filename)

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

# Ask yourself: Why is :9 the correct slice to select the first 10 rows?
babynames_first_10_rows = babynames.loc[:9, :]

babynames_first_10_rows

# Notice how we have exactly 10 elements in our boolean array argument.
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]

# Or using .loc to filter a DataFrame by a Boolean array argument
babynames_first_10_rows.loc[[True, False, True, False, True, False, True, False, True, False], :]

# First, use a logical condition to generate a boolean Series
logical_operator = (babynames["Sex"] == "F")
logical_operator

0          True
1          True
2          True
3          True
4          True
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Sex, Length: 407428, dtype: bool

# Then, use this boolean Series to filter the DataFrame
babynames[logical_operator]

# Notice that we did not have to specify columns to select 
# If no columns are referenced, pandas will automatically select all columns
babynames.loc[babynames["Sex"] == "F"]

babynames[(babynames["Sex"] == "F") & (babynames["Year"] < 2000)]

babynames[(babynames["Sex"] == "F") | (babynames["Year"] < 2000)]

babynames.iloc[[0, 233, 484], [3, 4]]

babynames.loc[[0, 233, 484]]

babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].head(3)

babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].iloc[0:2, :]

# Note: The parentheses surrounding the code make it possible to break the code into multiple lines for readability

(
    babynames[(babynames["Name"] == "Bella") | 
              (babynames["Name"] == "Alex") |
              (babynames["Name"] == "Narges") |
              (babynames["Name"] == "Lisa")]
)

# A more concise method to achieve the above: .isin
names = ["Bella", "Alex", "Narges", "Lisa"]
display(babynames["Name"].isin(names))
display(babynames[babynames["Name"].isin(names)])

0         False
1         False
2         False
3         False
4         False
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Name, Length: 407428, dtype: bool

# What if we only want names that start with "N"?
display(babynames["Name"].str.startswith("N"))
display(babynames[babynames["Name"].str.startswith("N")])

0         False
1         False
2         False
3         False
4         False
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Name, Length: 407428, dtype: bool

# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths

babynames

# Modify the "name_lengths" column to be one less than its original value
babynames["name_lengths"] = babynames["name_lengths"] - 1
babynames

# Rename "name_lengths" to "Length"
babynames = babynames.rename(columns={"name_lengths":"Length"})
babynames

# Remove our new "Length" column
babynames = babynames.drop("Length", axis="columns")
babynames

yash_counts = babynames[babynames["Name"] == "Yash"]["Count"]
yash_counts

331824     8
334114     9
336390    11
338773    12
341387    10
343571    14
345767    24
348230    29
350889    24
353445    29
356221    25
358978    27
361831    29
364905    24
367867    23
370945    18
374055    14
376756    18
379660    18
383338     9
385903    12
388529    17
391485    16
394906    10
397874     9
400171    15
403092    13
406006    13
Name: Count, dtype: int64

# Average number of babies named Yash each year

np.mean(yash_counts)

17.142857142857142

# Max number of babies named Yash born in any single year

max(yash_counts)

29

# Returns the shape of the object in the format (num_rows, num_columns)
babynames.shape

(407428, 5)

# Returns the total number of entries in the object, equal to num_rows * num_columns
babynames.size

2037140

# What summary statistics can we describe?
babynames.describe()

# Our statistics are slightly different when working with a Series
babynames["Sex"].describe()

count     407428
unique         2
top            F
freq      239537
Name: Sex, dtype: object

# Randomly sample row(s) from the DataFrame
babynames.sample()

# Rerun this cell a few times – you'll get different results!
babynames.sample(5).iloc[:, 2:]

# Sampling with replacement
babynames[babynames["Year"] == 2000].sample(4, replace = True).iloc[:,2:]

# Count the number of times each unique value occurs in a Series
babynames["Name"].value_counts()

Name
Jean         223
Francis      221
Guadalupe    218
Jessie       217
Marion       214
            ... 
Renesme        1
Purity         1
Olanna         1
Nohea          1
Zayvier        1
Name: count, Length: 20437, dtype: int64

# Return an array of all unique values in the Series
babynames["Name"].unique()

array(['Mary', 'Helen', 'Dorothy', ..., 'Zae', 'Zai', 'Zayvier'],
      dtype=object)

# Sort a Series
babynames["Name"].sort_values()

366001      Aadan
384005      Aadan
369120      Aadan
398211    Aadarsh
370306      Aaden
           ...   
220691      Zyrah
197529      Zyrah
217429      Zyrah
232167      Zyrah
404544      Zyrus
Name: Name, Length: 407428, dtype: object

# Sort a DataFrame – there are lots of Michaels in California
babynames.sort_values(by = "Count", ascending = False)

# Create a Series of the length of each name
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name
babynames["name_lengths"] = babyname_lengths
babynames.head(5)

# Sort by the temporary column
babynames = babynames.sort_values(by = "name_lengths", ascending=False)
babynames.head(5)

# Drop the 'name_lengths' column
babynames = babynames.drop("name_lengths", axis = 'columns')
babynames.head(5)

babynames.sort_values("Name", key = lambda x:x.str.len(), ascending = False).head()

# First, define a function to count the number of times "dr" or "ea" appear in each name
def dr_ea_count(string):
    """Return the number of 'dr' occurrences plus the number of 'ea' occurrences in `string`."""
    return sum(string.count(pair) for pair in ('dr', 'ea'))

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column so we can see our handiwork
babynames = babynames.sort_values(by = "dr_ea_count", ascending=False)
babynames.head()

# Drop the `dr_ea_count` column
babynames = babynames.drop("dr_ea_count", axis = 'columns')
babynames.head(5)

# The code below uses the full babynames dataset, which is why some numbers are different relative to the diagram
# Aggregations are named by string ("sum", "min", "max") instead of passing the
# Python builtins, which pandas 2.1+ flags with a FutureWarning.
babynames[["Year", "Count"]].groupby("Year").agg("sum")

# What is the earliest year in which each name appeared?
babynames.groupby("Name")[["Year"]].agg("min")

# What is the largest single-year count of each name?
babynames.groupby("Name")[["Count"]].agg("max")

f_babynames = babynames[babynames["Sex"] == "F"]
f_babynames

# We sort the data by year
f_babynames = f_babynames.sort_values("Year")
f_babynames

# We'll talk about how to generate plots in a later lecture
fig = px.line(f_babynames[f_babynames["Name"] == "Jennifer"],
              x = "Year", y = "Count")
fig.update_layout(font_size = 18, 
                  autosize=False, 
                  width=1000, 
                  height=400)

# In the year with the highest Jennifer count, 6065 Jennifers were born
max_jenn = np.max(f_babynames[f_babynames["Name"] == "Jennifer"]["Count"])
max_jenn

6065

# Remember that we sorted f_babynames by year. 
# This means that grabbing the final entry gives us the most recent count of Jennifers: 114
# In 2022, the most recent year for which we have data, 114 Jennifers were born
curr_jenn = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"].iloc[-1]
curr_jenn

114

# Compute the RTP
curr_jenn / max_jenn

0.018796372629843364

def ratio_to_peak(series):
    """
    Return the ratio of the last count in `series` to its largest count.

    `series` holds the per-year counts for a single name. The final entry is
    taken as the current value, so the data should already be ordered by year.
    """
    latest = series.iloc[-1]
    peak = np.max(series)
    return latest / peak

# Construct a Series containing our Jennifer count data
jenn_counts_ser = f_babynames[f_babynames["Name"] == "Jennifer"]["Count"]

# Then, find the RTP
ratio_to_peak(jenn_counts_ser)

0.018796372629843364

rtp_table = f_babynames.groupby("Name")[["Year", "Count"]].agg(ratio_to_peak)
rtp_table

# Results in a TypeError
# rtp_table = f_babynames.groupby("Name").agg(ratio_to_peak)
# rtp_table

# Recompute the RTPs, but only performing the calculation on the "Count" column
rtp_table = f_babynames.groupby("Name")[["Count"]].agg(ratio_to_peak)
rtp_table

# Rename "Count" to "Count RTP" for clarity
rtp_table = rtp_table.rename(columns = {"Count": "Count RTP"})
rtp_table

# What name has fallen the most in popularity?
rtp_table.sort_values("Count RTP")

def plot_name(*names):
    """
    Build a line chart of Count against Year for the given names.

    Rows are taken from `f_babynames` where "Name" is one of `names`; lines are
    colored by name. The figure is returned.
    """
    selected_rows = f_babynames[f_babynames["Name"].isin(names)]
    chart = px.line(
        selected_rows,
        x="Year",
        y="Count",
        color="Name",
        title=f"Popularity for: {names}",
    )
    chart.update_layout(font_size=18, autosize=False, width=1000, height=400)
    return chart

plot_name("Debra")

# Find the 10 names that have decreased the most in popularity
top10 = rtp_table.sort_values("Count RTP").head(10).index
top10

Index(['Debra', 'Debbie', 'Carol', 'Tammy', 'Susan', 'Cheryl', 'Shannon',
       'Tina', 'Michele', 'Terri'],
      dtype='object', name='Name')

plot_name(*top10)

	Year	Candidate	Party	Popular vote	Result	%
0	1824	Andrew Jackson	Democratic-Republican	151271	loss	57.210122
1	1824	John Quincy Adams	Democratic-Republican	113142	win	42.789878
2	1828	Andrew Jackson	Democratic	642806	win	56.203927
3	1828	John Quincy Adams	National Republican	500897	loss	43.796073
4	1832	Andrew Jackson	Democratic	702735	win	54.574789

	Year	Candidate	Party	Popular vote	Result	%
3	1828	John Quincy Adams	National Republican	500897	loss	43.796073
4	1832	Andrew Jackson	Democratic	702735	win	54.574789
5	1832	Henry Clay	National Republican	484205	loss	37.603628
6	1832	William Wirt	Anti-Masonic	100715	loss	7.821583

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134
5	CA	F	1910	Ruth	128
6	CA	F	1910	Evelyn	126
7	CA	F	1910	Alice	118
8	CA	F	1910	Virginia	101
9	CA	F	1910	Elizabeth	93

Symbol	Usage	Meaning
~	~p	Returns negation of p
\|	p \| q	p OR q
&	p & q	p AND q
^	p ^ q	p XOR q (exclusive or)

	Year	Count
count	407428.000000	407428.000000
mean	1985.733609	79.543456
std	27.007660	293.698654
min	1910.000000	5.000000
25%	1969.000000	7.000000
50%	1992.000000	13.000000
75%	2008.000000	38.000000
max	2022.000000	8260.000000

Lecture 3 – Data 100, Fall 2023¶

Data Extraction in `Pandas`¶

Integer-Based Extraction Using `iloc`¶

Context-dependent Extraction using `[]`¶

Slido Exercise¶

Dataset: California baby names¶

Conditional Selection¶

Bitwise Operators¶

Slido Exercises¶

`.isin` for Selection based on a list, array, or `Series`¶

`.str` Functions for Defining a Condition¶

Adding, Removing, and Modifying Columns¶

Add a Column¶

Modify a Column¶

Rename a Column Name¶

Delete a Column¶

Useful Utility Functions¶

`NumPy`¶

Built-In `pandas` Methods¶

Custom sorting¶

Approach 1: Create a temporary column¶

Approach 2: Sorting using the `key` argument¶

Approach 3: Sorting Using the `map` Function¶

Grouping¶

Case Study: Name "Popularity"¶

	State	Sex	Year	Name	Count
6289	CA	F	1923	Bella	5
7512	CA	F	1925	Bella	8
12368	CA	F	1932	Lisa	5
14741	CA	F	1936	Lisa	8
17084	CA	F	1939	Lisa	5
...	...	...	...	...	...
393248	CA	M	2018	Alex	495
396111	CA	M	2019	Alex	438
398983	CA	M	2020	Alex	379
401788	CA	M	2021	Alex	333
404663	CA	M	2022	Alex	344

	State	Sex	Year	Name	Count
76	CA	F	1910	Norma	23
83	CA	F	1910	Nellie	20
127	CA	F	1910	Nina	11
198	CA	F	1910	Nora	6
310	CA	F	1911	Nellie	23
...	...	...	...	...	...
407319	CA	M	2022	Nilan	5
407320	CA	M	2022	Niles	5
407321	CA	M	2022	Nolen	5
407322	CA	M	2022	Noriel	5
407323	CA	M	2022	Norris	5

	Year	Name	Count
92989	1983	Dayana	8
323162	1992	Kareem	27
58873	1969	Rebecca	1025
83706	1980	Marta	53
61539	1970	Rebeca	32

	Year	Name	Count
150031	2000	Savana	31
344662	2000	Caeden	5
344849	2000	Marquel	5
344046	2000	Eusebio	8

	State	Sex	Year	Name	Count
268041	CA	M	1957	Michael	8260
267017	CA	M	1956	Michael	8258
317387	CA	M	1990	Michael	8246
281850	CA	M	1969	Michael	8245
283146	CA	M	1970	Michael	8196
...	...	...	...	...	...
317292	CA	M	1989	Olegario	5
317291	CA	M	1989	Norbert	5
317290	CA	M	1989	Niles	5
317289	CA	M	1989	Nikola	5
407427	CA	M	2022	Zylo	5

	State	Sex	Year	Name	Count	name_lengths
334166	CA	M	1996	Franciscojavier	8	15
337301	CA	M	1997	Franciscojavier	5	15
339472	CA	M	1998	Franciscojavier	6	15
321792	CA	M	1991	Ryanchristopher	7	15
327358	CA	M	1993	Johnchristopher	5	15

	State	Sex	Year	Name	Count	dr_ea_count
115957	CA	F	1990	Deandrea	5	3
101976	CA	F	1986	Deandrea	6	3
131029	CA	F	1994	Leandrea	5	3
108731	CA	F	1988	Deandrea	5	3
308131	CA	M	1985	Deandrea	6	3

	Count
Year
1910	9163
1911	9983
1912	17946
1913	22094
1914	26926
...	...
2018	395436
2019	386996
2020	362882
2021	362582
2022	360023

	Year
Name
Aadan	2008
Aadarsh	2019
Aaden	2007
Aadhav	2014
Aadhini	2022
...	...
Zymir	2020
Zyon	1999
Zyra	2012
Zyrah	2011
Zyrus	2021

	State	Sex	Year	Name	Count
23	CA	F	1910	Bernice	59
219	CA	F	1910	Katharine	5
170	CA	F	1910	Clarice	7
26	CA	F	1910	Doris	56
30	CA	F	1910	Ethel	52
...	...	...	...	...	...
236551	CA	F	2022	Dani	47
235914	CA	F	2022	Iris	361
235918	CA	F	2022	Lucy	345
235973	CA	F	2022	Sage	226
239452	CA	F	2022	Rylin	5

	Year	Count
Name
Aadhini	1.0	1.000000
Aadhira	1.0	0.500000
Aadhya	1.0	0.660000
Aadya	1.0	0.586207
Aahana	1.0	0.269231
...	...	...
Zyanya	1.0	0.466667
Zyla	1.0	1.000000
Zylah	1.0	1.000000
Zyra	1.0	1.000000
Zyrah	1.0	0.833333

	Count RTP
Name
Debra	0.001260
Debbie	0.002815
Carol	0.003180
Tammy	0.003249
Susan	0.003305
...	...
Fidelia	1.000000
Naveyah	1.000000
Finlee	1.000000
Roseline	1.000000
Aadhini	1.000000

Lecture 3 – Data 100, Fall 2023¶

Data Extraction in Pandas¶

Integer-Based Extraction Using iloc¶

Context-dependent Extraction using []¶

Slido Exercise¶

Dataset: California baby names¶

Conditional Selection¶

Bitwise Operators¶

Slido Exercises¶

.isin for Selection based on a list, array, or Series¶

.str Functions for Defining a Condition¶

Adding, Removing, and Modifying Columns¶

Add a Column¶

Modify a Column¶

Rename a Column Name¶

Delete a Column¶

Useful Utility Functions¶

NumPy¶

Built-In pandas Methods¶

Custom sorting¶

Approach 1: Create a temporary column¶

Approach 2: Sorting using the key argument¶

Approach 3: Sorting Using the map Function¶

Grouping¶

Case Study: Name "Popularity"¶

Data Extraction in `Pandas`¶

Integer-Based Extraction Using `iloc`¶

Context-dependent Extraction using `[]`¶

`.isin` for Selection based on a list, array, or `Series`¶

`.str` Functions for Defining a Condition¶

`NumPy`¶

Built-In `pandas` Methods¶

Approach 2: Sorting using the `key` argument¶

Approach 3: Sorting Using the `map` Function¶