import numpy as np
import pandas as pd

import urllib.request
import os.path
import zipfile

data_url = "https://www.ssa.gov/oact/babynames/state/namesbystate.zip"
local_filename = "data/babynamesbystate.zip"
if not os.path.exists(local_filename): # If the data exists don't download again
    with urllib.request.urlopen(data_url) as resp, open(local_filename, 'wb') as f:
        f.write(resp.read())

zf = zipfile.ZipFile(local_filename, 'r')

ca_name = 'STATE.CA.TXT'
field_names = ['State', 'Sex', 'Year', 'Name', 'Count']
with zf.open(ca_name) as fh:
    babynames = pd.read_csv(fh, header=None, names=field_names)

babynames.head()

# Ask yourself: Why is :9 is the correct slice to select the first 10 rows?
# Is this the correct slice if we used context-based extraction?
# What about .iloc? Try it!
babynames_first_10_rows = babynames.loc[:9, :]

babynames_first_10_rows

# Notice how we have exactly 10 elements in our boolean array argument.
babynames_first_10_rows[[True, False, True, False, True, False, True, False, True, False]]

# Or using .loc to filter a DataFrame by a Boolean array argument.
babynames_first_10_rows.loc[[True, False, True, False, True, False, True, False, True, False], :]

# First, use a logical condition to generate a boolean Series
logical_operator = (babynames["Sex"]=="F")
logical_operator

0          True
1          True
2          True
3          True
4          True
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Sex, Length: 407428, dtype: bool

# Then, use this boolean Series to filter the DataFrame
babynames[logical_operator]

# Notice that we did not have to specify columns to select 
# If no columns are referenced, pandas will automatically select all columns
babynames.loc[babynames["Sex"]=="F"]

babynames[(babynames["Sex"]=="F") & (babynames["Year"]<2000)]

babynames[(babynames["Sex"]=="F") | (babynames["Year"]<2000)]

babynames.iloc[[0, 233, 484], [3, 4]]

babynames.loc[[0, 233, 484]]

babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].head(3)

babynames.loc[babynames["Count"] > 250, ["Name", "Count"]].iloc[0:2, :]

# Note: The parentheses surrounding the code make it possible to 
# break the code into multiple lines for readability

(
    babynames[(babynames["Name"]=="Bella") | 
              (babynames["Name"]=="Alex") |
              (babynames["Name"]=="Narges") |
              (babynames["Name"]=="Lisa")]
)

# A more concise method to achieve the above: .isin
names = ["Bella", "Alex", "Narges", "Lisa"]
display(babynames["Name"].isin(names))
display(babynames[babynames["Name"].isin(names)])

0         False
1         False
2         False
3         False
4         False
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Name, Length: 407428, dtype: bool

# What if we only want names that start with "N"?
display(babynames["Name"].str.startswith("N"))

display(babynames[babynames["Name"].str.startswith("N")])

0         False
1         False
2         False
3         False
4         False
          ...  
407423    False
407424    False
407425    False
407426    False
407427    False
Name: Name, Length: 407428, dtype: bool

# Create a Series of the length of each name.
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name.
babynames["name_lengths"] = babyname_lengths

babynames

# Modify the "name_lengths" column to be one less than its original value.
babynames["name_lengths"] = babynames["name_lengths"] - 1
babynames

# Rename "name_lengths" to "Length".
babynames = babynames.rename(columns={"name_lengths":"Length"})
babynames

# Remove our new "Length" column.
babynames = babynames.drop("Length", axis="columns")
babynames

df = pd.DataFrame({
    "a": ['c', 'b', 'a'],
    "b": ['b', 'c', 'a'],
    "c": ['c', 'a', 'b']
})
df.index = ['c','b','a']
df

df.drop('b')['a'][1]

/tmp/ipykernel_157/1525527017.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  df.drop('b')['a'][1]

'a'

df[df['a'].isin(['b','c'])].iloc[0,1]

'b'

df.rename(columns={'b':'a', 'a':'b'}).loc['a','b']

'a'

yash_counts = babynames[babynames["Name"]=="Yash"]["Count"]
yash_counts

331824     8
334114     9
336390    11
338773    12
341387    10
343571    14
345767    24
348230    29
350889    24
353445    29
356221    25
358978    27
361831    29
364905    24
367867    23
370945    18
374055    14
376756    18
379660    18
383338     9
385903    12
388529    17
391485    16
394906    10
397874     9
400171    15
403092    13
406006    13
Name: Count, dtype: int64

# Average number of babies named Yash each year
np.mean(yash_counts)

17.142857142857142

# Max number of babies named Yash born in any single year
max(yash_counts)

29

# Returns the shape of the object in the format (num_rows, num_columns)
babynames.shape

(407428, 5)

# Returns the total number of entries in the object, equal to num_rows * num_columns
babynames.size

2037140

# What summary statistics can we describe?
babynames.describe()

# Our statistics are slightly different when working with a Series.
babynames["Sex"].describe()

count     407428
unique         2
top            F
freq      239537
Name: Sex, dtype: object

# Randomly sample row(s) from the DataFrame.
babynames.sample()

# Rerun this cell a few times – you'll get different results!
babynames.sample(5).iloc[:, 2:]

# Sampling with replacement.
babynames[babynames["Year"]==2000].sample(4, replace=True).iloc[:,2:]

# Count the number of times each unique value occurs in a Series.
babynames["Name"].value_counts()

Name
Jean         223
Francis      221
Guadalupe    218
Jessie       217
Marion       214
            ... 
Ozair          1
Rahm           1
Ruhan          1
Sufiyan        1
Theon          1
Name: count, Length: 20437, dtype: int64

# Return an array of all unique values in the Series.
babynames["Name"].unique()

array(['Mary', 'Helen', 'Dorothy', ..., 'Zae', 'Zai', 'Zayvier'],
      dtype=object)

# Sort a Series. Notice that the index changes order, too!
babynames["Name"].sort_values()

366001      Aadan
384005      Aadan
369120      Aadan
398211    Aadarsh
370306      Aaden
           ...   
220691      Zyrah
197529      Zyrah
217429      Zyrah
232167      Zyrah
404544      Zyrus
Name: Name, Length: 407428, dtype: object

# Sort a DataFrame – there are lots of Michaels in California.
babynames.sort_values(by="Count", ascending=False)

# Sample DataFrame for experimentation!
df = pd.DataFrame({
    "A": [2, 3, 4],
    "B": [3, 5, 5],
})
df

df[df['A'] >= df['B'].min()]

df[df['A'] >= df['B'].sort_values(ascending=False)[0]]

df[df['A'] >= df['B'].value_counts().tail(n=1)]

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[43], line 1
----> 1 df[df['A'] >= df['B'].value_counts().tail(n=1)]

File /srv/conda/envs/notebook/lib/python3.11/site-packages/pandas/core/ops/common.py:76, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
     72             return NotImplemented
     74 other = item_from_zerodim(other)
---> 76 return method(self, other)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/pandas/core/arraylike.py:60, in OpsMixin.__ge__(self, other)
     58 @unpack_zerodim_and_defer("__ge__")
     59 def __ge__(self, other):
---> 60     return self._cmp_method(other, operator.ge)

File /srv/conda/envs/notebook/lib/python3.11/site-packages/pandas/core/series.py:6125, in Series._cmp_method(self, other, op)
   6122 res_name = ops.get_op_result_name(self, other)
   6124 if isinstance(other, Series) and not self._indexed_same(other):
-> 6125     raise ValueError("Can only compare identically-labeled Series objects")
   6127 lvalues = self._values
   6128 rvalues = extract_array(other, extract_numpy=True, extract_range=True)

ValueError: Can only compare identically-labeled Series objects

# Create a Series of the length of each name.
babyname_lengths = babynames["Name"].str.len()

# Add a column named "name_lengths" that includes the length of each name.
babynames["name_lengths"] = babyname_lengths
babynames.head(5)

# Sort by the temporary column.
babynames = babynames.sort_values(by="name_lengths", ascending=False)
babynames.head(5)

# Drop the 'name_length' column.
babynames = babynames.drop("name_lengths", axis="columns")
babynames.head(5)

babynames.sort_values("Name", key=lambda x:x.str.len(), ascending=False).head()

# First, define a function to count 
# the number of times "dr" or "ea" appear in each name.
def dr_ea_count(string):
    return string.count('dr') + string.count('ea')

# Then, use `map` to apply `dr_ea_count` to each name in the "Name" column.
babynames["dr_ea_count"] = babynames["Name"].map(dr_ea_count)

# Sort the DataFrame by the new "dr_ea_count" column 
# so we can see our handiwork.
babynames = babynames.sort_values(by="dr_ea_count", ascending=False)
babynames.head()

# Drop the `dr_ea_count` column.
babynames = babynames.drop("dr_ea_count", axis="columns")
babynames.head(5)

	State	Sex	Year	Name	Count
0	CA	F	1910	Mary	295
1	CA	F	1910	Helen	239
2	CA	F	1910	Dorothy	220
3	CA	F	1910	Margaret	163
4	CA	F	1910	Frances	134
5	CA	F	1910	Ruth	128
6	CA	F	1910	Evelyn	126
7	CA	F	1910	Alice	118
8	CA	F	1910	Virginia	101
9	CA	F	1910	Elizabeth	93

Symbol	Usage	Meaning
~	~p	Returns negation of p
\|	p \| q	p OR q
&	p & q	p AND q
^	p ^ q	p XOR q (exclusive or)

	Year	Count
count	407428.000000	407428.000000
mean	1985.733609	79.543456
std	27.007660	293.698654
min	1910.000000	5.000000
25%	1969.000000	7.000000
50%	1992.000000	13.000000
75%	2008.000000	38.000000
max	2022.000000	8260.000000

	Year	Name	Count
372904	2011	Kayden	293
176211	2006	Arianah	5
226580	2019	Blessing	14
290319	1975	Guy	48
214723	2016	India	26

	Year	Name	Count
152755	2000	Stephane	5
342545	2000	Charles	565
151059	2000	Tahlia	11
344003	2000	Brallan	8

Lecture 3 – Data 100, Summer 2025¶

Dataset: California baby names¶

Conditional Selection¶

Slido Exercise¶

Adding, Removing, and Modifying Columns¶

Slido Exercise¶

Useful Utility Functions¶

`NumPy`¶

Built-In `pandas` Methods¶

Slido Exercise¶

Custom sorting¶

Approach 1: Create a temporary column¶

Approach 2: Sorting using the `key` argument¶

Approach 3: Sorting Using the `map` Function¶

	State	Sex	Year	Name	Count
6289	CA	F	1923	Bella	5
7512	CA	F	1925	Bella	8
12368	CA	F	1932	Lisa	5
14741	CA	F	1936	Lisa	8
17084	CA	F	1939	Lisa	5
...	...	...	...	...	...
393248	CA	M	2018	Alex	495
396111	CA	M	2019	Alex	438
398983	CA	M	2020	Alex	379
401788	CA	M	2021	Alex	333
404663	CA	M	2022	Alex	344

	State	Sex	Year	Name	Count
76	CA	F	1910	Norma	23
83	CA	F	1910	Nellie	20
127	CA	F	1910	Nina	11
198	CA	F	1910	Nora	6
310	CA	F	1911	Nellie	23
...	...	...	...	...	...
407319	CA	M	2022	Nilan	5
407320	CA	M	2022	Niles	5
407321	CA	M	2022	Nolen	5
407322	CA	M	2022	Noriel	5
407323	CA	M	2022	Norris	5

	State	Sex	Year	Name	Count
268041	CA	M	1957	Michael	8260
267017	CA	M	1956	Michael	8258
317387	CA	M	1990	Michael	8246
281850	CA	M	1969	Michael	8245
283146	CA	M	1970	Michael	8196
...	...	...	...	...	...
71283	CA	F	1974	Keli	5
71282	CA	F	1974	Kaye	5
71281	CA	F	1974	Katherin	5
71280	CA	F	1974	Kasie	5
71288	CA	F	1974	Kourtney	5

	State	Sex	Year	Name	Count	name_lengths
313977	CA	M	1988	Franciscojavier	10	15
337301	CA	M	1997	Franciscojavier	5	15
316886	CA	M	1989	Franciscojavier	6	15
339472	CA	M	1998	Franciscojavier	6	15
102505	CA	F	1986	Mariadelosangel	5	15

	State	Sex	Year	Name	Count
321913	CA	M	1991	Franciscojavier	6
327472	CA	M	1993	Ryanchristopher	5
313977	CA	M	1988	Franciscojavier	10
337301	CA	M	1997	Franciscojavier	5
316886	CA	M	1989	Franciscojavier	6

	State	Sex	Year	Name	Count	dr_ea_count
131029	CA	F	1994	Leandrea	5	3
115957	CA	F	1990	Deandrea	5	3
101976	CA	F	1986	Deandrea	6	3
308131	CA	M	1985	Deandrea	6	3
108731	CA	F	1988	Deandrea	5	3

Lecture 3 – Data 100, Summer 2025¶

Dataset: California baby names¶

Conditional Selection¶

Slido Exercise¶

Adding, Removing, and Modifying Columns¶

Slido Exercise¶

Useful Utility Functions¶

NumPy¶

Built-In pandas Methods¶

Slido Exercise¶

Custom sorting¶

Approach 1: Create a temporary column¶

Approach 2: Sorting using the key argument¶

Approach 3: Sorting Using the map Function¶

`NumPy`¶

Built-In `pandas` Methods¶

Approach 2: Sorting using the `key` argument¶

Approach 3: Sorting Using the `map` Function¶