🤠 Text Wrangling and Regex¶
Working with text: applying string methods and regular expressions
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile
🥫 Demo 1: Canonicalizing County Names¶
states = pd.read_csv("data/county_and_state.csv")
populations = pd.read_csv("data/county_and_population.csv")
# display() allows us to view a DataFrame without returning it as an object
display(states)
display(populations)
County | State | |
---|---|---|
0 | De Witt County | IL |
1 | Lac qui Parle County | MN |
2 | Lewis and Clark County | MT |
3 | St John the Baptist Parish | LS |
County | Population | |
---|---|---|
0 | DeWitt | 16798 |
1 | Lac Qui Parle | 8067 |
2 | Lewis & Clark | 55716 |
3 | St. John the Baptist | 43044 |
Both of these DataFrames share a "County" column. Unfortunately, formatting differences mean that we can't directly merge the two DataFrames on "County".
states.merge(populations, left_on="County", right_on="County")
County | State | Population |
---|---|---|
🐼 Using Pandas String Functions¶
To address this, we can canonicalize the "County" string data to apply a common formatting.
# Function to transform a series of county names into a standard form
def canonicalize_county(county_series):
canonicalized_series = (
county_series
# make lowercase
.str.lower()
# remove spaces
.str.replace(' ', '')
# replace & with and
.str.replace('&', 'and')
# remove dots
.str.replace('.', '')
# remove "county"
.str.replace('county', '')
# remove "parish"
.str.replace('parish', '')
)
return (canonicalized_series)
display(canonicalize_county(states["County"]))
display(canonicalize_county(populations["County"]))
0              dewitt
1         lacquiparle
2       lewisandclark
3    stjohnthebaptist
Name: County, dtype: object
0              dewitt
1         lacquiparle
2       lewisandclark
3    stjohnthebaptist
Name: County, dtype: object
states["Canonical County"] = canonicalize_county(states["County"])
populations["Canonical County"] = canonicalize_county(populations["County"])
display(states)
display(populations)
County | State | Canonical County | |
---|---|---|---|
0 | De Witt County | IL | dewitt |
1 | Lac qui Parle County | MN | lacquiparle |
2 | Lewis and Clark County | MT | lewisandclark |
3 | St John the Baptist Parish | LS | stjohnthebaptist |
County | Population | Canonical County | |
---|---|---|---|
0 | DeWitt | 16798 | dewitt |
1 | Lac Qui Parle | 8067 | lacquiparle |
2 | Lewis & Clark | 55716 | lewisandclark |
3 | St. John the Baptist | 43044 | stjohnthebaptist |
Now, the merge works as expected!
states.merge(populations, on="Canonical County")
County_x | State | Canonical County | County_y | Population | |
---|---|---|---|---|---|
0 | De Witt County | IL | dewitt | DeWitt | 16798 |
1 | Lac qui Parle County | MN | lacquiparle | Lac Qui Parle | 8067 |
2 | Lewis and Clark County | MT | lewisandclark | Lewis & Clark | 55716 |
3 | St John the Baptist Parish | LS | stjohnthebaptist | St. John the Baptist | 43044 |
Instructor note: Return to Lecture!
🪵 Demo 2: Extracting Data from Log Files¶
# Sample log file
log_fname = 'data/log.txt'
with open(log_fname, 'r') as f:
# readlines() returns a list of strings,
# with each element representing a line in the file
log_lines = f.readlines()
log_lines
['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n', '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n', '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']
Suppose we want to extract the day, month, year, hour, minutes, seconds, and timezone.
Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string.
In other words, slicing by some fixed offset isn't going to work.
# 20:31 were determined by trial-and-error!
log_lines[0][20:31]
'26/Jan/2014'
What happens if we use the same range for the next log line?
log_lines[1][20:31]
'/Feb/2005:1'
Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.
first = log_lines[0]
first
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
Find the data inside the square brackets by splitting the string at `[` and `]`.
# find the text enclosed in square brackets
pertinent = (
# remove everything before the first [
first.split("[")[1]
# Remove everything after the second square ]
.split(']')[0]
)
pertinent
'26/Jan/2014:10:47:58 -0800'
# grab the day, month, and the rest of the pertinent string (`rest`)
day, month, rest = pertinent.split('/')
print("Day: ", day)
print("Month: ", month)
print("Rest: ", rest)
Day:  26
Month:  Jan
Rest:  2014:10:47:58 -0800
# from `rest`, grab the year, hour, minute, and remaining characters
year, hour, minute, rest2 = rest.split(':')
print("Year: ", year)
print("Hour: ", hour)
print("Minute: ", minute)
print("Rest: ", rest2)
Year:  2014
Hour:  10
Minute:  47
Rest:  58 -0800
# from `rest2`, grab the seconds and time zone
seconds, time_zone = rest2.split(' ')
print("Seconds: ", seconds)
print("Time Zone: ", time_zone)
Seconds:  58
Time Zone:  -0800
# Print all the components we've extracted
day, month, year, hour, minute, seconds, time_zone
('26', 'Jan', '2014', '10', '47', '58', '-0800')
Repeating the process above, but simultaneously for all lines of the log file:
logs = pd.read_csv("data/log.txt",
sep="\t",
header=None)[0]
print("Original input:")
display(logs)
Original input:
0    169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1    193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "...
2    169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800...
Name: 0, dtype: object
# Previous code:
# first = log_lines[0]
# pertinent = first.split("[")[1].split(']')[0]
s1 = (
logs.str.split("[")
.str[1]
.str.split("]")
.str[0]
)
display(s1)
0    26/Jan/2014:10:47:58 -0800
1      2/Feb/2005:17:23:6 -0800
2     3/Feb/2006:10:18:37 -0800
Name: 0, dtype: object
# Previous code:
# day, month, rest = pertinent.split('/')
df1 = (
# expand=True creates a column for each element of the split
s1.str.split("/", expand=True)
.rename(columns={0: "Day", 1: "Month", 2: "Rest"})
)
df1
Day | Month | Rest | |
---|---|---|---|
0 | 26 | Jan | 2014:10:47:58 -0800 |
1 | 2 | Feb | 2005:17:23:6 -0800 |
2 | 3 | Feb | 2006:10:18:37 -0800 |
# Previous code:
# year, hour, minute, rest2 = rest.split(':')
rest_df = (
df1["Rest"].str.split(":", expand=True)
.rename(columns={0: "Year", 1: "Hour", 2: "Minute", 3: "Rest2"})
)
display(rest_df)
Year | Hour | Minute | Rest2 | |
---|---|---|---|---|
0 | 2014 | 10 | 47 | 58 -0800 |
1 | 2005 | 17 | 23 | 6 -0800 |
2 | 2006 | 10 | 18 | 37 -0800 |
df2 = (
# merge based on the index, not a particular column
df1.merge(rest_df, left_index=True, right_index=True)
.drop(columns=["Rest"])
)
df2
Day | Month | Year | Hour | Minute | Rest2 | |
---|---|---|---|---|---|---|
0 | 26 | Jan | 2014 | 10 | 47 | 58 -0800 |
1 | 2 | Feb | 2005 | 17 | 23 | 6 -0800 |
2 | 3 | Feb | 2006 | 10 | 18 | 37 -0800 |
# Previous code:
# seconds, time_zone = rest2.split(' ')
rest2_df = (
df2["Rest2"].str.split(" ", expand=True)
.rename(columns = {0: "Seconds", 1: "Timezone"})
)
rest2_df
Seconds | Timezone | |
---|---|---|
0 | 58 | -0800 |
1 | 6 | -0800 |
2 | 37 | -0800 |
df3 = (
df2.merge(rest2_df, left_index=True, right_index=True)
.drop(columns=["Rest2"])
)
print("Final Dataframe:")
display(df3)
Final Dataframe:
Day | Month | Year | Hour | Minute | Seconds | Timezone | |
---|---|---|---|---|---|---|---|
0 | 26 | Jan | 2014 | 10 | 47 | 58 | -0800 |
1 | 2 | Feb | 2005 | 17 | 23 | 6 | -0800 |
2 | 3 | Feb | 2006 | 10 | 18 | 37 | -0800 |
You may see code like this in data cleaning pipelines.
However, regular expressions provide a faster and more expressive mechanism to extract strings that match certain patterns.
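As a quick preview (a sketch using the same first log line; the regex syntax is explained in the rest of this lecture, and this exact pattern reappears in the bonus section below), a single pattern can pull out all seven timestamp components at once. The variable name `preview_pattern` is just ours:

import re

# One regular expression replaces the entire chain of splits above
preview_pattern = r'\[(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+) (.+)\]'
re.findall(preview_pattern, log_lines[0])[0]
# -> ('26', 'Jan', '2014', '10', '47', '58', '-0800')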
Instructor note: Return to lecture!
💬 Regular Expressions¶
regex101.com is a great place to experiment with regular expressions!
Quadruple backslash example from the slides:
# prints newline
print('One backslash:')
print('\n')
# prints \n
print('Two backslashes:')
print('\\n')
# prints \ followed by newline
print('Three backslashes:')
print('\\\n')
# prints \\n
print('Four backslashes:')
print('\\\\n')
# also prints \\n, but much more obviously!
print('Raw string:')
print(r'\\n')
One backslash:


Two backslashes:
\n
Three backslashes:
\

Four backslashes:
\\n
Raw string:
\\n
Lesson: Use raw strings to simplify regular expressions in Python!
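For instance (a small illustrative sketch, not from the slides), the two calls below hand the regex engine exactly the same pattern, but the raw-string spelling is much easier to read and write:

import re

# Both patterns mean "one or more digits"; only the spelling differs
print(re.findall('\\d+', 'abc123'))   # escaped backslash -> ['123']
print(re.findall(r'\d+', 'abc123'))   # raw string        -> ['123']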
🎻 String Extraction with Regex¶
Python `re.findall` returns a list of all extracted matches from a single string:
import re
text = "My social security number is 123-45-6789 bro, or actually maybe it's 321-45-6789."
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
re.findall(pattern, text)
['123-45-6789', '321-45-6789']
Now, let's see vectorized extraction in `pandas`:
`.str.findall` returns a `Series` of lists of all matches in each record.
- In other words, it effectively applies `re.findall` to each element of the `Series`.
df_ssn = pd.DataFrame(
['987-65-4321',
'forty',
'123-45-6789 bro or 321-45-6789',
'999-99-9999'],
columns=['SSN'])
df_ssn
SSN | |
---|---|
0 | 987-65-4321 |
1 | forty |
2 | 123-45-6789 bro or 321-45-6789 |
3 | 999-99-9999 |
# Series of lists
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn['SSN'].str.findall(pattern)
0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: SSN, dtype: object
Extracting individual matches:
# For example, grab the final match from each list
(
df_ssn['SSN']
.str.findall(pattern)
.str[-1]
)
0    987-65-4321
1            NaN
2    321-45-6789
3    999-99-9999
Name: SSN, dtype: object
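As a related sketch (not shown in lecture), indexing with `.str[0]` grabs the first match from each list instead; records with no match still come out as NaN:

# For example, grab the first match from each list
(
    df_ssn['SSN']
    .str.findall(pattern)
    .str[0]
)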
Instructor note: Return to slides!
The Python function `re.findall`, in combination with parentheses, returns the specific substrings (i.e., capture groups) within each match.
text = """I will meet you at 08:30:00 pm tomorrow"""
pattern = r".*(\d\d):(\d\d):(\d\d).*"
matches = re.findall(pattern, text)
matches
[('08', '30', '00')]
# The three capture groups in the first matched string
hour, minute, second = matches[0]
print("Hour: ", hour)
print("Minute: ", minute)
print("Second: ", second)
Hour: 08 Minute: 30 Second: 00
In `pandas`, we can use `.str.extract` to extract each capture group of only the first match of each record into separate columns.
# back to SSNs
df_ssn
SSN | |
---|---|
0 | 987-65-4321 |
1 | forty |
2 | 123-45-6789 bro or 321-45-6789 |
3 | 999-99-9999 |
# Will extract the first match of all groups
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['SSN'].str.extract(pattern_group_mult)
0 | 1 | 2 | |
---|---|---|---|
0 | 987 | 65 | 4321 |
1 | NaN | NaN | NaN |
2 | 123 | 45 | 6789 |
3 | 999 | 99 | 9999 |
Alternatively, `.str.extractall` extracts all matches of each record into separate columns. Rows are then MultiIndexed by the original record index and the match index.
# DataFrame, one row per match
df_ssn['SSN'].str.extractall(pattern_group_mult)
0 | 1 | 2 | ||
---|---|---|---|---|
match | ||||
0 | 0 | 987 | 65 | 4321 |
2 | 0 | 123 | 45 | 6789 |
1 | 321 | 45 | 6789 | |
3 | 0 | 999 | 99 | 9999 |
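If you only want one row per original record, one option (a sketch, not covered in lecture) is to keep just the first match from the MultiIndexed result:

# Keep only the first match (match index 0) for each record
df_ssn['SSN'].str.extractall(pattern_group_mult).xs(0, level='match')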
Instructor note: Return to Slides!
🥫 Canonicalization with Regex (`re.sub`, `Series.str.replace`)¶
In regular Python, canonicalize with `re.sub` ("substitute"):
text = '<div><td valign="top">Moo</td></div>'
pattern = r"<[^>]+>"
re.sub(pattern, '', text)
'Moo'
In `pandas`, canonicalize with `Series.str.replace`.
# example dataframe of strings
df_html = pd.DataFrame(
[
'<div><td valign="top">Moo</td></div>',
'<a href="http://ds100.org">Link</a>',
'<b>Bold text</b>'
],
columns=['Html'])
df_html
Html | |
---|---|
0 | <div><td valign="top">Moo</td></div> |
1 | <a href="http://ds100.org">Link</a> |
2 | <b>Bold text</b> |
# Series -> Series
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()
Html | |
---|---|
0 | Moo |
1 | Link |
2 | Bold text |
Instructor note: Return to lecture!
🎁 (Optional) Bonus material¶
None of the code below is covered during lecture. Nonetheless, you may find it helpful to review this code, as it's a nice example application of the regex functions from lecture, and it's relevant to the homework.
🪵 Revisiting Text Log Processing using Regex¶
Python `re` version¶
line = log_lines[0]
display(line)
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
('26', 'Jan', '2014', '10', '47', '58', '-0800')
`pandas` version¶
df = pd.DataFrame(log_lines, columns=['Log'])
df
Log | |
---|---|
0 | 169.237.46.168 - - [26/Jan/2014:10:47:58 -0800... |
1 | 193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "... |
2 | 169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800... |
Option 1: `Series.str.findall`
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
df['Log'].str.findall(pattern)
0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log, dtype: object
Option 2: `Series.str.extractall`
df['Log'].str.extractall(pattern)
0 | 1 | 2 | 3 | 4 | 5 | 6 | ||
---|---|---|---|---|---|---|---|---|
match | ||||||||
0 | 0 | 26 | Jan | 2014 | 10 | 47 | 58 | -0800 |
1 | 0 | 2 | Feb | 2005 | 17 | 23 | 6 | -0800 |
2 | 0 | 3 | Feb | 2006 | 10 | 18 | 37 | -0800 |
Wrangling either of these two DataFrames into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.
Day | Month | Year | Hour | Minute | Second | Time Zone | |
---|---|---|---|---|---|---|---|
0 | 26 | Jan | 2014 | 10 | 47 | 58 | -0800 |
1 | 2 | Feb | 2005 | 17 | 23 | 6 | -0800 |
2 | 3 | Feb | 2006 | 10 | 18 | 37 | -0800 |
# your code here (optional)
🍽️ Real World Case Study: Restaurant Data¶
In this case study, we will show how regexes allow us to track quantitative data across categories defined by the appearance of various text fields. Specifically, we'll see how the presence of certain keywords relates to quantitative data:
How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)
vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']
vio.head()
bid | date | desc | |
---|---|---|---|
0 | 19 | 20171211 | Inadequate food safety knowledge or lack of ce... |
1 | 19 | 20171211 | Unapproved or unmaintained equipment or utensils |
2 | 19 | 20160513 | Unapproved or unmaintained equipment or utensi... |
3 | 19 | 20160513 | Unclean or degraded floors walls or ceilings ... |
4 | 19 | 20160513 | Food safety certificate or food handler card n... |
counts = desc.value_counts()
counts.shape
(14253,)
That's a lot of different descriptions!! Can we canonicalize at all? Let's explore two sets of 10 rows.
counts[:10]
desc
Unclean or degraded floors walls or ceilings                           999
Unapproved or unmaintained equipment or utensils                       659
Inadequately cleaned or sanitized food contact surfaces                493
Improper food storage                                                  476
Inadequate and inaccessible handwashing facilities                     467
Moderate risk food holding temperature                                 452
Wiping cloths not clean or properly stored or inadequate sanitizer     418
Moderate risk vermin infestation                                       374
Unclean nonfood contact surfaces                                       369
Food safety certificate or food handler card not available             353
Name: count, dtype: int64
# Hmmm...
counts[50:60]
desc
Unclean or degraded floors walls or ceilings [ date violation corrected: 11/29/2017 ]              16
Unclean or degraded floors walls or ceilings [ date violation corrected: 9/19/2017 ]               16
Inadequate HACCP plan record keeping                                                               16
Unclean or degraded floors walls or ceilings [ date violation corrected: 11/27/2017 ]              15
Unclean or degraded floors walls or ceilings [ date violation corrected: 12/7/2017 ]               15
Inadequately cleaned or sanitized food contact surfaces [ date violation corrected: 9/26/2017 ]     14
Unclean or degraded floors walls or ceilings [ date violation corrected: 11/28/2017 ]               14
Unclean or degraded floors walls or ceilings [ date violation corrected: 9/6/2017 ]                 14
Unapproved or unmaintained equipment or utensils [ date violation corrected: 9/19/2017 ]            14
Unapproved living quarters in food facility                                                         13
Name: count, dtype: int64
# Use regular expressions to cut out the extra info in square brackets.
vio['clean_desc'] = (vio['desc']
.str.replace(r'\s*\[.*\]$', '', regex=True)
.str.strip() # removes leading/trailing whitespace
.str.lower())
vio.head()
bid | date | desc | clean_desc | |
---|---|---|---|---|
0 | 19 | 20171211 | Inadequate food safety knowledge or lack of ce... | inadequate food safety knowledge or lack of ce... |
1 | 19 | 20171211 | Unapproved or unmaintained equipment or utensils | unapproved or unmaintained equipment or utensils |
2 | 19 | 20160513 | Unapproved or unmaintained equipment or utensi... | unapproved or unmaintained equipment or utensils |
3 | 19 | 20160513 | Unclean or degraded floors walls or ceilings ... | unclean or degraded floors walls or ceilings |
4 | 19 | 20160513 | Food safety certificate or food handler card n... | food safety certificate or food handler card n... |
# canonicalizing definitely helped
vio['clean_desc'].value_counts().shape
(68,)
vio['clean_desc'].value_counts().head()
clean_desc
unclean or degraded floors walls or ceilings                3507
moderate risk food holding temperature                      2542
inadequate and inaccessible handwashing facilities          2529
unapproved or unmaintained equipment or utensils            2382
inadequately cleaned or sanitized food contact surfaces     2301
Name: count, dtype: int64
Remember our research question:
How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)
Below, we use regular expressions and `df.assign()` (documentation) to method-chain the creation of new Boolean features, one per keyword.
# use regular expressions to assign new features for the presence of various keywords
# regex metacharacter |
with_features = (vio
.assign(is_unclean = vio['clean_desc'].str.contains('clean|sanit'))
.assign(is_high_risk = vio['clean_desc'].str.contains('high risk'))
.assign(is_vermin = vio['clean_desc'].str.contains('vermin'))
.assign(is_surface = vio['clean_desc'].str.contains('wall|ceiling|floor|surface'))
.assign(is_human = vio['clean_desc'].str.contains('hand|glove|hair|nail'))
.assign(is_permit = vio['clean_desc'].str.contains('permit|certif'))
)
with_features.head()
bid | date | desc | clean_desc | is_unclean | is_high_risk | is_vermin | is_surface | is_human | is_permit | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 19 | 20171211 | Inadequate food safety knowledge or lack of ce... | inadequate food safety knowledge or lack of ce... | False | False | False | False | False | True |
1 | 19 | 20171211 | Unapproved or unmaintained equipment or utensils | unapproved or unmaintained equipment or utensils | False | False | False | False | False | False |
2 | 19 | 20160513 | Unapproved or unmaintained equipment or utensi... | unapproved or unmaintained equipment or utensils | False | False | False | False | False | False |
3 | 19 | 20160513 | Unclean or degraded floors walls or ceilings ... | unclean or degraded floors walls or ceilings | True | False | False | True | False | False |
4 | 19 | 20160513 | Food safety certificate or food handler card n... | food safety certificate or food handler card n... | False | False | False | False | True | True |
📊 EDA¶
That's the end of our text wrangling. Now let's do some EDA to examine restaurant health scores as a function of the number of violation keywords.
To do so, we'll first group so that our granularity is one inspection of a business on a particular date. Summing the Boolean columns then counts the number of violations mentioning each keyword for a given inspection.
count_features = (with_features
.groupby(['bid', 'date'])
.sum(numeric_only=True)
.reset_index()
)
count_features.iloc[255:260, :]
bid | date | is_unclean | is_high_risk | is_vermin | is_surface | is_human | is_permit | |
---|---|---|---|---|---|---|---|---|
255 | 489 | 20150728 | 5 | 0 | 2 | 3 | 0 | 0 |
256 | 489 | 20150807 | 1 | 0 | 0 | 1 | 0 | 0 |
257 | 489 | 20160308 | 2 | 2 | 1 | 0 | 1 | 0 |
258 | 489 | 20160721 | 2 | 1 | 1 | 1 | 0 | 1 |
259 | 489 | 20161220 | 3 | 0 | 1 | 2 | 0 | 0 |
For example, here are the inspections that had more than one vermin-related violation:
count_features[count_features['is_vermin'] > 1].head(5)
bid | date | is_unclean | is_high_risk | is_vermin | is_surface | is_human | is_permit | |
---|---|---|---|---|---|---|---|---|
255 | 489 | 20150728 | 5 | 0 | 2 | 3 | 0 | 0 |
291 | 527 | 20170821 | 1 | 1 | 2 | 1 | 1 | 1 |
1508 | 2622 | 20160526 | 4 | 2 | 2 | 3 | 0 | 0 |
1573 | 2721 | 20150422 | 2 | 1 | 2 | 1 | 0 | 0 |
1746 | 2945 | 20150921 | 2 | 1 | 2 | 2 | 2 | 1 |
Now we'll reshape this "wide" table into a "tidy" table using a pandas feature called `pd.melt` (documentation), which we won't describe in any detail, other than that it's effectively the inverse of `pd.pivot_table`.
Our granularity is now a violation type for a given inspection (for a business on a particular date).
violation_type_df = pd.melt(count_features, id_vars=['bid', 'date'],
var_name='feature', value_name='num_vios')
# show a particular inspection's results
violation_type_df[(violation_type_df['bid'] == 489) & (violation_type_df['date'] == 20150728)]
bid | date | feature | num_vios | |
---|---|---|---|---|
255 | 489 | 20150728 | is_unclean | 5 |
12517 | 489 | 20150728 | is_high_risk | 0 |
24779 | 489 | 20150728 | is_vermin | 2 |
37041 | 489 | 20150728 | is_surface | 3 |
49303 | 489 | 20150728 | is_human | 0 |
61565 | 489 | 20150728 | is_permit | 0 |
Remember our research question:
How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)
We have the second half of this question! Now let's join our table with the inspection scores, located in `inspections.csv`.
# read in the scores
inspection_df = pd.read_csv('data/inspections.csv',
header=0,
usecols=[0, 1, 2],
names=['bid', 'score', 'date'])
inspection_df.head()
bid | score | date | |
---|---|---|---|
0 | 19 | 94 | 20160513 |
1 | 19 | 94 | 20171211 |
2 | 24 | 98 | 20171101 |
3 | 24 | 98 | 20161005 |
4 | 24 | 96 | 20160311 |
While the inspection scores were stored in a separate file from the violation descriptions, we notice that the primary key in inspections is (`bid`, `date`)! So we can reference this key in our join.
# join scores with the table broken down by violation type
violation_type_and_scores = (
violation_type_df
.merge(inspection_df, on=['bid', 'date'])
)
violation_type_and_scores.head(12)
bid | date | feature | num_vios | score | |
---|---|---|---|---|---|
0 | 19 | 20160513 | is_unclean | 1 | 94 |
1 | 19 | 20160513 | is_high_risk | 0 | 94 |
2 | 19 | 20160513 | is_vermin | 0 | 94 |
3 | 19 | 20160513 | is_surface | 1 | 94 |
4 | 19 | 20160513 | is_human | 1 | 94 |
5 | 19 | 20160513 | is_permit | 1 | 94 |
6 | 19 | 20171211 | is_unclean | 0 | 94 |
7 | 19 | 20171211 | is_high_risk | 0 | 94 |
8 | 19 | 20171211 | is_vermin | 0 | 94 |
9 | 19 | 20171211 | is_surface | 0 | 94 |
10 | 19 | 20171211 | is_human | 0 | 94 |
11 | 19 | 20171211 | is_permit | 1 | 94 |
Let's plot the distribution of scores, broken down by violation counts, for each violation feature (`is_unclean`, `is_high_risk`, `is_vermin`, `is_surface`, `is_human`, `is_permit`).
# you will learn this syntax next week. Focus on interpreting for now.
sns.catplot(x='num_vios', y='score',
col='feature', col_wrap=2,
kind='box',
data=violation_type_and_scores);
Above we can observe:
- The inspection score generally goes down as the number of violations increases, as expected.
- Depending on the violation keyword, inspection scores on average go down at slightly different rates.
- For example, if a restaurant inspection involved 2 violations with the keyword "vermin", the typical score for that inspection is a little below 80.
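To check that last observation numerically, one option (a sketch; output not shown here) is to compute the median score grouped by the number of vermin-related violations:

# Median inspection score, by number of violations mentioning "vermin"
(
    violation_type_and_scores[violation_type_and_scores['feature'] == 'is_vermin']
    .groupby('num_vios')['score']
    .median()
)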