Text Wrangling and Regex¶

Adapted from Lisa Yan, Will Fithian, Joseph Gonzalez, Deborah Nolan, Sam Lau

Updated by Bella Crouch

Working with text: applying string methods and regular expressions

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile

Demo 1: Canonicalizing County Names¶

In [2]:
states = pd.read_csv("data/county_and_state.csv")
populations = pd.read_csv("data/county_and_population.csv")

# display allows us to view a DataFrame without returning it as an object
display(states)

display(populations)
County State
0 De Witt County IL
1 Lac qui Parle County MN
2 Lewis and Clark County MT
3 St John the Baptist Parish LS
County Population
0 DeWitt 16798
1 Lac Qui Parle 8067
2 Lewis & Clark 55716
3 St. John the Baptist 43044

Both of these DataFrames share a "County" column. Unfortunately, formatting differences mean that we can't directly merge the two DataFrames on their "County" values.

In [3]:
states.merge(populations, left_on="County", right_on="County")
Out[3]:
County State Population

The merge returns no rows. To address this, we can canonicalize the "County" strings into a common format.

In [4]:
def canonicalize_county(county_series):
    # Note: in recent pandas versions, Series.str.replace treats the pattern
    # as a literal string by default (regex=False), so '.' is not a wildcard here.
    return (county_series.str.lower()               # lowercase
            .str.replace(' ', '')                   # remove spaces
            .str.replace('&', 'and')                # replace & with "and"
            .str.replace('.', '')                   # remove periods
            .str.replace('county', '')              # remove "county"
            .str.replace('parish', '')              # remove "parish"
            )
In [5]:
states["County"] = canonicalize_county(states["County"])
populations["County"] = canonicalize_county(populations["County"])

display(states)

display(populations)
County State
0 dewitt IL
1 lacquiparle MN
2 lewisandclark MT
3 stjohnthebaptist LS
County Population
0 dewitt 16798
1 lacquiparle 8067
2 lewisandclark 55716
3 stjohnthebaptist 43044

Now, the merge works as expected!

In [6]:
states.merge(populations, left_on="County", right_on="County")
Out[6]:
County State Population
0 dewitt IL 16798
1 lacquiparle MN 8067
2 lewisandclark MT 55716
3 stjohnthebaptist LS 43044

Demo 2: Extracting Log Data¶

In [7]:
log_fname = 'data/log.txt'
with open(log_fname, 'r') as f:
    log_lines = f.readlines()
log_lines
Out[7]:
['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
 '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
 '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']

Suppose we want to extract the day, month, year, hour, minute, second, and time zone. Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string, so slicing by a fixed offset isn't going to work.

In [8]:
log_lines[0][20:31]
Out[8]:
'26/Jan/2014'
In [9]:
log_lines[1][20:31]
Out[9]:
'/Feb/2005:1'

Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.

In [10]:
first = log_lines[0]
first
Out[10]:
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
In [11]:
pertinent = first.split("[")[1].split(']')[0] # find the text enclosed in square brackets
day, month, rest = pertinent.split('/')       # split up the day/month/year
year, hour, minute, rest = rest.split(':')    # split the year from hour:minute:second
seconds, time_zone = rest.split(' ')          # split the seconds from the timezone
day, month, year, hour, minute, seconds, time_zone
Out[11]:
('26', 'Jan', '2014', '10', '47', '58', '-0800')

This worked, but it felt fairly "hacky": the code above is brittle and not particularly elegant. A more elegant and very common approach is to extract the information we need using a regular expression.

Regular Expressions¶

String Extraction with Regex¶

Python re.findall returns a list of all extracted matches:

In [12]:
import re

text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789.";
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
re.findall(pattern, text)
Out[12]:
['123-45-6789', '321-45-6789']
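
If we're going to apply the same pattern many times, we can compile it once with re.compile and reuse the resulting object, which exposes the same matching methods. A minimal sketch using only the standard library:

# compile the pattern once, then reuse it across many strings
ssn_re = re.compile(r"[0-9]{3}-[0-9]{2}-[0-9]{4}")

ssn_re.findall("call me at 123-45-6789")  # ['123-45-6789']
ssn_re.search("no ssn here")              # None (no match)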


Now, let's see vectorized extraction in pandas:

.str.findall returns a Series of lists of all matches in each record.

In [13]:
df_ssn = pd.DataFrame(
    ['987-65-4321',
     'forty',
     '123-45-6789 bro or 321-45-6789',
     '999-99-9999'],
    columns=['SSN'])
df_ssn
Out[13]:
SSN
0 987-65-4321
1 forty
2 123-45-6789 bro or 321-45-6789
3 999-99-9999
In [14]:
# -> Series of lists
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn['SSN'].str.findall(pattern)
Out[14]:
0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: SSN, dtype: object
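
Since .str.findall returns a Series of lists, we can chain .str.len() to count the number of matches in each record (a small sketch on the same data):

# number of SSN-shaped matches per record
df_ssn['SSN'].str.findall(pattern).str.len()  # 1, 0, 2, 1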

Extraction Using Regex Capture Groups¶

When the pattern contains parentheses, the Python function re.findall returns the capture groups (i.e., the parenthesized substrings) of each match, rather than the full matched string.

In [15]:
text = """I will meet you at 08:30:00 pm tomorrow"""       
pattern = ".*(\d\d):(\d\d):(\d\d).*"
matches = re.findall(pattern, text)
matches
Out[15]:
[('08', '30', '00')]
In [16]:
# the three capture groups in the first matched string
hour, minute, second = matches[0]
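
Capture groups can also be named using the (?P<name>...) syntax, which makes the extracted pieces self-documenting. A minimal sketch with re.search:

# named capture groups: access pieces by name instead of position
pattern = r"(?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d)"
m = re.search(pattern, "I will meet you at 08:30:00 pm tomorrow")
m.group('hour'), m.group('minute'), m.group('second')  # ('08', '30', '00')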


In pandas, we can use .str.extract to pull the capture groups of the first match in each record into separate columns.

In [17]:
# back to SSNs
df_ssn
Out[17]:
SSN
0 987-65-4321
1 forty
2 123-45-6789 bro or 321-45-6789
3 999-99-9999
In [18]:
# Will extract the first match of all groups
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['SSN'].str.extract(pattern_group_mult)
Out[18]:
0 1 2
0 987 65 4321
1 NaN NaN NaN
2 123 45 6789
3 999 99 9999
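
.str.extract also understands named capture groups and uses the group names as column labels instead of 0, 1, 2. A small sketch on the same data (the names here are our own):

# named groups become the column names of the extracted DataFrame
pattern_named = r"(?P<area>[0-9]{3})-(?P<group>[0-9]{2})-(?P<serial>[0-9]{4})"
df_ssn['SSN'].str.extract(pattern_named)  # columns: area, group, serial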

Alternatively, .str.extractall extracts all matches of each record into separate columns. Rows are then MultiIndexed by original record index and match index.

In [19]:
# -> DataFrame, one row per match
df_ssn['SSN'].str.extractall(pattern_group_mult)
Out[19]:
0 1 2
match
0 0 987 65 4321
2 0 123 45 6789
1 321 45 6789
3 0 999 99 9999

Canonicalization with Regex¶

In regular Python, canonicalize with re.sub (standing for "substitute"):

In [20]:
text = '<div><td valign="top">Moo</td></div>'
pattern = r"<[^>]+>"
re.sub(pattern, '', text)
Out[20]:
'Moo'
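
The replacement string passed to re.sub can reference capture groups with \1, \2, and so on, which is useful when canonicalizing means reshaping text rather than deleting it. A minimal sketch (the masking format is our own choice):

# mask all but the last four digits of each SSN
re.sub(r"\d{3}-\d{2}-(\d{4})", r"XXX-XX-\1", "123-45-6789 or 321-45-6789")
# -> 'XXX-XX-6789 or XXX-XX-6789'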


In pandas, canonicalize with Series.str.replace.

In [21]:
# example dataframe of strings
df_html = pd.DataFrame(['<div><td valign="top">Moo</td></div>',
                        '<a href="http://ds100.org">Link</a>',
                        '<b>Bold text</b>'], columns=['Html'])
df_html
Out[21]:
Html
0 <div><td valign="top">Moo</td></div>
1 <a href="http://ds100.org">Link</a>
2 <b>Bold text</b>
In [22]:
# Series -> Series; reuses the HTML-tag pattern defined above
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()
Out[22]:
Html
0 Moo
1 Link
2 Bold text
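
With regex=True, Series.str.replace supports the same backreference syntax as re.sub, so capture groups work in pandas too. A small sketch that keeps just the link target from the anchor row (non-matching rows pass through unchanged):

# replace the whole <a> tag with the captured href value
df_html["Html"].str.replace(r'<a href="([^"]*)">.*</a>', r'\1', regex=True)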

Revisiting Text Log Processing using Regex¶

Python re version¶

In [23]:
line = log_lines[0]
display(line)

pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
Out[23]:
('26', 'Jan', '2014', '10', '47', '58', '-0800')

pandas version¶

In [24]:
df = pd.DataFrame(log_lines, columns=['Log'])
df
Out[24]:
Log
0 169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1 193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "...
2 169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800...

Option 1: Series.str.findall

In [25]:
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
df['Log'].str.findall(pattern)
Out[25]:
0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log, dtype: object


Option 2: Series.str.extractall

In [26]:
df['Log'].str.extractall(pattern)
Out[26]:
0 1 2 3 4 5 6
match
0 0 26 Jan 2014 10 47 58 -0800
1 0 2 Feb 2005 17 23 6 -0800
2 0 3 Feb 2006 10 18 37 -0800

Wrangling either of these two DataFrames into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.

Day Month Year Hour Minute Second Time Zone
0 26 Jan 2014 10 47 58 -0800
1 2 Feb 2005 17 23 6 -0800
2 3 Feb 2006 10 18 37 -0800
In [27]:
# your code here
...





Real World Case Study: Restaurant Data¶

In this example, we will show how regexes allow us to track quantitative data across categories defined by the appearance of various text fields. Specifically, we'll see how the presence of certain keywords relates to quantitative data:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)

In [28]:
vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']
vio.head()
Out[28]:
bid date desc
0 19 20171211 Inadequate food safety knowledge or lack of ce...
1 19 20171211 Unapproved or unmaintained equipment or utensils
2 19 20160513 Unapproved or unmaintained equipment or utensi...
3 19 20160513 Unclean or degraded floors walls or ceilings ...
4 19 20160513 Food safety certificate or food handler card n...
In [29]:
counts = desc.value_counts()
counts.shape
Out[29]:
(14253,)

That's a lot of distinct descriptions! Can we canonicalize them at all? Let's explore two sets of 10 rows.

In [30]:
counts[:10]
Out[30]:
desc
Unclean or degraded floors walls or ceilings                          999
Unapproved or unmaintained equipment or utensils                      659
Inadequately cleaned or sanitized food contact surfaces               493
Improper food storage                                                 476
Inadequate and inaccessible handwashing facilities                    467
Moderate risk food holding temperature                                452
Wiping cloths not clean or properly stored or inadequate sanitizer    418
Moderate risk vermin infestation                                      374
Unclean nonfood contact surfaces                                      369
Food safety certificate or food handler card not available            353
Name: count, dtype: int64
In [31]:
# Hmmm...
counts[50:60]
Out[31]:
desc
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/29/2017 ]              16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/19/2017 ]               16
Inadequate HACCP plan record keeping                                                                16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/27/2017 ]              15
Unclean or degraded floors walls or ceilings  [ date violation corrected: 12/7/2017 ]               15
Inadequately cleaned or sanitized food contact surfaces  [ date violation corrected: 9/26/2017 ]    14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/28/2017 ]              14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/6/2017 ]                14
Unapproved or unmaintained equipment or utensils  [ date violation corrected: 9/19/2017 ]           14
Unapproved  living quarters in food facility                                                        13
Name: count, dtype: int64
In [32]:
# Use regular expressions to cut out the extra info in square braces.
vio['clean_desc'] = (vio['desc']
             .str.replace(r'\s*\[.*\]$', '', regex=True)
             .str.strip()       # removes leading/trailing whitespace
             .str.lower())
vio.head()
Out[32]:
bid date desc clean_desc
0 19 20171211 Inadequate food safety knowledge or lack of ce... inadequate food safety knowledge or lack of ce...
1 19 20171211 Unapproved or unmaintained equipment or utensils unapproved or unmaintained equipment or utensils
2 19 20160513 Unapproved or unmaintained equipment or utensi... unapproved or unmaintained equipment or utensils
3 19 20160513 Unclean or degraded floors walls or ceilings ... unclean or degraded floors walls or ceilings
4 19 20160513 Food safety certificate or food handler card n... food safety certificate or food handler card n...
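
To see what the pattern is doing on a single string: \s* consumes the whitespace before the bracket, \[.*\] matches the bracketed note, and $ anchors the match to the end of the string. A quick sketch:

s = 'Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/29/2017 ]'
re.sub(r'\s*\[.*\]$', '', s)  # 'Unclean or degraded floors walls or ceilings'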
In [33]:
# canonicalizing definitely helped
vio['clean_desc'].value_counts().shape
Out[33]:
(68,)
In [34]:
vio['clean_desc'].value_counts().head() 
Out[34]:
clean_desc
unclean or degraded floors walls or ceilings               3507
moderate risk food holding temperature                     2542
inadequate and inaccessible handwashing facilities         2529
unapproved or unmaintained equipment or utensils           2382
inadequately cleaned or sanitized food contact surfaces    2301
Name: count, dtype: int64

Remember our research question:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)


Below, we use regular expressions and df.assign() to method-chain the creation of new boolean features, one per keyword.

In [35]:
# use regular expressions to assign new features for the presence of various keywords
# regex metacharacter | 
with_features = (vio
 .assign(is_unclean   = vio['clean_desc'].str.contains('clean|sanit'))
 .assign(is_high_risk = vio['clean_desc'].str.contains('high risk'))
 .assign(is_vermin    = vio['clean_desc'].str.contains('vermin'))
 .assign(is_surface   = vio['clean_desc'].str.contains('wall|ceiling|floor|surface'))
 .assign(is_human     = vio['clean_desc'].str.contains('hand|glove|hair|nail'))
 .assign(is_permit    = vio['clean_desc'].str.contains('permit|certif'))
)
with_features.head()
Out[35]:
bid date desc clean_desc is_unclean is_high_risk is_vermin is_surface is_human is_permit
0 19 20171211 Inadequate food safety knowledge or lack of ce... inadequate food safety knowledge or lack of ce... False False False False False True
1 19 20171211 Unapproved or unmaintained equipment or utensils unapproved or unmaintained equipment or utensils False False False False False False
2 19 20160513 Unapproved or unmaintained equipment or utensi... unapproved or unmaintained equipment or utensils False False False False False False
3 19 20160513 Unclean or degraded floors walls or ceilings ... unclean or degraded floors walls or ceilings True False False True False False
4 19 20160513 Food safety certificate or food handler card n... food safety certificate or food handler card n... False False False False True True
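
Note that Series.str.contains interprets its pattern as a regular expression by default, so 'clean|sanit' matches any description containing either substring; because clean_desc is already lowercased, we don't need case=False. As a quick sanity check (a sketch; the totals depend on the data), we can count how many records mention each keyword:

# each boolean column sums to the number of records mentioning that keyword
keyword_cols = ['is_unclean', 'is_high_risk', 'is_vermin', 'is_surface', 'is_human', 'is_permit']
with_features[keyword_cols].sum()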



EDA¶

That's the end of our text wrangling. Now let's analyze restaurant health scores as a function of the number of violation keywords.

To do so, we'll first group so that our granularity is one inspection for a business on a particular date. Summing the boolean keyword columns then counts the number of violations of each type for a given inspection.

In [36]:
count_features = (with_features
 .groupby(['bid', 'date'])
 .sum(numeric_only=True)
 .reset_index()
)
count_features.iloc[255:260, :]
Out[36]:
bid date is_unclean is_high_risk is_vermin is_surface is_human is_permit
255 489 20150728 5 0 2 3 0 0
256 489 20150807 1 0 0 1 0 0
257 489 20160308 2 2 1 0 1 0
258 489 20160721 2 1 1 1 0 1
259 489 20161220 3 0 1 2 0 0

Check out our new dataframe in action:

In [37]:
count_features[count_features['is_vermin'] > 1].head(5)
Out[37]:
bid date is_unclean is_high_risk is_vermin is_surface is_human is_permit
255 489 20150728 5 0 2 3 0 0
291 527 20170821 1 1 2 1 1 1
1508 2622 20160526 4 2 2 3 0 0
1573 2721 20150422 2 1 2 1 0 0
1746 2945 20150921 2 1 2 2 2 1

Now we'll reshape this "wide" table into a "tidy" table using the pandas function pd.melt, which we won't describe in any detail other than that it's effectively the inverse of pd.pivot_table.

Our granularity is now a violation type for a given inspection (for a business on a particular date).

In [38]:
violation_type_df = pd.melt(count_features, id_vars=['bid', 'date'],
            var_name='feature', value_name='num_vios')

# show a particular inspection's results
violation_type_df[(violation_type_df['bid'] == 489) & (violation_type_df['date'] == 20150728)]
Out[38]:
bid date feature num_vios
255 489 20150728 is_unclean 5
12517 489 20150728 is_high_risk 0
24779 489 20150728 is_vermin 2
37041 489 20150728 is_surface 3
49303 489 20150728 is_human 0
61565 489 20150728 is_permit 0

Remember our research question:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)


We have the second half of this question! Now let's join our table with the inspection scores, located in inspections.csv.

In [39]:
# read in the scores
inspection_df = pd.read_csv('data/inspections.csv',
                  header=0,
                  usecols=[0, 1, 2],
                  names=['bid', 'score', 'date'])
inspection_df.head()
Out[39]:
bid score date
0 19 94 20160513
1 19 94 20171211
2 24 98 20171101
3 24 98 20161005
4 24 96 20160311

While the inspection scores were stored in a separate file from the violation descriptions, we notice that the primary key in inspections is (bid, date)! So we can reference this key in our join.

In [40]:
# join scores with the table broken down by violation type
violation_type_and_scores = (
    violation_type_df
    .merge(inspection_df, on=['bid', 'date'])
)
violation_type_and_scores.head(12)
Out[40]:
bid date feature num_vios score
0 19 20160513 is_unclean 1 94
1 19 20160513 is_high_risk 0 94
2 19 20160513 is_vermin 0 94
3 19 20160513 is_surface 1 94
4 19 20160513 is_human 1 94
5 19 20160513 is_permit 1 94
6 19 20171211 is_unclean 0 94
7 19 20171211 is_high_risk 0 94
8 19 20171211 is_vermin 0 94
9 19 20171211 is_surface 0 94
10 19 20171211 is_human 0 94
11 19 20171211 is_permit 1 94




Let's plot the distribution of scores, broken down by violation counts, for each inspection feature (is_unclean, is_high_risk, is_vermin, is_surface, is_human, is_permit).

In [41]:
# you will learn this syntax next week. Focus on interpreting for now.
sns.catplot(x='num_vios', y='score',
               col='feature', col_wrap=2,
               kind='box',
               data=violation_type_and_scores);
[Figure: box plots of inspection score versus number of violations, one panel per keyword feature]

Above we can observe:

  • The inspection score generally goes down with increasing numbers of violations, as expected.
  • Depending on the violation keyword, inspection scores on average go down at slightly different rates.
  • For example, if a restaurant inspection involved 2 violations with the keyword "vermin", the average score for that inspection would be a little below 80.