import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile


county_and_state = pd.read_csv('data/county_and_state.csv')
county_and_pop = pd.read_csv('data/county_and_population.csv')


county_and_state


county_and_pop


def canonicalize_county(county_name):
    return (
        county_name
        .lower()               # lower case
        .replace(' ', '')      # remove spaces
        .replace('&', 'and')   # replace &
        .replace('.', '')      # remove dot
        .replace('county', '') # remove county
        .replace('parish', '') # remove parish
    )


county_and_pop['clean_county'] = county_and_pop['County'].map(canonicalize_county)
county_and_state['clean_county'] = county_and_state['County'].map(canonicalize_county)

display(county_and_pop)  # display outputs even if not last line in cell - like a fancy print()
county_and_state


county_and_pop.merge(county_and_state, on='clean_county')


log_fname = 'data/log.txt'
!cat {log_fname}

169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"
193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"
169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"


with open(log_fname, 'r') as f:
    log_lines = f.readlines()


log_lines

['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
 '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
 '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']


log_lines[0][20:31]

'26/Jan/2014'


log_lines[1][20:31]

'/Feb/2005:1'


first = log_lines[0]
first

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'


pertinent = first.split("[")[1].split(']')[0]
day, month, rest = pertinent.split('/')
year, hour, minute, rest = rest.split(':')
seconds, time_zone = rest.split(' ')
day, month, year, hour, minute, seconds, time_zone

('26', 'Jan', '2014', '10', '47', '58', '-0800')


import re


text = '<div><td valign="top">Moo</td></div>'
pattern = r"<[^>]+>"
re.sub(pattern, '', text)

'Moo'


df_html = pd.DataFrame(['<div><td valign="top">Moo</td></div>',
                   '<a href="http://ds100.org">Link</a>',
                   '<b>Bold text</b>'], columns=['Html'])
df_html


# Series -> Series
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()


text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789.";
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
re.findall(pattern, text)  # ['123-45-6789', '321-45-6789']

['123-45-6789', '321-45-6789']


text = """Observations: 03:04:53 - Horse awakens.
03:05:14 - Horse goes back to sleep."""       
pattern = r"(\d\d):(\d\d):(\d\d) - (.*)"
re.findall(pattern, text)

[('03', '04', '53', 'Horse awakens.'),
 ('03', '05', '14', 'Horse goes back to sleep.')]


df_ssn = pd.DataFrame(
    ['987-65-4321',
     'forty',
     '123-45-6789 bro or 321-45-6789',
     '999-99-9999'],
    columns=['SSN'])
df_ssn


# -> Series of lists
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn['SSN'].str.findall(pattern)

0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: SSN, dtype: object


# -> DataFrame of first match group
pattern_group = r"([0-9]{3}-[0-9]{2}-[0-9]{4})" # 1 group
df_ssn['SSN'].str.extract(pattern_group)


# Will extract first match of all groups
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['SSN'].str.extract(pattern_group_mult)


# -> DataFrame, one row per match
df_ssn['SSN'].str.extractall(pattern_group_mult)


# original dataframe
df_ssn


line = log_lines[0]
display(line)

pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

('26', 'Jan', '2014', '10', '47', '58', '-0800')


rx = re.compile(pattern)
rx

re.compile(r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]', re.UNICODE)


rx.search(line)

<re.Match object; span=(19, 47), match='[26/Jan/2014:10:47:58 -0800]'>


out = rx.search(line)


out = rx.search(line)
out.group(0)

'[26/Jan/2014:10:47:58 -0800]'


inputs = [line, "blah blah blah"]
for l in inputs:
    out = rx.search(l)
    if out:
        print(out.group(0))
    else:
        print(f'*** No match for: {l[0:5]} ...')

[26/Jan/2014:10:47:58 -0800]
*** No match for: blah  ...


# beyond the scope of lecture, but left here for your interest
day, month, year, hour, minute, second, time_zone = re.search(pattern, line).groups()
day, month, year, hour, minute, second, time_zone

('26', 'Jan', '2014', '10', '47', '58', '-0800')


df = pd.DataFrame(log_lines, columns=['Log'])
df


pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
df['Log'].str.findall(pattern)

0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log, dtype: object


df['Log'].str.extractall(pattern)


# your code here
...


vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']
vio.head()


counts = desc.value_counts()
counts.shape

(14253,)


counts[:10]

Unclean or degraded floors walls or ceilings                          999
Unapproved or unmaintained equipment or utensils                      659
Inadequately cleaned or sanitized food contact surfaces               493
Improper food storage                                                 476
Inadequate and inaccessible handwashing facilities                    467
Moderate risk food holding temperature                                452
Wiping cloths not clean or properly stored or inadequate sanitizer    418
Moderate risk vermin infestation                                      374
Unclean nonfood contact surfaces                                      369
Food safety certificate or food handler card not available            353
Name: desc, dtype: int64


# Hmmm...
counts[50:60]

Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/29/2017 ]              16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/19/2017 ]               16
Inadequate HACCP plan record keeping                                                                16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/27/2017 ]              15
Unclean or degraded floors walls or ceilings  [ date violation corrected: 12/7/2017 ]               15
Inadequately cleaned or sanitized food contact surfaces  [ date violation corrected: 9/26/2017 ]    14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/28/2017 ]              14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/6/2017 ]                14
Unapproved or unmaintained equipment or utensils  [ date violation corrected: 9/19/2017 ]           14
Unapproved  living quarters in food facility                                                        13
Name: desc, dtype: int64


# Use regular expressions to cut out the extra info in square braces.
vio['clean_desc'] = (vio['desc']
             .str.replace(r'\s*\[.*\]$', '', regex=True)
             .str.strip()       # removes leading/trailing whitespace
             .str.lower())
vio.head()


# canonicalizing definitely helped
vio['clean_desc'].value_counts().shape

(68,)


vio['clean_desc'].value_counts().head()

unclean or degraded floors walls or ceilings               3507
moderate risk food holding temperature                     2542
inadequate and inaccessible handwashing facilities         2529
unapproved or unmaintained equipment or utensils           2382
inadequately cleaned or sanitized food contact surfaces    2301
Name: clean_desc, dtype: int64


# use regular expressions to assign new features for the presence of various keywords
# regex metacharacter | 
with_features = (vio
 .assign(is_unclean     = vio['clean_desc'].str.contains('clean|sanit'))
 .assign(is_high_risk = vio['clean_desc'].str.contains('high risk'))
 .assign(is_vermin    = vio['clean_desc'].str.contains('vermin'))
 .assign(is_surface   = vio['clean_desc'].str.contains('wall|ceiling|floor|surface'))
 .assign(is_human     = vio['clean_desc'].str.contains('hand|glove|hair|nail'))
 .assign(is_permit    = vio['clean_desc'].str.contains('permit|certif'))
)
with_features.head()


count_features = (with_features
 .groupby(['bid', 'date'])
 .sum()
 .reset_index()
)
count_features.iloc[255:260, :]


count_features.query('is_vermin > 1').head(5)


broken_down_by_violation_type = pd.melt(count_features, id_vars=['bid', 'date'],
            var_name='feature', value_name='num_vios')

# show a particular inspection's results
broken_down_by_violation_type.query('bid == 489 & date == 20150728')


# read in the scores
ins = pd.read_csv('data/inspections.csv',
                  header=0,
                  usecols=[0, 1, 2],
                  names=['bid', 'score', 'date'])
ins.head()


# join scores with the table broken down by violation type
violation_type_and_scores = (
    broken_down_by_violation_type
    .merge(ins, on=['bid', 'date'])
)
violation_type_and_scores.head(12)


# you will learn this syntax next week. Focus on interpreting for now.
sns.catplot(x='num_vios', y='score',
               col='feature', col_wrap=2,
               kind='box',
               data=violation_type_and_scores);


pd.Series(log_lines).str.extract(r'\[(.*) -0800\]').apply(
    lambda s: pd.to_datetime(s, format='%d/%b/%Y:%H:%M:%S'))

	County	State
0	De Witt County	IL
1	Lac qui Parle County	MN
2	Lewis and Clark County	MT
3	St John the Baptist Parish	LS

	County	Population
0	DeWitt	16798
1	Lac Qui Parle	8067
2	Lewis & Clark	55716
3	St. John the Baptist	43044

	County	Population	clean_county
0	DeWitt	16798	dewitt
1	Lac Qui Parle	8067	lacquiparle
2	Lewis & Clark	55716	lewisandclark
3	St. John the Baptist	43044	stjohnthebaptist

	County	State	clean_county
0	De Witt County	IL	dewitt
1	Lac qui Parle County	MN	lacquiparle
2	Lewis and Clark County	MT	lewisandclark
3	St John the Baptist Parish	LS	stjohnthebaptist

	County_x	Population	clean_county	County_y	State
0	DeWitt	16798	dewitt	De Witt County	IL
1	Lac Qui Parle	8067	lacquiparle	Lac qui Parle County	MN
2	Lewis & Clark	55716	lewisandclark	Lewis and Clark County	MT
3	St. John the Baptist	43044	stjohnthebaptist	St John the Baptist Parish	LS

Working with text: string methods and regular expressions¶

Canonicalization with Basic Python¶

Processing Data from a Text Log Using Basic Python¶

Regular Expressions¶

Canonicalization with Regex¶

Extraction with Regex¶

Revisiting Text Log Processing using Regex¶

Regular expressions can be compiled and used as an object¶

Pandas version¶

Real World Example #1: Restaurant Data¶

EDA¶

Bonus Content: Using pd.to_datetime to Extract Time Information¶

	Html
0	<div><td valign="top">Moo</td></div>
1	<a href="http://ds100.org">Link</a>
2	<b>Bold text</b>

	SSN
0	987-65-4321
1	forty
2	123-45-6789 bro or 321-45-6789
3	999-99-9999

	Log
0	169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1	193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "...
2	169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800...

		0	1	2	3	4	5	6
	match
0	0	26	Jan	2014	10	47	58	-0800
1	0	2	Feb	2005	17	23	6	-0800
2	0	3	Feb	2006	10	18	37	-0800

	bid	date	desc
0	19	20171211	Inadequate food safety knowledge or lack of ce...
1	19	20171211	Unapproved or unmaintained equipment or utensils
2	19	20160513	Unapproved or unmaintained equipment or utensi...
3	19	20160513	Unclean or degraded floors walls or ceilings ...
4	19	20160513	Food safety certificate or food handler card n...

	bid	date	is_unclean	is_high_risk	is_vermin	is_surface	is_human	is_permit
255	489	20150728	5	0	2	3	0	0
256	489	20150807	1	0	0	1	0	0
257	489	20160308	2	2	1	0	1	0
258	489	20160721	2	1	1	1	0	1
259	489	20161220	3	0	1	2	0	0

	bid	date	is_unclean	is_high_risk	is_vermin	is_surface	is_human	is_permit
255	489	20150728	5	0	2	3	0	0
291	527	20170821	1	1	2	1	1	1
1508	2622	20160526	4	2	2	3	0	0
1573	2721	20150422	2	1	2	1	0	0
1746	2945	20150921	2	1	2	2	2	1

	bid	date	feature	num_vios
255	489	20150728	is_unclean	5
12517	489	20150728	is_high_risk	0
24779	489	20150728	is_vermin	2
37041	489	20150728	is_surface	3
49303	489	20150728	is_human	0
61565	489	20150728	is_permit	0

	bid	score	date
0	19	94	20160513
1	19	94	20171211
2	24	98	20171101
3	24	98	20161005
4	24	96	20160311

	0
0	2014-01-26 10:47:58
1	2005-02-02 17:23:06
2	2006-02-03 10:18:37

	Html
0	Moo
1	Link
2	Bold text

	0	1	2
0	987	65	4321
1	NaN	NaN	NaN
2	123	45	6789
3	999	99	9999

		0	1	2
	match
0	0	987	65	4321
2	0	123	45	6789
2	1	321	45	6789
3	0	999	99	9999