Lecture 6 – Data 100, Spring 2023¶

Lisa Yan
Content by Lisa Yan, Will Fithian, Joseph Gonzalez, Deborah Nolan, Sam Lau

Working with text: string methods and regular expressions

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import zipfile




Extracting Log Data¶

In [2]:
log_fname = 'data/log.txt'
!cat {log_fname}
169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"
193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"
169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"
In [3]:
with open(log_fname, 'r') as f:
    log_lines = f.readlines()
In [4]:
log_lines
Out[4]:
['169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n',
 '193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n',
 '169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"\n']

Suppose we want to extract the day, month, year, hour, minute, second, and time zone. Looking at the data, we see that these items are not in a fixed position relative to the beginning of the string. That is, slicing by some fixed offset isn't going to work.

In [5]:
log_lines[0][20:31]
Out[5]:
'26/Jan/2014'
In [6]:
log_lines[1][20:31]
Out[6]:
'/Feb/2005:1'

Instead, we'll need to use some more sophisticated thinking. Let's focus on only the first line of the file.

In [7]:
first = log_lines[0]
first
Out[7]:
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
In [8]:
pertinent = first.split("[")[1].split(']')[0]
day, month, rest = pertinent.split('/')
year, hour, minute, rest = rest.split(':')
seconds, time_zone = rest.split(' ')
day, month, year, hour, minute, seconds, time_zone
Out[8]:
('26', 'Jan', '2014', '10', '47', '58', '-0800')

A more sophisticated, but very common, approach is to extract the information we need using a regular expression.
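As a quick preview, here is a minimal sketch of what that looks like for the first log line; the pattern itself is developed step by step later in this notebook, so don't worry about reading it yet:

import re
preview_pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
re.findall(preview_pattern, first)[0]
# ('26', 'Jan', '2014', '10', '47', '58', '-0800')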





Regular Expressions¶

String Extraction with Regex¶

Python re.findall returns a list of all extracted matches:

In [9]:
import re

text = "My social security number is 123-45-6789 bro, or actually maybe it’s 321-45-6789.";
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
re.findall(pattern, text)
Out[9]:
['123-45-6789', '321-45-6789']


Now, let's see vectorized extraction in pandas:

.str.findall returns a Series of lists of all matches in each record.

In [10]:
df_ssn = pd.DataFrame(
    ['987-65-4321',
     'forty',
     '123-45-6789 bro or 321-45-6789',
     '999-99-9999'],
    columns=['SSN'])
df_ssn
Out[10]:
SSN
0 987-65-4321
1 forty
2 123-45-6789 bro or 321-45-6789
3 999-99-9999
In [11]:
# -> Series of lists
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
df_ssn['SSN'].str.findall(pattern)
Out[11]:
0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: SSN, dtype: object



Bonus: There is one very similar pandas operation, .str.extract(), which returns a DataFrame of only the first match in each record. We'll explore this a bit more next.

In [12]:
# -> DataFrame of first match group
pattern_group = r"([0-9]{3}-[0-9]{2}-[0-9]{4})" # 1 group
df_ssn['SSN'].str.extract(pattern_group)
Out[12]:
0
0 987-65-4321
1 NaN
2 123-45-6789
3 999-99-9999

Regex Groups (Capture groups)¶

The Python function re.findall, when the pattern contains parentheses, returns the specific substrings (i.e., capture groups) within each matched string, or match.

In [13]:
text = """Observations: 03:04:53 - Horse awakens.
03:05:14 - Horse goes back to sleep."""       
pattern = r"(\d\d):(\d\d):(\d\d) - (.*)"
matches = re.findall(pattern, text)
matches
Out[13]:
[('03', '04', '53', 'Horse awakens.'),
 ('03', '05', '14', 'Horse goes back to sleep.')]
In [14]:
# the four capture groups in the first matched string
hour, minute, second, description = matches[0]


In pandas, we can use .str.extract to extract each capture group of only the first match of each record into separate columns.

In [15]:
# back to SSNs
df_ssn
Out[15]:
SSN
0 987-65-4321
1 forty
2 123-45-6789 bro or 321-45-6789
3 999-99-9999
In [16]:
# Will extract first match of all groups
pattern_group_mult = r"([0-9]{3})-([0-9]{2})-([0-9]{4})" # 3 groups
df_ssn['SSN'].str.extract(pattern_group_mult)
Out[16]:
0 1 2
0 987 65 4321
1 NaN NaN NaN
2 123 45 6789
3 999 99 9999

Alternatively, .str.extractall extracts all matches of each record into separate columns. Rows are then MultiIndexed by original record index and match index.

In [17]:
# -> DataFrame, one row per match
df_ssn['SSN'].str.extractall(pattern_group_mult)
Out[17]:
0 1 2
match
0 0 987 65 4321
2 0 123 45 6789
1 321 45 6789
3 0 999 99 9999

Canonicalization with Regex¶

In regular Python, canonicalize with re.sub (standing for "substitute"):

In [18]:
text = '<div><td valign="top">Moo</td></div>'
pattern = r"<[^>]+>"
re.sub(pattern, '', text)
Out[18]:
'Moo'


In pandas, canonicalize with Series.str.replace.

In [47]:
# example dataframe of strings
df_html = pd.DataFrame(['<div><td valign="top">Moo</td></div>',
                   '<a href="http://ds100.org">Link</a>',
                   '<b>Bold text</b>'], columns=['Html'])
df_html
Out[47]:
Html
0 <div><td valign="top">Moo</td></div>
1 <a href="http://ds100.org">Link</a>
2 <b>Bold text</b>
In [20]:
# Series -> Series
df_html["Html"].str.replace(pattern, '', regex=True).to_frame()
Out[20]:
Html
0 Moo
1 Link
2 Bold text


Revisiting Text Log Processing using Regex¶

Python version:

In [21]:
line = log_lines[0]
display(line)

pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
day, month, year, hour, minute, second, time_zone = re.findall(pattern, line)[0] # get first match
day, month, year, hour, minute, second, time_zone
'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'
Out[21]:
('26', 'Jan', '2014', '10', '47', '58', '-0800')



Pandas version¶

In [22]:
df = pd.DataFrame(log_lines, columns=['Log'])
df
Out[22]:
Log
0 169.237.46.168 - - [26/Jan/2014:10:47:58 -0800...
1 193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "...
2 169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800...

Option 1: Series.str.findall

In [23]:
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
df['Log'].str.findall(pattern)
Out[23]:
0    [(26, Jan, 2014, 10, 47, 58, -0800)]
1      [(2, Feb, 2005, 17, 23, 6, -0800)]
2     [(3, Feb, 2006, 10, 18, 37, -0800)]
Name: Log, dtype: object


Option 2: Series.str.extractall

In [24]:
df['Log'].str.extractall(pattern)
Out[24]:
0 1 2 3 4 5 6
match
0 0 26 Jan 2014 10 47 58 -0800
1 0 2 Feb 2005 17 23 6 -0800
2 0 3 Feb 2006 10 18 37 -0800

Wrangling either of these two DataFrames into a nice format (like below) is left as an exercise for you! You will do a related problem on the homework.

Day Month Year Hour Minute Second Time Zone
0 26 Jan 2014 10 47 58 -0800
1 2 Feb 2005 17 23 6 -0800
2 3 Feb 2006 10 18 37 -0800
In [25]:
# your code here
...
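If you want a head start on that exercise, here is one possible sketch (not necessarily the approach the homework expects): relabel the capture-group columns of the extractall result, then drop the match level of the index.

# One possible sketch: rename the columns, then drop the 'match' index level
cols = ['Day', 'Month', 'Year', 'Hour', 'Minute', 'Second', 'Time Zone']
(df['Log'].str.extractall(pattern)
    .set_axis(cols, axis='columns')
    .droplevel('match'))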



Bonus: Regular expressions can be compiled and used as an object¶

In [26]:
rx = re.compile(pattern)
rx
Out[26]:
re.compile(r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]', re.UNICODE)
In [27]:
rx.search(line)
Out[27]:
<re.Match object; span=(19, 47), match='[26/Jan/2014:10:47:58 -0800]'>
In [28]:
out = rx.search(line)
In [29]:
out = rx.search(line)
out.group(0)
Out[29]:
'[26/Jan/2014:10:47:58 -0800]'

This lets you write conditional code more easily:

In [30]:
inputs = [line, "blah blah blah"]
for l in inputs:
    out = rx.search(l)
    if out:
        print(out.group(0))
    else:
        print(f'*** No match for: {l[0:5]} ...')
[26/Jan/2014:10:47:58 -0800]
*** No match for: blah  ...
In [31]:
# beyond the scope of lecture, but left here for your interest
day, month, year, hour, minute, second, time_zone = re.search(pattern, line).groups()
day, month, year, hour, minute, second, time_zone
Out[31]:
('26', 'Jan', '2014', '10', '47', '58', '-0800')





Real World Example #1: Restaurant Data¶

In this example, we will show how regexes allow us to track quantitative data across categories defined by the appearance of various text fields.

Specifically, we'll see how the presence of certain keywords relates to quantitative data:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)

In [32]:
vio = pd.read_csv('data/violations.csv', header=0, names=['bid', 'date', 'desc'])
desc = vio['desc']
vio.head()
Out[32]:
bid date desc
0 19 20171211 Inadequate food safety knowledge or lack of ce...
1 19 20171211 Unapproved or unmaintained equipment or utensils
2 19 20160513 Unapproved or unmaintained equipment or utensi...
3 19 20160513 Unclean or degraded floors walls or ceilings ...
4 19 20160513 Food safety certificate or food handler card n...
In [33]:
counts = desc.value_counts()
counts.shape
Out[33]:
(14253,)

That's a lot of different descriptions!! Can we canonicalize at all? Let's explore two sets of 10 rows.

In [34]:
counts[:10]
Out[34]:
Unclean or degraded floors walls or ceilings                          999
Unapproved or unmaintained equipment or utensils                      659
Inadequately cleaned or sanitized food contact surfaces               493
Improper food storage                                                 476
Inadequate and inaccessible handwashing facilities                    467
Moderate risk food holding temperature                                452
Wiping cloths not clean or properly stored or inadequate sanitizer    418
Moderate risk vermin infestation                                      374
Unclean nonfood contact surfaces                                      369
Food safety certificate or food handler card not available            353
Name: desc, dtype: int64
In [35]:
# Hmmm...
counts[50:60]
Out[35]:
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/29/2017 ]              16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/19/2017 ]               16
Inadequate HACCP plan record keeping                                                                16
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/27/2017 ]              15
Unclean or degraded floors walls or ceilings  [ date violation corrected: 12/7/2017 ]               15
Inadequately cleaned or sanitized food contact surfaces  [ date violation corrected: 9/26/2017 ]    14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 11/28/2017 ]              14
Unclean or degraded floors walls or ceilings  [ date violation corrected: 9/6/2017 ]                14
Unapproved or unmaintained equipment or utensils  [ date violation corrected: 9/19/2017 ]           14
Unapproved  living quarters in food facility                                                        13
Name: desc, dtype: int64
In [36]:
# Use regular expressions to cut out the extra info in square braces.
vio['clean_desc'] = (vio['desc']
             .str.replace(r'\s*\[.*\]$', '', regex=True)
             .str.strip()       # removes leading/trailing whitespace
             .str.lower())
vio.head()
Out[36]:
bid date desc clean_desc
0 19 20171211 Inadequate food safety knowledge or lack of ce... inadequate food safety knowledge or lack of ce...
1 19 20171211 Unapproved or unmaintained equipment or utensils unapproved or unmaintained equipment or utensils
2 19 20160513 Unapproved or unmaintained equipment or utensi... unapproved or unmaintained equipment or utensils
3 19 20160513 Unclean or degraded floors walls or ceilings ... unclean or degraded floors walls or ceilings
4 19 20160513 Food safety certificate or food handler card n... food safety certificate or food handler card n...
In [37]:
# canonicalizing definitely helped
vio['clean_desc'].value_counts().shape
Out[37]:
(68,)
In [38]:
vio['clean_desc'].value_counts().head() 
Out[38]:
unclean or degraded floors walls or ceilings               3507
moderate risk food holding temperature                     2542
inadequate and inaccessible handwashing facilities         2529
unapproved or unmaintained equipment or utensils           2382
inadequately cleaned or sanitized food contact surfaces    2301
Name: clean_desc, dtype: int64

Remember our research question:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)


Below, we use regular expressions and df.assign() (documentation) to method chain our creation of new boolean features, one per keyword.

In [39]:
# use regular expressions to assign new features for the presence of various keywords
# regex metacharacter | 
with_features = (vio
 .assign(is_unclean     = vio['clean_desc'].str.contains('clean|sanit'))
 .assign(is_high_risk = vio['clean_desc'].str.contains('high risk'))
 .assign(is_vermin    = vio['clean_desc'].str.contains('vermin'))
 .assign(is_surface   = vio['clean_desc'].str.contains('wall|ceiling|floor|surface'))
 .assign(is_human     = vio['clean_desc'].str.contains('hand|glove|hair|nail'))
 .assign(is_permit    = vio['clean_desc'].str.contains('permit|certif'))
)
with_features.head()
Out[39]:
bid date desc clean_desc is_unclean is_high_risk is_vermin is_surface is_human is_permit
0 19 20171211 Inadequate food safety knowledge or lack of ce... inadequate food safety knowledge or lack of ce... False False False False False True
1 19 20171211 Unapproved or unmaintained equipment or utensils unapproved or unmaintained equipment or utensils False False False False False False
2 19 20160513 Unapproved or unmaintained equipment or utensi... unapproved or unmaintained equipment or utensils False False False False False False
3 19 20160513 Unclean or degraded floors walls or ceilings ... unclean or degraded floors walls or ceilings True False False True False False
4 19 20160513 Food safety certificate or food handler card n... food safety certificate or food handler card n... False False False False True True



EDA¶

That's the end of our text wrangling. Now let's analyze restaurant health as a function of the number of violation keywords.

To do so, we'll first group so that our granularity is one inspection for a business on a particular date. This effectively counts the number of violations by keyword for a given inspection.

In [40]:
count_features = (with_features
 .groupby(['bid', 'date'])
 .sum()
 .reset_index()
)
count_features.iloc[255:260, :]
Out[40]:
bid date is_unclean is_high_risk is_vermin is_surface is_human is_permit
255 489 20150728 5 0 2 3 0 0
256 489 20150807 1 0 0 1 0 0
257 489 20160308 2 2 1 0 1 0
258 489 20160721 2 1 1 1 0 1
259 489 20161220 3 0 1 2 0 0
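One caveat: on recent pandas versions, calling .sum() on a grouped frame that still contains the string columns desc and clean_desc may raise an error or concatenate strings rather than silently dropping them. A hedged variant that sums only the boolean keyword columns is sketched below; it produces the same count_features table.

# Sketch: restrict the sum to the boolean keyword columns (column names as defined above)
keyword_cols = ['is_unclean', 'is_high_risk', 'is_vermin',
                'is_surface', 'is_human', 'is_permit']
count_features = (with_features
 .groupby(['bid', 'date'])[keyword_cols]
 .sum()
 .reset_index()
)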

Check out our new dataframe in action:

In [41]:
count_features[count_features['is_vermin'] > 1].head(5)
Out[41]:
bid date is_unclean is_high_risk is_vermin is_surface is_human is_permit
255 489 20150728 5 0 2 3 0 0
291 527 20170821 1 1 2 1 1 1
1508 2622 20160526 4 2 2 3 0 0
1573 2721 20150422 2 1 2 1 0 0
1746 2945 20150921 2 1 2 2 2 1

Now we'll reshape this "wide" table into a "tidy" table using a pandas feature called pd.melt (documentation) which we won't describe in any detail, other than that it's effectively the inverse of pd.pivot_table.

Our granularity is now a violation type for a given inspection (for a business on a particular date).
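If pd.melt is new to you, here is a tiny illustration on invented data (the column names mimic ours, but the values are made up):

# Toy example of pd.melt: one row per (id, feature) pair
wide = pd.DataFrame({'bid': [1, 2], 'is_vermin': [2, 0], 'is_permit': [0, 1]})
pd.melt(wide, id_vars=['bid'], var_name='feature', value_name='num_vios')
#    bid    feature  num_vios
# 0    1  is_vermin         2
# 1    2  is_vermin         0
# 2    1  is_permit         0
# 3    2  is_permit         1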

In [42]:
violation_type_df = pd.melt(count_features, id_vars=['bid', 'date'],
            var_name='feature', value_name='num_vios')

# show a particular inspection's results
violation_type_df[(violation_type_df['bid'] == 489) & (violation_type_df['date'] == 20150728)]
Out[42]:
bid date feature num_vios
255 489 20150728 is_unclean 5
12517 489 20150728 is_high_risk 0
24779 489 20150728 is_vermin 2
37041 489 20150728 is_surface 3
49303 489 20150728 is_human 0
61565 489 20150728 is_permit 0

Remember our research question:

How do restaurant health scores vary as a function of the number of violations that mention a particular keyword?
(e.g., unclean surfaces, vermin, permits, etc.)


We have the second half of this question! Now let's join our table with the inspection scores, located in inspections.csv.

In [43]:
# read in the scores
inspection_df = pd.read_csv('data/inspections.csv',
                  header=0,
                  usecols=[0, 1, 2],
                  names=['bid', 'score', 'date'])
inspection_df.head()
Out[43]:
bid score date
0 19 94 20160513
1 19 94 20171211
2 24 98 20171101
3 24 98 20161005
4 24 96 20160311

While the inspection scores were stored in a separate file from the violation descriptions, we notice that the primary key in inspections is (bid, date)! So we can reference this key in our join.

In [44]:
# join scores with the table broken down by violation type
violation_type_and_scores = (
    violation_type_df
    .merge(inspection_df, on=['bid', 'date'])
)
violation_type_and_scores.head(12)
Out[44]:
bid date feature num_vios score
0 19 20160513 is_unclean 1 94
1 19 20160513 is_high_risk 0 94
2 19 20160513 is_vermin 0 94
3 19 20160513 is_surface 1 94
4 19 20160513 is_human 1 94
5 19 20160513 is_permit 1 94
6 19 20171211 is_unclean 0 94
7 19 20171211 is_high_risk 0 94
8 19 20171211 is_vermin 0 94
9 19 20171211 is_surface 0 94
10 19 20171211 is_human 0 94
11 19 20171211 is_permit 1 94




Let's plot the distribution of scores, broken down by violation counts, for each inspection feature (is_unclean, is_high_risk, is_vermin, is_surface, is_human, is_permit).

In [45]:
# you will learn this syntax next week. Focus on interpreting for now.
sns.catplot(x='num_vios', y='score',
               col='feature', col_wrap=2,
               kind='box',
               data=violation_type_and_scores);

Above we can observe:

  • The inspection score generally goes down with increasing numbers of violations, as expected.
  • Depending on the violation keyword, inspection scores on average go down at slightly different rates.
  • For example, if a restaurant inspection involved 2 violations with the keyword "vermin", the average score for that inspection would be a little bit below 80 (see the quick numeric check below).
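Here is the quick numeric check referenced above (a sketch; it simply averages scores over the relevant rows of the joined table):

# Mean score among inspections with exactly 2 "vermin"-keyword violations
vermin_2 = violation_type_and_scores[
    (violation_type_and_scores['feature'] == 'is_vermin')
    & (violation_type_and_scores['num_vios'] == 2)
]
vermin_2['score'].mean()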

Bonus Content: Using pd.to_datetime to Extract Time Information¶

Date parsing using pd.to_datetime.

In [46]:
pd.Series(log_lines).str.extract(r'\[(.*) -0800\]').apply(
    lambda s: pd.to_datetime(s, format='%d/%b/%Y:%H:%M:%S'))
Out[46]:
0
0 2014-01-26 10:47:58
1 2005-02-02 17:23:06
2 2006-02-03 10:18:37