# Notebook setup: numeric/plotting libraries, widgets, and pandas display options.
import numpy as np
import pandas as pd
# IPython magic (notebook-only syntax) — render matplotlib figures inline.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
sns.set_context('poster')
import ipywidgets as widgets
from ipywidgets import interact
import nbinteract as nbi
# Keep printed DataFrames compact.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 8
5-10 min
# Issues:
# Capitalization
# Spaces
# Punctuation
# Extra characters
# Two lists of the same four counties, written in two inconsistent formats.
# Goal: canonicalize both so matching entries compare equal.
counties1 = [
'De Witt County',
'Lac qui Parle County',
'Lewis and Clark County',
'St John the Baptist Parish',
]
counties2 = [
'DeWitt ',
'Lac Qui Parle',
'Lewis & Clark ',
'St. John the Baptist',
]
Python string methods are useful: https://docs.python.org/3/library/stdtypes.html#string-methods
In particular, string replacements, deletions, and transformations are easy to do.
# Prototype on a single value first, then scale up to the whole list.
county = counties1[0]
county
# Drop the trailing ' County' (7 characters), lowercase, remove spaces.
county[:-7].lower().replace(' ', '')
[county[:-7].lower().replace(' ', '') for county in counties1]
# The second format needs different fixes: trailing spaces, '&', and periods.
county2 = counties2[0]
county2
# Chain the cleanup steps; the parentheses let the chain span multiple lines.
(county2
.lower()
.strip()
.replace('&', 'and')
.replace(' ', '')
.replace('.', '')
)
# Same chain applied to every name in the list.
[county2
.lower()
.strip()
.replace('&', 'and')
.replace(' ', '')
.replace('.', '')
for county2 in counties2]
def clean_county(county):
    """Canonicalize a county name so differently-formatted spellings match.

    Lowercases and trims the name, drops a trailing ' county'/' parish'
    suffix, spells '&' out as 'and', and removes spaces and periods.
    """
    cleaned = county.lower().strip()
    # Apply each substitution in order (suffixes first, then punctuation).
    replacements = (
        (' county', ''),
        (' parish', ''),
        ('&', 'and'),
        (' ', ''),
        ('.', ''),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned
# Both lists now map to the same canonical names.
[clean_county(county) for county in counties1], [clean_county(county) for county in counties2]
10-15 min
# IPython shell escape (notebook-only syntax) — preview the raw log file.
!head data/smallLog.txt
# Small files can be read in completely
with open('data/smallLog.txt') as f:
log = f.readlines()
log
How can I extract the date and time?
As before, we'll start with a single line.
# Prototype on the first log line.
line = log[0]
line
# Splitting on ' [' separates the request text from the bracketed timestamp.
line.split(' [')
# Take what's after ' [' and before '] ' — just the timestamp string.
(line
.split(' [')[1]
.split('] ')[0]
)
Works, but what if I want the individual components? E.g.
['26', 'Jan', '2014', '10', '47', '58', '-0800']
# Wow, what a pain...
# '26/Jan/2014:10:47:58 -0800' split on '/' -> day, month, and the rest.
day, month, rest = (line
.split(' [')[1]
.split('] ')[0]
.split('/')
)
# rest is now '2014:10:47:58 -0800'; split on ':' peels off year/hour/minute.
year, hour, minute, rest = rest.split(':')
# rest is now '58 -0800'; the space separates seconds from the timezone offset.
sec, timezone = rest.split(' ')
day, month, year, hour, minute, sec, timezone
Wouldn't it be great to tell Python to just match this general pattern?
[(day)/(month)/(year):(hour):(min):(sec) (timezone)]
15-40 mins
Regular expressions let you specify a pattern for a string. Follow attentively!
import re
# Regex patterns are written as raw strings (r'...'): in a plain string
# literal, '\[' is an invalid escape sequence and warns on modern Python.
# Literal pattern — confirms the exact timestamp appears in the line.
re.findall(r'\[26/Jan/2014:10:47:58 -0800\]', line)
# Parenthesized capture groups pull out the individual components.
re.findall(r'\[(26)/(Jan)/(2014):(10):(47):(58) (-0800)\]', line)
The `.` character is a wildcard (anything goes except newlines):
# One wildcard per expected character; raw strings keep '\[' literal
# (in a plain string it is an invalid escape sequence on modern Python).
re.findall(r'\[(..)/(...)/(....):(..):(..):(..) (.....)\]', line)
# Hmmm
[re.findall(r'\[(..)/(...)/(....):(..):(..):(..) (.....)\]', line)
 for line in log]
# Only one digit for month!
log[1]
# Ok!
# .+ matches one-or-more of anything, so variable-width fields work too.
[re.findall(r'\[(.+)/(.+)/(.+):(.+):(.+):(.+) (.+)\]', line)
 for line in log]
Discussion question: What happens if we remove the brackets?
# Without the literal brackets anchoring the pattern, the greedy .+ groups
# can swallow much more of the line — raw string avoids invalid escapes.
re.findall(r'(.+)/(.+)/(.+):(.+):(.+):(.+) (.+)', line)
Make the regex more precise:
# \d+ for numeric fields, letters for the month name.
# Bug fix: the original class [a-zA-z] (lowercase z) also matches the ASCII
# characters between 'Z' and 'a' ("[\]^_`"); [a-zA-Z] matches letters only.
# Raw string keeps '\[' and '\d' literal for the regex engine.
[re.findall(
    r'\[(\d+)/([a-zA-Z]+)/(\d+):(\d+):(\d+):(\d+) (.\d+)\]',
    line
) for line in log]
# A tweet's raw HTML markup — we want just the visible text.
html = '<div class="js-tweet-text-container"><p class="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" lang="en" data-aria-label-part="0">Today, I was honored to be joined by Republicans and Democrats from both the House and Senate, as well as members of my Cabinet - to discuss the urgent need to rebuild and restore America’s depleted infrastructure. <a href="https://t.co/8ByoQJsjTT" rel="nofollow noopener" dir="ltr" data-expanded-url="http://45.wh.gov/UDL9yE" class="twitter-timeline-link" target="_blank" title="http://45.wh.gov/UDL9yE"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">45.wh.gov/UDL9yE</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible"> </span></span></a><a href="https://t.co/BVBRDvHfcC" class="twitter-timeline-link u-hidden" data-pre-embedded="true" dir="ltr">pic.twitter.com/BVBRDvHfcC</a></p></div>'
# Match one HTML tag: '<', then anything except '>', then '>'.
html_re = '<[^>]+>'
# Delete every tag, leaving only the text content.
re.sub(html_re, '', html)
# URLs in several schemes, with and without an explicit port.
urls = [
'ftp://file_server.com:21/top_secret/life_changing_plans.pdf',
'https://regexone.com/lesson/introduction#section',
'file://localhost:4040/zip_file',
'https://s3cur3-server.com:9999/',
'market://search/angry%20birds',
]
# Captures (scheme, host, optional port). Written as a raw string: '\w' and
# '\d' in a plain string literal are invalid escapes on modern Python.
url_re = r'(\w+)://([\w\-.]+):?(\d+)?/'
[re.findall(url_re, url) for url in urls]
55-85 min
# Restaurant safety violations: one row per recorded violation.
vio = pd.read_csv('data/violations.csv', header=0, names=['id', 'date', 'desc'])
desc = vio['desc']
vio.head()
# Most frequent violation descriptions.
counts = desc.value_counts()
counts[:10]
# Hmmm...
# NOTE(review): these look like near-duplicates — presumably differing only
# in a trailing bracketed suffix, which the re.sub below strips; verify.
counts[50:60]
# Strip a trailing bracketed annotation (plus any preceding whitespace) from
# each description. Raw string: '\s' and '\[' in a plain literal are invalid
# escape sequences on modern Python.
pd.Series([re.sub(r'\s*\[.*\]$', '', violation) for violation in desc])
pandas has built-in string functions that operate like a loop:
# .str exposes vectorized string operations over every element of the Series.
desc.str
# Equivalent to
# pd.Series([len(violation) for violation in desc])
desc.str.len()
# Equivalent to
# pd.Series([violation[0] for violation in desc])
desc.str[0]
# Equivalent to
# pd.Series([re.sub(r'\s*\[.*\]$', '', violation) for violation in desc])
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so the bracketed suffixes would
# never be stripped. Raw strings avoid invalid-escape warnings.
desc.str.replace(r'\s*\[.*\]$', '', regex=True)
# Full cleanup: drop the bracketed suffix, trim whitespace, lowercase.
only_desc = (desc
    .str.replace(r'\s*\[.*\]$', '', regex=True)
    .str.strip()
    .str.lower())
only_desc
# Print the 20 most common cleaned-up descriptions, one per line.
for v in only_desc.value_counts().head(20).index:
print(v)
Let's define some features.
# Overwrite the raw descriptions with the cleaned version.
vio['desc'] = only_desc
vio.head()
# Boolean indicator features: does each description mention the topic?
# str.contains interprets its pattern as a regex, so '|' means OR.
with_features = (vio
.assign(is_clean = only_desc.str.contains('clean|sanit'))
.assign(is_high_risk = only_desc.str.contains('high risk'))
.assign(is_vermin = only_desc.str.contains('vermin'))
.assign(is_surface = only_desc.str.contains('wall|ceiling|floor|surface'))
.assign(is_human = only_desc.str.contains('hand|glove|hair|nail'))
.assign(is_permit = only_desc.str.contains('permit|certif'))
)
with_features.head()
Now let's see which violations are most detrimental to the inspection scores:
# Inspection scores: keep only the first three columns of the file.
ins = pd.read_csv('data/inspections.csv',
header=0,
usecols=[0, 1, 2],
names=['id', 'score', 'date'])
ins.head()
# Count how many violations of each feature type occurred per inspection
# (summing booleans counts the True values).
# numeric_only=True keeps the boolean feature columns but excludes the
# free-text 'desc' column, which .sum() would otherwise concatenate into
# garbage strings (and which would then pollute the melt below).
count_features = (with_features
    .groupby(['id', 'date'])
    .sum(numeric_only=True)
    .reset_index()
)
count_features.head()
# NOTE(review): duplicate of the read five cells above (same file, same
# arguments) — a harmless re-read, likely a leftover notebook cell.
ins = pd.read_csv('data/inspections.csv',
header=0,
usecols=[0, 1, 2],
names=['id', 'score', 'date'])
ins.head()
# Reshape wide -> long so there is one row per (inspection, feature),
# then attach each inspection's score by matching on id and date.
with_scores = (
    count_features
    .melt(id_vars=['id', 'date'],
          var_name='feature', value_name='num_vios')
    .merge(ins, on=['id', 'date'])
)
with_scores.head()
# One box plot of score vs. violation count per feature, 3 panels per row.
# seaborn renamed factorplot to catplot in 0.9 and later removed the old
# name; catplot accepts the same arguments used here.
sns.catplot(x='num_vios', y='score',
            col='feature', col_wrap=3,
            kind='box',
            data=with_scores)