Lecture 4 – Data 100, Fall 2022¶

Will Fithian
Content by Will Fithian, Lisa Yan, Joseph Gonzalez, Deborah Nolan, Sam Lau

In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)

Structure: File Formats¶

Reproducible Data Science¶

In the interest of reproducible data science, we will download the data programmatically. We have defined some helper functions in the ds100_utils.py file. I can then reuse these helper functions in many different notebooks.

In [3]:
from ds100_utils import fetch_and_cache
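
For reference, a caching download helper along these lines might look like the following. This is a minimal sketch, not the actual ds100_utils implementation, and it assumes the requests library is installed:

import time
from pathlib import Path
import requests

def fetch_and_cache_sketch(url, file, data_dir="data", force=False):
    """Download url into data_dir/file unless a cached copy already exists."""
    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok=True)
    file_path = data_dir / Path(file)
    if force or not file_path.exists():
        resp = requests.get(url)
        resp.raise_for_status()
        file_path.write_bytes(resp.content)
        print("Downloaded", file_path)
    else:
        print("Using cached version that was downloaded (UTC):",
              time.asctime(time.gmtime(file_path.stat().st_mtime)))
    return file_path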

Occasionally, you will want to modify code that you have imported. To re-import those modifications you can either use the Python importlib library:

import ds100_utils
from importlib import reload
reload(ds100_utils)

or use the IPython autoreload magic, which automatically re-imports code when files change:

%load_ext autoreload
%autoreload 2

Downloading the Data¶

Notice that because I record how I got the data in the notebook, others can reproduce this analysis. However, it is worth noting that the data can change, so we will want to pay attention to file timestamps.

1) CSV: Restaurant data¶

Food safety scores for restaurants in San Francisco. The scores and violation information have been made available by the San Francisco Department of Public Health.

In [4]:
restaurants_file = fetch_and_cache(
    "https://data.sfgov.org/api/views/pyih-qa8i/rows.csv", # DL from this website
    "restaurants.csv",         # save as this file
    force=False)               # do nothing if the file already exists
Using cached version that was downloaded (UTC): Tue Sep  6 02:37:41 2022
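
Because the data can change, one way to pay attention to timestamps is to check the age of the cached copy and force a re-download when it is stale. A sketch, assuming that force=True makes fetch_and_cache re-download (the 30-day threshold is arbitrary):

import os, time

age_days = (time.time() - os.path.getmtime(restaurants_file)) / 86400  # age in days
print(f"Cached copy is {age_days:.1f} days old")
if age_days > 30:
    restaurants_file = fetch_and_cache(
        "https://data.sfgov.org/api/views/pyih-qa8i/rows.csv",
        "restaurants.csv",
        force=True)  # re-download a fresh copy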

Before we even begin to load the data, it often helps to understand a little about its high-level structure:

  1. How big is the data file?
  2. How is the data file formatted?
  3. How do we read the data into pandas?

1. How big is the data?¶

I often like to start my analysis by getting a rough estimate of the size of the data. This will help inform the tools I use and how I view the data. If it is relatively small, I might use a text editor or a spreadsheet to look at the data. If it is larger, I might jump to more programmatic exploration or even use distributed computing tools.

However, here we will use Python tools to probe the file.

In [5]:
import os
print(restaurants_file, "is", os.path.getsize(restaurants_file) / 1e6, "MB")
data/restaurants.csv is 12.093999 MB

Since this seems to be a text file, I might also want to count the number of lines, which often corresponds to the number of records.

In [6]:
with open(restaurants_file, "r") as f:
    print(restaurants_file, "is", sum(1 for l in f), "lines.")
data/restaurants.csv is 53974 lines.

2. How is the data file formatted?¶

Let's assume this is a text file (and does not contain binary-encoded data) so we can print a few lines to get a better understanding of the file's layout.

Notice that below I use the repr function to show the raw string, including special characters. This is helpful for deducing the file format.

In [7]:
print(restaurants_file, "======================")
with open(restaurants_file, "r") as f:
    for i in range(10):
        print(f"{i:03} | {repr(f.readline())}") # zero pad line numbers
data/restaurants.csv ======================
000 | 'business_id,business_name,business_address,business_city,business_state,business_postal_code,business_latitude,business_longitude,business_location,business_phone_number,inspection_id,inspection_date,inspection_score,inspection_type,violation_id,violation_description,risk_category,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods\n'
001 | '835,Kam Po Kitchen,801 Broadway St,San Francisco,CA,94133,37.797223,-122.410513,POINT (-122.410513 37.797223),,835_20180917,09/17/2018 12:00:00 AM,88,Routine - Unscheduled,835_20180917_103139,Improper food storage,Low Risk,107,107,6,3,6\n'
002 | "905,Working Girls' Cafe',0259 Kearny St,San Francisco,CA,94108,37.790477,-122.404033,POINT (-122.404033 37.790477),,905_20190415,04/15/2019 12:00:00 AM,87,Routine - Unscheduled,905_20190415_103114,High risk vermin infestation,High Risk,19,19,6,3,8\n"
003 | "1203,TAWAN'S THAI FOOD,4403 GEARY Blvd,San Francisco,CA,94118,37.780834,-122.46659,POINT (-122.46659 37.780834),+14155755175,1203_20170803,08/03/2017 12:00:00 AM,77,Routine - Unscheduled,1203_20170803_103120,Moderate risk food holding temperature,Moderate Risk,5,5,8,4,11\n"
004 | '1345,Cordon Bleu,1574 California St,San Francisco,CA,94109,37.790683,-122.420264,POINT (-122.420264 37.790683),,1345_20170928,09/28/2017 12:00:00 AM,81,Routine - Unscheduled,1345_20170928_103105,Improper cooling methods,High Risk,105,105,4,3,21\n'
005 | '1352,LA TORTILLA,495 Castro St B,San Francisco,CA,94114,37.760954,-122.434935,POINT (-122.434935 37.760954),+14155863990,1352_20180620,06/20/2018 12:00:00 AM,74,Routine - Unscheduled,1352_20180620_103177,Non service animal,Low Risk,38,38,3,5,5\n'
006 | '1652,STARBUCKS,201 03rd St,San Francisco,CA,94105,37.785037,-122.400432,POINT (-122.400432 37.785037),,1652_20190611,06/11/2019 12:00:00 AM,96,Routine - Unscheduled,1652_20190611_103161,Low risk vermin infestation,Low Risk,32,32,1,10,8\n'
007 | '1922,All stars Donuts,901 Clement St,San Francisco,CA,94118,37.782694,-122.468969,POINT (-122.468969 37.782694),,1922_20161128,11/28/2016 12:00:00 AM,,Complaint,1922_20161128_103139,Improper food storage,Low Risk,5,5,8,4,11\n'
008 | "3019,Tad's,120 Powell St,San Francisco,CA,94102,37.785928,-122.407887,POINT (-122.407887 37.785928),,3019_20190702,07/02/2019 12:00:00 AM,88,Routine - Unscheduled,3019_20190702_103139,Improper food storage,Low Risk,19,19,6,3,8\n"
009 | '3286,Super Burrito,1200 Polk St,San Francisco,CA,94109,37.787998,-122.420117,POINT (-122.420117 37.787998),,3286_20190214,02/14/2019 12:00:00 AM,81,Routine - Unscheduled,3286_20190214_103105,Improper cooling methods,High Risk,50,50,4,3,21\n'

3. How do we read the data into pandas?¶

With CSVs, we can use the handy pd.read_csv function:

In [8]:
restaurants = pd.read_csv(restaurants_file)
restaurants.head()
Out[8]:
business_id business_name business_address business_city business_state business_postal_code business_latitude business_longitude business_location business_phone_number inspection_id inspection_date inspection_score inspection_type violation_id violation_description risk_category Neighborhoods SF Find Neighborhoods Current Police Districts Current Supervisor Districts Analysis Neighborhoods
0 835 Kam Po Kitchen 801 Broadway St San Francisco CA 94133 37.80 -122.41 POINT (-122.410513 37.797223) NaN 835_20180917 09/17/2018 12:00:00 AM 88.00 Routine - Unscheduled 835_20180917_103139 Improper food storage Low Risk 107.00 107.00 6.00 3.00 6.00
1 905 Working Girls' Cafe' 0259 Kearny St San Francisco CA 94108 37.79 -122.40 POINT (-122.404033 37.790477) NaN 905_20190415 04/15/2019 12:00:00 AM 87.00 Routine - Unscheduled 905_20190415_103114 High risk vermin infestation High Risk 19.00 19.00 6.00 3.00 8.00
2 1203 TAWAN'S THAI FOOD 4403 GEARY Blvd San Francisco CA 94118 37.78 -122.47 POINT (-122.46659 37.780834) 14155755175.00 1203_20170803 08/03/2017 12:00:00 AM 77.00 Routine - Unscheduled 1203_20170803_103120 Moderate risk food holding temperature Moderate Risk 5.00 5.00 8.00 4.00 11.00
3 1345 Cordon Bleu 1574 California St San Francisco CA 94109 37.79 -122.42 POINT (-122.420264 37.790683) NaN 1345_20170928 09/28/2017 12:00:00 AM 81.00 Routine - Unscheduled 1345_20170928_103105 Improper cooling methods High Risk 105.00 105.00 4.00 3.00 21.00
4 1352 LA TORTILLA 495 Castro St B San Francisco CA 94114 37.76 -122.43 POINT (-122.434935 37.760954) 14155863990.00 1352_20180620 06/20/2018 12:00:00 AM 74.00 Routine - Unscheduled 1352_20180620_103177 Non service animal Low Risk 38.00 38.00 3.00 5.00 5.00

Side Note: TSV¶

We won't go through all the steps for TSV, but here's what the same SF restaurants data would look like as a TSV:

In [9]:
restaurants_tsv_file = "data/restaurants.tsv"  # I processed this earlier

print(restaurants_tsv_file, "======================")
with open(restaurants_tsv_file, "r") as f:
    for i in range(10):
        print(f"{i:03} | {repr(f.readline())}")
data/restaurants.tsv ======================
000 | '\tbusiness_id\tbusiness_name\tbusiness_address\tbusiness_city\tbusiness_state\tbusiness_postal_code\tbusiness_latitude\tbusiness_longitude\tbusiness_location\tbusiness_phone_number\tinspection_id\tinspection_date\tinspection_score\tinspection_type\tviolation_id\tviolation_description\trisk_category\tNeighborhoods\tSF Find Neighborhoods\tCurrent Police Districts\tCurrent Supervisor Districts\tAnalysis Neighborhoods\n'
001 | '0\t835\tKam Po Kitchen\t801 Broadway St\tSan Francisco\tCA\t94133\t37.797223\t-122.410513\tPOINT (-122.410513 37.797223)\t\t835_20180917\t09/17/2018 12:00:00 AM\t88.0\tRoutine - Unscheduled\t835_20180917_103139\tImproper food storage\tLow Risk\t107.0\t107.0\t6.0\t3.0\t6.0\n'
002 | "1\t905\tWorking Girls' Cafe'\t0259 Kearny St\tSan Francisco\tCA\t94108\t37.790477\t-122.404033\tPOINT (-122.404033 37.790477)\t\t905_20190415\t04/15/2019 12:00:00 AM\t87.0\tRoutine - Unscheduled\t905_20190415_103114\tHigh risk vermin infestation\tHigh Risk\t19.0\t19.0\t6.0\t3.0\t8.0\n"
003 | "2\t1203\tTAWAN'S THAI FOOD\t4403 GEARY Blvd\tSan Francisco\tCA\t94118\t37.780834\t-122.46659\tPOINT (-122.46659 37.780834)\t14155755175.0\t1203_20170803\t08/03/2017 12:00:00 AM\t77.0\tRoutine - Unscheduled\t1203_20170803_103120\tModerate risk food holding temperature\tModerate Risk\t5.0\t5.0\t8.0\t4.0\t11.0\n"
004 | '3\t1345\tCordon Bleu\t1574 California St\tSan Francisco\tCA\t94109\t37.790683\t-122.420264\tPOINT (-122.420264 37.790683)\t\t1345_20170928\t09/28/2017 12:00:00 AM\t81.0\tRoutine - Unscheduled\t1345_20170928_103105\tImproper cooling methods\tHigh Risk\t105.0\t105.0\t4.0\t3.0\t21.0\n'
005 | '4\t1352\tLA TORTILLA\t495 Castro St B\tSan Francisco\tCA\t94114\t37.760954\t-122.434935\tPOINT (-122.434935 37.760954)\t14155863990.0\t1352_20180620\t06/20/2018 12:00:00 AM\t74.0\tRoutine - Unscheduled\t1352_20180620_103177\tNon service animal\tLow Risk\t38.0\t38.0\t3.0\t5.0\t5.0\n'
006 | '5\t1652\tSTARBUCKS\t201 03rd St\tSan Francisco\tCA\t94105\t37.785037\t-122.400432\tPOINT (-122.400432 37.785037)\t\t1652_20190611\t06/11/2019 12:00:00 AM\t96.0\tRoutine - Unscheduled\t1652_20190611_103161\tLow risk vermin infestation\tLow Risk\t32.0\t32.0\t1.0\t10.0\t8.0\n'
007 | '6\t1922\tAll stars Donuts\t901 Clement St\tSan Francisco\tCA\t94118\t37.782694\t-122.468969\tPOINT (-122.468969 37.782694)\t\t1922_20161128\t11/28/2016 12:00:00 AM\t\tComplaint\t1922_20161128_103139\tImproper food storage\tLow Risk\t5.0\t5.0\t8.0\t4.0\t11.0\n'
008 | "7\t3019\tTad's\t120 Powell St\tSan Francisco\tCA\t94102\t37.785928\t-122.407887\tPOINT (-122.407887 37.785928)\t\t3019_20190702\t07/02/2019 12:00:00 AM\t88.0\tRoutine - Unscheduled\t3019_20190702_103139\tImproper food storage\tLow Risk\t19.0\t19.0\t6.0\t3.0\t8.0\n"
009 | '8\t3286\tSuper Burrito\t1200 Polk St\tSan Francisco\tCA\t94109\t37.787998\t-122.420117\tPOINT (-122.420117 37.787998)\t\t3286_20190214\t02/14/2019 12:00:00 AM\t81.0\tRoutine - Unscheduled\t3286_20190214_103105\tImproper cooling methods\tHigh Risk\t50.0\t50.0\t4.0\t3.0\t21.0\n'

The pd.read_csv function also reads in TSVs if we specify the '\t' delimiter.

In [10]:
restaurants_tsv = pd.read_csv(restaurants_tsv_file, delimiter='\t')
restaurants_tsv.head()
Out[10]:
Unnamed: 0 business_id business_name business_address business_city business_state business_postal_code business_latitude business_longitude business_location business_phone_number inspection_id inspection_date inspection_score inspection_type violation_id violation_description risk_category Neighborhoods SF Find Neighborhoods Current Police Districts Current Supervisor Districts Analysis Neighborhoods
0 0 835 Kam Po Kitchen 801 Broadway St San Francisco CA 94133 37.80 -122.41 POINT (-122.410513 37.797223) NaN 835_20180917 09/17/2018 12:00:00 AM 88.00 Routine - Unscheduled 835_20180917_103139 Improper food storage Low Risk 107.00 107.00 6.00 3.00 6.00
1 1 905 Working Girls' Cafe' 0259 Kearny St San Francisco CA 94108 37.79 -122.40 POINT (-122.404033 37.790477) NaN 905_20190415 04/15/2019 12:00:00 AM 87.00 Routine - Unscheduled 905_20190415_103114 High risk vermin infestation High Risk 19.00 19.00 6.00 3.00 8.00
2 2 1203 TAWAN'S THAI FOOD 4403 GEARY Blvd San Francisco CA 94118 37.78 -122.47 POINT (-122.46659 37.780834) 14155755175.00 1203_20170803 08/03/2017 12:00:00 AM 77.00 Routine - Unscheduled 1203_20170803_103120 Moderate risk food holding temperature Moderate Risk 5.00 5.00 8.00 4.00 11.00
3 3 1345 Cordon Bleu 1574 California St San Francisco CA 94109 37.79 -122.42 POINT (-122.420264 37.790683) NaN 1345_20170928 09/28/2017 12:00:00 AM 81.00 Routine - Unscheduled 1345_20170928_103105 Improper cooling methods High Risk 105.00 105.00 4.00 3.00 21.00
4 4 1352 LA TORTILLA 495 Castro St B San Francisco CA 94114 37.76 -122.43 POINT (-122.434935 37.760954) 14155863990.00 1352_20180620 06/20/2018 12:00:00 AM 74.00 Routine - Unscheduled 1352_20180620_103177 Non service animal Low Risk 38.00 38.00 3.00 5.00 5.00


Note how records with special punctuation (e.g., a comma or apostrophe) are stored in the text file, and how pd.read_csv parses them.

In [11]:
# punctuation in a CSV record entry
with open(restaurants_file, "r") as f:
    for i, line in enumerate(f.readlines()):
        if "Mo'z Cafe," in line:
            print(f"{i:03} | {repr(line)}")
            break
062 | '81508,"Mo\'z Cafe, Inc.",36 05th St,San Francisco,CA,94103,,,,,81508_20190516,05/16/2019 12:00:00 AM,84,Routine - Unscheduled,81508_20190516_103162,Other low risk violation,Low Risk,,,,,\n'
In [12]:
# read_csv is smart!
restaurants[restaurants['business_name'].str.contains("Mo'z")].head()
Out[12]:
business_id business_name business_address business_city business_state business_postal_code business_latitude business_longitude business_location business_phone_number inspection_id inspection_date inspection_score inspection_type violation_id violation_description risk_category Neighborhoods SF Find Neighborhoods Current Police Districts Current Supervisor Districts Analysis Neighborhoods
61 81508 Mo'z Cafe, Inc. 36 05th St San Francisco CA 94103 NaN NaN NaN NaN 81508_20190516 05/16/2019 12:00:00 AM 84.00 Routine - Unscheduled 81508_20190516_103162 Other low risk violation Low Risk NaN NaN NaN NaN NaN
18913 91619 Mo'z Cafe S.F. Inc 5 Masonic Ave San Francisco CA 94118 NaN NaN NaN NaN 91619_20190410 04/10/2019 12:00:00 AM 72.00 Routine - Unscheduled 91619_20190410_103119 Inadequate and inaccessible handwashing facili... Moderate Risk NaN NaN NaN NaN NaN
20034 91619 Mo'z Cafe S.F. Inc 5 Masonic Ave San Francisco CA 94118 NaN NaN NaN NaN 91619_20180716 07/16/2018 12:00:00 AM 83.00 Routine - Unscheduled 91619_20180716_103109 Unclean or unsanitary food contact surfaces High Risk NaN NaN NaN NaN NaN
22508 91619 Mo'z Cafe S.F. Inc 5 Masonic Ave San Francisco CA 94118 NaN NaN NaN NaN 91619_20190410 04/10/2019 12:00:00 AM 72.00 Routine - Unscheduled 91619_20190410_103144 Unapproved or unmaintained equipment or utensils Low Risk NaN NaN NaN NaN NaN
23468 81508 Mo'z Cafe, Inc. 36 05th St San Francisco CA 94103 NaN NaN NaN NaN 81508_20170927 09/27/2017 12:00:00 AM 80.00 Routine - Unscheduled 81508_20170927_103103 High risk food holding temperature High Risk NaN NaN NaN NaN NaN

4. What's in the data set?¶

You will explore this data set more in HW 2, but for now let's see what is in there.

In [13]:
restaurants.describe()
Out[13]:
business_id business_latitude business_longitude business_phone_number inspection_score Neighborhoods SF Find Neighborhoods Current Police Districts Current Supervisor Districts Analysis Neighborhoods
count 53973.00 27475.00 27475.00 17672.00 39541.00 27435.00 27435.00 27447.00 27447.00 27447.00
mean 55218.88 37.75 -122.36 14155413305.28 86.22 56.96 56.96 5.36 5.63 19.46
std 36165.56 0.88 2.86 1356437.67 8.49 34.80 34.80 2.69 3.30 11.62
min 19.00 0.00 -122.51 14150204876.00 45.00 1.00 1.00 1.00 1.00 1.00
25% 7558.00 37.76 -122.44 14155371185.00 81.00 28.00 28.00 3.00 3.00 8.00
50% 69186.00 37.78 -122.42 14155608877.00 87.00 53.00 53.00 6.00 5.00 20.00
75% 86385.00 37.79 -122.41 14155802771.00 92.00 95.00 95.00 8.00 9.00 30.00
max 102705.00 37.82 0.00 14159881393.00 100.00 117.00 117.00 10.00 11.00 41.00

This is just giving us the numerical columns. What else is in the data?

In [14]:
restaurants.columns
Out[14]:
Index(['business_id', 'business_name', 'business_address', 'business_city',
       'business_state', 'business_postal_code', 'business_latitude',
       'business_longitude', 'business_location', 'business_phone_number',
       'inspection_id', 'inspection_date', 'inspection_score',
       'inspection_type', 'violation_id', 'violation_description',
       'risk_category', 'Neighborhoods', 'SF Find Neighborhoods',
       'Current Police Districts', 'Current Supervisor Districts',
       'Analysis Neighborhoods'],
      dtype='object')
In [15]:
restaurants['risk_category'].value_counts().to_frame()
Out[15]:
risk_category
Low Risk 19112
Moderate Risk 15316
High Risk 5825
In [16]:
restaurants['violation_description'].value_counts().to_frame()
Out[16]:
violation_description
Unclean or degraded floors walls or ceilings 3453
Unapproved or unmaintained equipment or utensils 2917
Inadequately cleaned or sanitized food contact surfaces 2823
Inadequate and inaccessible handwashing facilities 2727
Moderate risk food holding temperature 2623
... ...
Improper cooking time or temperatures 6
Mobile food facility stored in unapproved location 5
Improperly displayed mobile food permit or signage 5
Noncompliance with Gulf Coast oyster regulation 4
No restroom facility within 200 feet of mobile food facility 1

65 rows × 1 columns

What kinds of violations count as high vs low risk?

In [17]:
restaurants.query('risk_category == "High Risk"')['violation_description'].value_counts().to_frame()
Out[17]:
violation_description
High risk food holding temperature 1620
Unclean or unsanitary food contact surfaces 1245
Improper cooling methods 910
High risk vermin infestation 822
Unclean hands or improper use of gloves 660
No hot water or running water 159
Contaminated or adulterated food 133
Improper reheating of food 120
Sewage or wastewater contamination 36
Unauthorized or unsafe use of time as a public health control measure 34
Other high risk violation 28
Mobile food facility not operating with an approved commissary 24
Unapproved food source 17
Mobile food facility with unapproved operating conditions 10
Improper cooking time or temperatures 6
No restroom facility within 200 feet of mobile food facility 1

What happens if there's no violation?

In [18]:
restaurants['risk_category'].value_counts(dropna=False).to_frame()
Out[18]:
risk_category
Low Risk 19112
Moderate Risk 15316
NaN 13720
High Risk 5825

What do we think the NaN values reflect?

In [19]:
restaurants.query('inspection_score == 100')['risk_category'].value_counts(dropna=False).to_frame()
Out[19]:
risk_category
NaN 1987
Low Risk 9
Moderate Risk 4
High Risk 2

Can one inspection have multiple violations?

In [20]:
restaurants['inspection_id'].value_counts().to_frame()
Out[20]:
inspection_id
97283_20180823 15
984_20170726 14
10877_20190701 14
91843_20180822 14
76218_20190124 14
... ...
100274_20190509 1
71131_20161012 1
5938_20170921 1
5808_20190918 1
27821_20180720 1

26663 rows × 1 columns

What happened with this restaurant that had 15 violations?

In [21]:
restaurants.query('inspection_id == "97283_20180823"')
Out[21]:
business_id business_name business_address business_city business_state business_postal_code business_latitude business_longitude business_location business_phone_number inspection_id inspection_date inspection_score inspection_type violation_id violation_description risk_category Neighborhoods SF Find Neighborhoods Current Police Districts Current Supervisor Districts Analysis Neighborhoods
38245 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103150 Improper or defective plumbing Low Risk NaN NaN NaN NaN NaN
39468 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103110 Unapproved food source High Risk NaN NaN NaN NaN NaN
40357 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103130 Inadequate sewage or wastewater disposal Moderate Risk NaN NaN NaN NaN NaN
41306 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103151 Inadequate or unsanitary refuse containers or ... Low Risk NaN NaN NaN NaN NaN
42369 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103154 Unclean or degraded floors walls or ceilings Low Risk NaN NaN NaN NaN NaN
43773 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103116 Inadequate food safety knowledge or lack of ce... Moderate Risk NaN NaN NaN NaN NaN
44117 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103119 Inadequate and inaccessible handwashing facili... Moderate Risk NaN NaN NaN NaN NaN
45128 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103143 Inadequate warewashing facilities or equipment Low Risk NaN NaN NaN NaN NaN
46756 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103142 Unclean nonfood contact surfaces Low Risk NaN NaN NaN NaN NaN
47632 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103165 No plan review or Building Permit Low Risk NaN NaN NaN NaN NaN
47855 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103147 Inadequate ventilation or lighting Low Risk NaN NaN NaN NaN NaN
49987 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103149 Wiping cloths not clean or properly stored or ... Low Risk NaN NaN NaN NaN NaN
51065 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103103 High risk food holding temperature High Risk NaN NaN NaN NaN NaN
52654 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103144 Unapproved or unmaintained equipment or utensils Low Risk NaN NaN NaN NaN NaN
53482 97283 PICCOLO PETE 2155 Bayshore Blvd San Francisco CA 94134 NaN NaN NaN 14155793720.00 97283_20180823 08/23/2018 12:00:00 AM NaN New Ownership 97283_20180823_103133 Foods not protected from contamination Moderate Risk NaN NaN NaN NaN NaN
In [22]:
pd.pivot_table(restaurants.query('business_name == "PICCOLO PETE"'),
            index = 'inspection_id',
            columns = 'risk_category',
            values = 'business_id',
            aggfunc = 'count',
            dropna =False,
            fill_value = 0)
Out[22]:
risk_category High Risk Low Risk Moderate Risk
inspection_id
97283_20180823 2 9 4
97283_20190604 0 0 4

Did we miss anything by making this pivot table?

In [23]:
restaurants.query('business_name == "PICCOLO PETE"')['inspection_id'].value_counts().to_frame()
Out[23]:
inspection_id
97283_20180823 15
97283_20190604 4
97283_20190716 1
97283_20190626 1
97283_20180913 1

To be safer, we could either

  1. Learn the rules pandas uses for NaN values very carefully, or
  2. Make a new risk category column where NaN is represented explicitly, e.g. as 'No Risk Category' (see the sketch below).
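
A minimal sketch of option 2 (the new column name risk_category_filled is just illustrative):

# Represent missing risk categories explicitly so counts and pivots include them
restaurants['risk_category_filled'] = restaurants['risk_category'].fillna('No Risk Category')
restaurants['risk_category_filled'].value_counts()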

2) JSON¶

The City of Berkeley Open Data website has a dataset with COVID-19 Confirmed Cases among Berkeley residents by date.

Let's download this file, saving it as a JSON (note the source URL file type):

In [24]:
covid_file = fetch_and_cache(
    "https://data.cityofberkeley.info/api/views/xn6j-b766/rows.json?accessType=DOWNLOAD",
    "confirmed-cases.json",
    force=False)
Using cached version that was downloaded (UTC): Tue Sep  6 02:37:41 2022

1. How big is the data file?¶

In [25]:
print(covid_file, "is", os.path.getsize(covid_file) / 1e6, "MB")

with open(covid_file, "r") as f:
    print(covid_file, "is", sum(1 for l in f), "lines.")
data/confirmed-cases.json is 0.116367 MB
data/confirmed-cases.json is 1110 lines.



2. How is the data file formatted?¶

In [26]:
# raw text
print(covid_file, "======================")
with open(covid_file, "r") as f:
    for i in range(20):
        print(f"{i:03} | {repr(f.readline())}") # zero pad line numbers
data/confirmed-cases.json ======================
000 | '{\n'
001 | '  "meta" : {\n'
002 | '    "view" : {\n'
003 | '      "id" : "xn6j-b766",\n'
004 | '      "name" : "COVID-19 Confirmed Cases",\n'
005 | '      "assetType" : "dataset",\n'
006 | '      "attribution" : "City of Berkeley",\n'
007 | '      "averageRating" : 0,\n'
008 | '      "category" : "Health",\n'
009 | '      "createdAt" : 1587074071,\n'
010 | '      "description" : "Counts of confirmed COVID-19 cases among Berkeley residents by date.",\n'
011 | '      "displayType" : "table",\n'
012 | '      "downloadCount" : 2804,\n'
013 | '      "hideFromCatalog" : false,\n'
014 | '      "hideFromDataJson" : false,\n'
015 | '      "newBackend" : true,\n'
016 | '      "numberOfComments" : 0,\n'
017 | '      "oid" : 37306599,\n'
018 | '      "provenance" : "official",\n'
019 | '      "publicationAppendEnabled" : false,\n'

This appears to be a fairly standard JSON file. We notice that the file appears to contain a description of itself in a field called "meta" (which is presumably short for metadata).



3. How do we read the file into Pandas?¶

We'll come back to this question. Let's first understand more about the particular structure of this JSON file so that we can decide what (if anything) to load into Pandas.




Digging into JSON¶

Python has relatively good support for JSON data since it closely matches the internal Python object model. In the following cell we load the entire JSON data file into a Python dictionary.

In [27]:
import json

with open(covid_file, "rb") as f:
    covid_json = json.load(f)

The covid_json variable is now a dictionary encoding the data in the file:

In [28]:
type(covid_json)
Out[28]:
dict

Examine what keys are in the top-level JSON object¶

We can list the keys to determine what data is stored in the object.

In [29]:
covid_json.keys()
Out[29]:
dict_keys(['meta', 'data'])

Observation: The JSON dictionary contains a meta key, which likely refers to metadata (data about the data). Metadata is often maintained alongside the data and can be a good source of additional information.


We can investigate the meta data further by examining the keys associated with the metadata.

In [30]:
covid_json['meta'].keys()
Out[30]:
dict_keys(['view'])

The meta key contains another dictionary, keyed by view. This likely holds metadata about a particular "view" of some underlying database. We will learn more about views when we study SQL later in the class.

In [31]:
covid_json['meta']['view'].keys()
Out[31]:
dict_keys(['id', 'name', 'assetType', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'columns', 'grants', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])

Notice that this is a nested/recursive data structure. As we dig deeper we reveal more and more keys and the corresponding data:

covid_json
|-> meta
    |-> view
        | -> id
        | -> name
        | -> attribution
        ...
        | -> description
        ...
        | -> columns
        ...
|-> data
    | ... (haven't explored yet)

There is a key called description in the view sub-dictionary. This likely contains a description of the data:

In [32]:
print(covid_json['meta']['view']['description'])
Counts of confirmed COVID-19 cases among Berkeley residents by date.


Columns Metadata¶

Another potentially useful key in the metadata dictionary is columns. Its value is a list:

In [33]:
type(covid_json['meta']['view']['columns'])
Out[33]:
list

We can browse the summary data in this list using Python:
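
For example, here is a sketch that prints one summary line per column. It assumes the usual Socrata column fields name and dataTypeName, plus the optional cachedContents:

for c in covid_json['meta']['view']['columns']:
    name = c.get('name', '')
    dtype = c.get('dataTypeName', '')
    nulls = c.get('cachedContents', {}).get('null', 'n/a')  # may be absent
    print(f"{name:>20} | type: {dtype:>12} | null count: {nulls}")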

Observations

  1. The above metadata tells us a lot about the columns in the data, including column names, potential data anomalies, and basic statistics.
  2. JSON makes it easier (than CSV) to create self-documenting data.
  3. Self-documenting data can be helpful since it maintains its own description, and these descriptions are more likely to be updated as the data changes.

Examining the Data Field for Records¶

We can look at a few entries in the data field. This is what we'll load into Pandas.

In [34]:
for i in range(3):
    print(f"{i:03} | {covid_json['data'][i]}")
000 | ['row-kzbg.v7my-c3y2', '00000000-0000-0000-0405-CB14DE51DAA7', 0, 1643733903, None, 1643733903, None, '{ }', '2020-02-28T00:00:00', '1', '1']
001 | ['row-jkyx_9u4r-h2yw', '00000000-0000-0000-F806-86D0DBE0E17F', 0, 1643733903, None, 1643733903, None, '{ }', '2020-02-29T00:00:00', '0', '1']
002 | ['row-qifg_4aug-y3ym', '00000000-0000-0000-2DCE-4D1872F9B216', 0, 1643733903, None, 1643733903, None, '{ }', '2020-03-01T00:00:00', '0', '1']


3. How do we read the file into Pandas? Building a DataFrame from JSON¶

Let's first understand more about the particular structure of this JSON file so that we can decide what (if anything) to load into Pandas.

In the following block of code we:

  1. Translate the JSON records into a dataframe:

    • fields: covid_json['meta']['view']['columns']
    • records: covid_json['data']
  2. Remove columns that have no metadata description (see the sketch after the table below). This would be a bad idea in general, but here the above analysis suggests that these columns are unlikely to contain useful information.

  3. Examine the table.
In [35]:
# Load the data from JSON and assign column titles
covid = pd.DataFrame(
    covid_json['data'],
    columns=[c['name'] for c in covid_json['meta']['view']['columns']])

covid.tail()
Out[35]:
sid id position created_at created_meta updated_at updated_meta meta Date New Cases Cumulative Cases
699 row-49b6_x8zv.gyum 00000000-0000-0000-A18C-9174A6D05774 0 1643733903 None 1643733903 None { } 2022-01-27T00:00:00 106 10694
700 row-gs55-p5em.y4v9 00000000-0000-0000-F41D-5724AEABB4D6 0 1643733903 None 1643733903 None { } 2022-01-28T00:00:00 223 10917
701 row-3pyj.tf95-qu67 00000000-0000-0000-BEE3-B0188D2518BD 0 1643733903 None 1643733903 None { } 2022-01-29T00:00:00 139 11056
702 row-cgnd.8syv.jvjn 00000000-0000-0000-C318-63CF75F7F740 0 1643733903 None 1643733903 None { } 2022-01-30T00:00:00 33 11089
703 row-qywv_24x6-237y 00000000-0000-0000-FE92-9789FED3AA20 0 1643733903 None 1643733903 None { } 2022-01-31T00:00:00 42 11131
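
Following step 2 above, we could then keep only the substantive columns. A sketch, using the three column names visible in the table above:

# Drop the Socrata bookkeeping fields (sid, id, position, created_at, ...)
covid = covid[['Date', 'New Cases', 'Cumulative Cases']]
covid.tail()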



Your turn: Mauna Loa CO2 data¶

CO2 concentrations have been monitored at Mauna Loa Observatory since 1958 (website link).

In [36]:
co2_file = "data/co2_mm_mlo.txt"
In [37]:
print(co2_file, "is", os.path.getsize(co2_file) / 1e6, "MB")

with open(co2_file, "r") as f:
    print(co2_file, "is", sum(1 for l in f), "lines.")
data/co2_mm_mlo.txt is 0.051131 MB
data/co2_mm_mlo.txt is 810 lines.

How do we read the file into Pandas?¶

While we could use Python again to inspect the contents, let's instead open this file in JupyterLab (note that it's a .txt file; do we trust this file extension?).










Looking at the first few lines of the data, we spot some relevant characteristics:

  • The values are separated by white space, possibly tabs.
  • The values line up across rows; for example, the month appears in the 7th to 8th character positions of each line.
  • The 71st and 72nd lines in the file contain column headings split over two lines.

We can use read_csv to read the data into a Pandas DataFrame, providing several arguments to specify that the separator is whitespace, that there is no header (we will set our own column names later), and that the first 72 rows of the file should be skipped.

In [38]:
co2 = pd.read_csv(
    co2_file, header = None, skiprows = 72,
    sep = '\s+' #regex for continuous whitespace (next lecture)
)
co2.head()
Out[38]:
0 1 2 3 4 5 6
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1

Congratulations! You've wrangled your first set of real world data!


...But our columns aren't named. We need to do more EDA.




Real World Example: Wrangling CO2 Measurements¶

Let's continue looking at the CO2 concentrations at Mauna Loa Observatory:

In [39]:
co2.head()
Out[39]:
0 1 2 3 4 5 6
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1

We need column names!


Exploring Variable Feature Types¶

Let's go back to the raw data file to understand each feature.

The NOAA webpage might have some useful tidbits (in this case it doesn't).



We'll rerun pd.read_csv, but this time with some custom column names.

In [40]:
co2 = pd.read_csv(
    co2_file, header = None, skiprows = 72,
    sep = '\s+', #regex for continuous whitespace (next lecture)
    names = ['Yr', 'Mo', 'DecDate', 'Avg', 'Int', 'Trend', 'Days']
)
co2.head()
Out[40]:
Yr Mo DecDate Avg Int Trend Days
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1




Let's start exploring!!¶

Scientific studies tend to have very clean data, right? Let's jump right in and make a time series plot of CO2 monthly averages.

In [41]:
sns.lineplot(x='DecDate', y='Avg', data=co2);

The code above uses the seaborn plotting library (abbreviated sns). We won't cover this library in detail until next week, so focus on the plots themselves, not the code used to create them.

Yikes! Plotting the data uncovered a problem. It looks like we have some missing values. What happened here?

In [42]:
co2.head()
Out[42]:
Yr Mo DecDate Avg Int Trend Days
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1
In [43]:
co2.tail()
Out[43]:
Yr Mo DecDate Avg Int Trend Days
733 2019 4 2019.29 413.32 413.32 410.49 26
734 2019 5 2019.38 414.66 414.66 411.20 28
735 2019 6 2019.46 413.92 413.92 411.58 27
736 2019 7 2019.54 411.77 411.77 411.43 23
737 2019 8 2019.62 409.95 409.95 411.84 29

Some data have unusual values like -1 and -99.99.

Let's check the description at the top of the file again.

  1. -1 signifies a missing value for Days, the number of days the equipment was in operation that month.
  2. -99.99 denotes a missing monthly average Avg.

How can we fix this? First, let's explore other aspects of our data. Understanding our data will help us decide what to do with the missing values.



Quality Checks: Reasoning about the data¶

First, we consider the shape of the data. How many rows should we have?

  • If the records are in chronological order with no gaps, we should have one record per month.
  • Data from March 1958 to August 2019.
  • We should have $ 12 \times (2019-1957) - 2 - 4 = 738 $ records.
In [44]:
co2.shape
Out[44]:
(738, 7)

Nice!! The number of rows (i.e. records) matches our expectations.




Let's now check the quality of each feature.

Understanding Missing Value 1: Days¶

Days is a time field, so let's analyze other time fields to see if there is an explanation for the missing days of operation.

Let's start with months Mo.

Are we missing any records? Each month should appear 61 or 62 times (March 1958-August 2019).

In [45]:
co2["Mo"].value_counts().sort_index()
Out[45]:
1     61
2     61
3     62
4     62
5     62
6     62
7     62
8     62
9     61
10    61
11    61
12    61
Name: Mo, dtype: int64

As expected, Jan, Feb, Sep, Oct, Nov, and Dec have 61 occurrences, and the rest have 62.



Next, let's explore Days itself, the number of days in the month on which the measurement equipment worked.

In [46]:
sns.displot(co2['Days']);
plt.title("Distribution of days feature")
plt.show() # suppresses unneeded plotting output

In terms of data quality, a handful of months have averages based on measurements taken on fewer than half the days. In addition, there are nearly 200 missing values--that's about 27% of the data!
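
To put a number on that, we can count the -1 sentinel (which marks missing Days values) directly:

num_missing = (co2['Days'] == -1).sum()
print(num_missing, "months missing Days,",
      round(100 * num_missing / len(co2), 1), "% of records")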



Finally, let's check the last time feature, year Yr.

Let's check to see if there is any connection between missingness and the year of the recording.

In [47]:
sns.scatterplot(x="Yr", y="Days", data=co2);
plt.title("Day field by Year"); # the ; suppresses output

Observations:

  • All of the missing data are in the early years of operation.
  • It appears there may have been problems with equipment in the mid to late 80s.

Potential Next Steps:

  • Confirm these explanations through documentation about the historical readings.
  • Maybe drop the earliest recordings? However, we would want to delay such action until after we have examined the time trends and assessed whether there are any potential problems.



Understanding Missing Value 2: Avg¶

Next, let's return to the -99.99 values in Avg to analyze the overall quality of the CO2 measurements.

In [48]:
# Histograms of average CO2 measurements
sns.displot(co2['Avg']);

The non-missing values are in the 300-400 range (a typical range of CO2 levels).

We also see that there are only a few missing Avg values (<1% of values). Let's examine all of them:

In [49]:
co2[co2["Avg"] < 0]
Out[49]:
Yr Mo DecDate Avg Int Trend Days
3 1958 6 1958.46 -99.99 317.10 314.85 -1
7 1958 10 1958.79 -99.99 312.66 315.61 -1
71 1964 2 1964.12 -99.99 320.07 319.61 -1
72 1964 3 1964.21 -99.99 320.73 319.55 -1
73 1964 4 1964.29 -99.99 321.77 319.48 -1
213 1975 12 1975.96 -99.99 330.59 331.60 0
313 1984 4 1984.29 -99.99 346.84 344.27 2

There doesn't seem to be a pattern to these values, other than that most of these records are also missing Days data.

Drop or Impute Missing Avg Data?¶

Remember we want to fix the following plot:

In [50]:
sns.lineplot(x='DecDate', y='Avg', data=co2)
plt.title("CO2 Average By Month");

Since we are plotting Avg vs DecDate, we should just focus on dealing with missing values for Avg.

Let's consider a few options:

  1. Drop those records
  2. Replace -99.99 with NaN
  3. Substitute it with a likely value for the average CO2?

What do you think are the pros and cons of each possible action?




Let's examine each of these three options.

In [51]:
# 1. Drop missing values
co2_drop = co2[co2['Avg'] > 0]

# 2. Replace -99.99 with NaN
co2_NA = co2.replace(-99.99, np.NaN)

We'll also use a third version of the data. First, we note that the dataset already comes with a substitute value for the -99.99.

From the file description:

The interpolated column includes average values from the preceding column (average) and interpolated values where data are missing. Interpolated values are computed in two steps...

The Int feature has values that exactly match those in Avg, except when Avg is -99.99, in which case a reasonable estimate is used instead. So, the third version of our data will use the Int feature instead of Avg.
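
We can quickly check that claim on the rows where Avg is not missing:

# Wherever Avg is not the -99.99 sentinel, Int should equal Avg exactly
valid = co2['Avg'] > 0
print((co2.loc[valid, 'Avg'] == co2.loc[valid, 'Int']).all())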

In [52]:
# 3. Use interpolated column which estimates missing Avg values
co2_impute = co2.copy()
co2_impute['Avg'] = co2['Int']



What's a reasonable estimate?

To answer this question, let's zoom in on a short time period, say the measurements in 1958 (where we know we have two missing values).

In [53]:
# results of plotting data in 1958

def line_and_points(data, ax, title):
    # assumes single year, hence Mo
    ax.plot('Mo', 'Avg', data=data)
    ax.scatter('Mo', 'Avg', data=data)
    ax.set_xlim(2, 13)
    ax.set_title(title)
    ax.set_xticks(np.arange(3, 13))

def data_year(data, year):
    # filter the data down to a single year
    return data[data["Yr"] == year]
    
# uses matplotlib subplots
# you may see more next week; focus on output for now
fig, axes = plt.subplots(ncols = 3, figsize=(12, 4), sharey=True)

year = 1958
line_and_points(data_year(co2_drop, year), axes[0], title="1. Drop Missing")
line_and_points(data_year(co2_NA, year), axes[1], title="2. Missing Set to NaN")
line_and_points(data_year(co2_impute, year), axes[2], title="3. Missing Interpolated")

fig.suptitle(f"Monthly Averages for {year}")
plt.tight_layout()

In the big picture, since only 7 Avg values are missing (<1% of 738 months), any of these approaches would work.

However, there is some appeal to option 3, imputing:

  • It preserves the seasonal trends in CO2.
  • We are plotting all months in our data as a line plot, so dropped or missing months would leave visible gaps.



Let's replot our original figure with option 3:

In [54]:
sns.lineplot(x='DecDate', y='Avg', data=co2_impute)
plt.title("CO2 Average By Month, Imputed");

Looks pretty close to what we see on the NOAA website!

Presenting the data: A Discussion on Data Granularity¶

From the description:

  • Monthly measurements are averages of daily average measurements.
  • The NOAA GML website has datasets for daily/hourly measurements too.

The data you present depends on your research question.

How do CO2 levels vary by season?

  • You might want to keep average monthly data.
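
For instance, a rough sketch of the seasonal pattern using the imputed monthly data (averaging each calendar month across all years; note that this pooling also mixes in a bit of the long-term trend):

co2_month = co2_impute.groupby('Mo')['Avg'].mean().reset_index()
sns.lineplot(x='Mo', y='Avg', data=co2_month)
plt.title("Average CO2 by Calendar Month");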

Are CO2 levels rising over the past 50+ years, consistent with global warming predictions?

  • You might be happier with a coarser granularity of average year data!
In [55]:
co2_year = co2_impute.groupby('Yr').mean()
sns.lineplot(x='Yr', y='Avg', data=co2_year)
plt.title("CO2 Average By Year");

Indeed, we see a rise by nearly 100 ppm of CO2 since Mauna Loa began recording in 1958.