Lecture 5 (Part 2 Examples) – Data 100, Spring 2024¶

Data 100, Spring 2024

Acknowledgments Page

In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)

# Silence some spurious seaborn warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)





Structure: Other File Formats¶

There are many file types for storing structured data: CSV, TSV, JSON, XML, ASCII, SAS...

  • Documentation will be your best friend to understand how to process many of these file types.
  • In lecture, we will cover TSV and JSON, since pandas supports them out of the box.

The pd.read_csv function also reads in TSVs if we specify the delimiter with the parameter sep='\t' (documentation).

In [3]:
tuberculosis_df_tsv = pd.read_csv("data/cdc_tuberculosis.tsv", sep='\t')
tuberculosis_df_tsv.head()
Out[3]:
Unnamed: 0 No. of TB cases Unnamed: 2 Unnamed: 3 TB incidence Unnamed: 5 Unnamed: 6
0 U.S. jurisdiction 2019 2020 2021 2019.00 2020.00 2021.00
1 Total 8,900 7,173 7,860 2.71 2.16 2.37
2 Alabama 87 72 92 1.77 1.43 1.83
3 Alaska 58 58 58 7.91 7.92 7.92
4 Arizona 183 136 129 2.51 1.89 1.77
In [4]:
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for _, row in zip(range(4), f):
        print(repr(row)) # print raw strings
'\tNo. of TB cases\t\t\tTB incidence\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n'
'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'



Return to Slides




JSON¶

The City of Berkeley Open Data website has a dataset with COVID-19 Confirmed Cases among Berkeley residents by date.

Let's first check out this website.

Next, let's download this file, saving it as a JSON (note the source URL file type).

In the interest of reproducible data science, we will download the data programmatically. We have defined some helper functions in the ds100_utils.py file so that we can reuse them in many different notebooks.
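For reference, here is a minimal sketch of what such a caching helper might look like (the actual implementation in ds100_utils.py may differ):

from pathlib import Path
import urllib.request

def fetch_and_cache(data_url, file, data_dir="data", force=False):
    """Download data_url to data_dir/file and return the local path,
    skipping the download if a cached copy exists (unless force=True)."""
    data_dir = Path(data_dir)
    data_dir.mkdir(exist_ok=True)        # create the cache directory if needed
    file_path = data_dir / Path(file)
    if force or not file_path.exists():
        print("Downloading...", end=" ")
        urllib.request.urlretrieve(data_url, file_path)
        print("Done!")
    return file_path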

In [5]:
# just run this cell
from ds100_utils import fetch_and_cache

Occasionally, you will want to modify code that you have imported from a local Python library. To re-import those modifications you can either use the Python importlib library:

from importlib import reload
import ds100_utils
reload(ds100_utils)

or use IPython magic, which will automatically re-import code when files change:

%load_ext autoreload
%autoreload 2
In [6]:
covid_file = fetch_and_cache(
    "https://data.cityofberkeley.info/api/views/xn6j-b766/rows.json?accessType=DOWNLOAD",
    "confirmed-cases.json",
    force=False)
covid_file          # a file path wrapper object
Downloading... Done!
Out[6]:
PosixPath('data/confirmed-cases.json')

File size¶

Often, I like to start my analysis by getting a rough estimate of the size of the data. This will help inform the tools I use and how I view the data. If it is relatively small, I might use a text editor or a spreadsheet to look at the data. If it is larger, I might jump to more programmatic exploration or even use distributed computing tools.

However, here we will use Python tools to probe the file.

Since these seem to be text files I might also want to investigate the number of lines, which often corresponds to the number of records.

In [7]:
import os

print(covid_file, "is", os.path.getsize(covid_file) / 1e6, "MB")
data/confirmed-cases.json is 0.239103 MB
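
We can also count the file's lines directly in Python. A quick sketch:

with open(covid_file, "r") as f:
    line_count = sum(1 for _ in f)   # each line often corresponds to one record
print(covid_file, "has", line_count, "lines")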

As part of your workflow, you should also learn some basic Unix commands, as these are often very handy (in fact, there's an entire book called "Data Science at the Command Line" that explores this idea in depth!).

In Jupyter/IPython, you can prefix lines with ! to execute arbitrary Unix commands, and within those lines, you can refer to Python variables and expressions with the syntax {expr}.

Here, we use the ls command to list files, using the -lh flags, which request "long format with information in human-readable form". We also use the wc command for "word count", but with the -l flag, which asks for line counts instead of words.

These two give us the same information as the code above, albeit in a slightly different form:

In [8]:
!ls -lh {covid_file}
!wc -l {covid_file}
-rw-r--r-- 1 jovyan jovyan 234K Jan 30 05:15 data/confirmed-cases.json
1930 data/confirmed-cases.json

File contents¶

Because we have a text file in a visual IDE like Jupyter/DataHub, I'm going to visually explore the data via the built-in file explorer.

  1. To the Jupyter view!

  2. To the Python view...?

In [9]:
with open(covid_file, "r") as f:
    for i, row in enumerate(f):
        print(repr(row)) # print raw strings
        if i >= 4: break
'{\n'
'  "meta" : {\n'
'    "view" : {\n'
'      "id" : "xn6j-b766",\n'
'      "name" : "COVID-19 Confirmed Cases",\n'

In the same vein, we can use the head Unix command (which is where Pandas' head method comes from!) to see the first few lines of the file:

In [10]:
!head -5 {covid_file}
{
  "meta" : {
    "view" : {
      "id" : "xn6j-b766",
      "name" : "COVID-19 Confirmed Cases",
  3. Back to the Python view.

    In order to load the JSON file into pandas, let's first do some EDA with the Python json package to understand the particular structure of this JSON file so that we can decide what (if anything) to load into pandas.

EDA: Digging into JSON¶

Python has relatively good support for JSON data since it closely matches the internal Python object model. In the following cell we import the entire JSON data file into a Python dictionary using the json package.

In [11]:
import json

with open(covid_file, "rb") as f:
    covid_json = json.load(f)

The covid_json variable is now a dictionary encoding the data in the file:

In [12]:
type(covid_json)
Out[12]:
dict

Examine what keys are in the top level json object¶

We can list the keys to determine what data is stored in the object.

In [13]:
covid_json.keys()
Out[13]:
dict_keys(['meta', 'data'])

Observation: The JSON dictionary contains a meta key, which likely refers to metadata (data about the data). Metadata is often maintained with the data and can be a good source of additional information.


We can investigate the meta data further by examining the keys associated with the metadata.

In [14]:
covid_json['meta'].keys()
Out[14]:
dict_keys(['view'])

The meta key contains another dictionary called view. This likely refers to metadata about a particular "view" of some underlying database. We will learn more about views when we study SQL later in the class.

In [15]:
covid_json['meta']['view'].keys()
Out[15]:
dict_keys(['id', 'name', 'assetType', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'clientContext', 'columns', 'grants', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])

Notice that this is a nested/recursive data structure. As we dig deeper we reveal more and more keys and the corresponding data:

covid_json
|-> data
|   | ... (haven't explored yet)
|-> meta
    |-> view
        | -> id
        | -> name
        | -> attribution
        ...
        | -> description
        ...
        | -> columns
        ...

There is a key called description in the view sub-dictionary. This likely contains a description of the data:

In [16]:
print(covid_json['meta']['view']['description'])
Counts of confirmed COVID-19 cases among Berkeley residents by date. As of 6/21/22, this dataset will be updated weekly instead of daily. As of 11/14/22, this dataset only includes PCR cases.

Examining the Data Field for Records¶

We can look at a few entries in the data field. This is what we'll load into Pandas.

In [17]:
for i in range(3):
    print(f"{i:03} | {covid_json['data'][i]}")
000 | ['row-m3yq.jwuz.yiqh', '00000000-0000-0000-D599-1DEC218DF070', 0, 1706547174, None, 1706547174, None, '{ }', '2019-12-01T00:00:00', '0', '0']
001 | ['row-vkyg-su95-4ay7', '00000000-0000-0000-4493-69E118405BFA', 0, 1706547174, None, 1706547174, None, '{ }', '2019-12-02T00:00:00', '0', '0']
002 | ['row-rgxq.auh3~4ndv', '00000000-0000-0000-1635-FEBBFAC1DFAC', 0, 1706547174, None, 1706547174, None, '{ }', '2019-12-03T00:00:00', '0', '0']

Observations:

  • These look like equal-length records, so maybe data is a table!
  • But what does each value in the record mean? Where can we find column headers?

Back to the metadata.

Columns Metadata¶

Another potentially useful key in the metadata dictionary is columns. It maps to a list of dictionaries, one per column:

In [18]:
covid_json['meta']['view']['columns']
Out[18]:
[{'id': -1,
  'name': 'sid',
  'dataTypeName': 'meta_data',
  'fieldName': ':sid',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'id',
  'dataTypeName': 'meta_data',
  'fieldName': ':id',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'position',
  'dataTypeName': 'meta_data',
  'fieldName': ':position',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'created_at',
  'dataTypeName': 'meta_data',
  'fieldName': ':created_at',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'created_meta',
  'dataTypeName': 'meta_data',
  'fieldName': ':created_meta',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'updated_at',
  'dataTypeName': 'meta_data',
  'fieldName': ':updated_at',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'updated_meta',
  'dataTypeName': 'meta_data',
  'fieldName': ':updated_meta',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': -1,
  'name': 'meta',
  'dataTypeName': 'meta_data',
  'fieldName': ':meta',
  'position': 0,
  'renderTypeName': 'meta_data',
  'format': {},
  'flags': ['hidden']},
 {'id': 542388895,
  'name': 'Date',
  'dataTypeName': 'calendar_date',
  'description': 'Dates reflect the date created in CalREDIE. ',
  'fieldName': 'date',
  'position': 1,
  'renderTypeName': 'calendar_date',
  'tableColumnId': 138291427,
  'cachedContents': {'non_null': '1520',
   'largest': '2024-01-28T00:00:00.000',
   'null': '0',
   'top': [{'item': '2023-02-27T00:00:00.000', 'count': '1'},
    {'item': '2023-03-15T00:00:00.000', 'count': '1'},
    {'item': '2023-04-04T00:00:00.000', 'count': '1'},
    {'item': '2021-01-30T00:00:00.000', 'count': '1'},
    {'item': '2023-04-27T00:00:00.000', 'count': '1'},
    {'item': '2020-05-08T00:00:00.000', 'count': '1'},
    {'item': '2023-12-02T00:00:00.000', 'count': '1'},
    {'item': '2022-05-03T00:00:00.000', 'count': '1'},
    {'item': '2021-06-04T00:00:00.000', 'count': '1'},
    {'item': '2020-10-28T00:00:00.000', 'count': '1'},
    {'item': '2021-05-04T00:00:00.000', 'count': '1'},
    {'item': '2020-01-13T00:00:00.000', 'count': '1'},
    {'item': '2022-03-12T00:00:00.000', 'count': '1'},
    {'item': '2022-05-13T00:00:00.000', 'count': '1'},
    {'item': '2021-12-15T00:00:00.000', 'count': '1'},
    {'item': '2020-07-10T00:00:00.000', 'count': '1'},
    {'item': '2021-09-11T00:00:00.000', 'count': '1'},
    {'item': '2020-08-11T00:00:00.000', 'count': '1'},
    {'item': '2020-04-19T00:00:00.000', 'count': '1'},
    {'item': '2023-11-19T00:00:00.000', 'count': '1'}],
   'smallest': '2019-12-01T00:00:00.000',
   'count': '1520',
   'cardinality': '1520'},
  'format': {'view': 'date'}},
 {'id': 542388893,
  'name': 'New Cases',
  'dataTypeName': 'number',
  'description': 'Total number of new cases reported by date created in CalREDIE. ',
  'fieldName': 'bklhj_newcases',
  'position': 2,
  'renderTypeName': 'number',
  'tableColumnId': 98765830,
  'cachedContents': {'non_null': '1520',
   'largest': '325',
   'null': '0',
   'top': [{'item': '0', 'count': '150'},
    {'item': '2', 'count': '111'},
    {'item': '4', 'count': '103'},
    {'item': '1', 'count': '102'},
    {'item': '3', 'count': '100'},
    {'item': '5', 'count': '82'},
    {'item': '6', 'count': '81'},
    {'item': '7', 'count': '63'},
    {'item': '8', 'count': '51'},
    {'item': '9', 'count': '50'},
    {'item': '11', 'count': '43'},
    {'item': '10', 'count': '38'},
    {'item': '12', 'count': '36'},
    {'item': '13', 'count': '36'},
    {'item': '16', 'count': '25'},
    {'item': '17', 'count': '23'},
    {'item': '15', 'count': '23'},
    {'item': '19', 'count': '22'},
    {'item': '18', 'count': '21'},
    {'item': '14', 'count': '21'}],
   'smallest': '0',
   'count': '1520',
   'cardinality': '115'},
  'format': {}},
 {'id': 542388894,
  'name': 'Cumulative Cases',
  'dataTypeName': 'number',
  'description': 'Total number of cumulative cases reported by date created in CalREDIE. ',
  'fieldName': 'bklhj_cumulcases',
  'position': 3,
  'renderTypeName': 'number',
  'tableColumnId': 98765829,
  'cachedContents': {'non_null': '1520',
   'largest': '24677',
   'null': '0',
   'top': [{'item': '0', 'count': '35'},
    {'item': '3', 'count': '21'},
    {'item': '6', 'count': '15'},
    {'item': '5', 'count': '8'},
    {'item': '2', 'count': '6'},
    {'item': '7', 'count': '6'},
    {'item': '3609', 'count': '5'},
    {'item': '68', 'count': '4'},
    {'item': '58', 'count': '4'},
    {'item': '9', 'count': '4'},
    {'item': '8', 'count': '4'},
    {'item': '24677', 'count': '4'},
    {'item': '53', 'count': '4'},
    {'item': '3605', 'count': '3'},
    {'item': '82', 'count': '3'},
    {'item': '35', 'count': '3'},
    {'item': '3621', 'count': '3'},
    {'item': '80', 'count': '3'},
    {'item': '3612', 'count': '3'},
    {'item': '100', 'count': '3'}],
   'smallest': '0',
   'count': '1520',
   'cardinality': '1371'},
  'format': {}}]

Let's go back to the file explorer.

Based on the contents of this key, what are reasonable names for each column in the data table?
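
One way to answer this in code is to pull the name field out of each column dictionary (we will use exactly this list when we build the DataFrame below):

[c['name'] for c in covid_json['meta']['view']['columns']]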

You can also get the view that Jupyter provides in the file explorer by using Python. This displays our JSON object as an interactive graphical object with a built-in search box:

In [19]:
from IPython.display import JSON
JSON(covid_json)
Out[19]:
<IPython.core.display.JSON object>

Summary of exploring the JSON file¶

  1. The above metadata tells us a lot about the columns in the data, including column names, potential data anomalies, and some basic statistics.
  2. Because of its non-tabular structure, JSON makes it easier (than CSV) to create self-documenting data, meaning that information about the data is stored in the same file as the data.
  3. Self-documenting data can be helpful since it maintains its own description, and these descriptions are more likely to be updated as the data changes.

JSON with pandas¶

After our above EDA, let's finally go about loading the data (not the metadata) into a pandas dataframe.

In the following block of code we:

  1. Translate the JSON records into a dataframe:

    • fields: covid_json['meta']['view']['columns']
    • records: covid_json['data']
  2. Remove columns that have no metadata description. This would be a bad idea in general, but here we remove these columns since the above analysis suggests that they are unlikely to contain useful information. (A sketch of this step appears after the table below.)

  3. Examine the tail of the table.

In [20]:
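# Note: naively calling pd.read_json on this file does not yield a tidy
# table, since the records are nested under the "data" key. We instead
# build the DataFrame ourselves in the next cell.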
# pd.read_json(covid_file)
In [21]:
# Load the data from JSON and assign column titles
covid = pd.DataFrame(
    covid_json['data'],
    columns=[c['name'] for c in covid_json['meta']['view']['columns']])

covid.tail()
Out[21]:
sid id position created_at created_meta updated_at updated_meta meta Date New Cases Cumulative Cases
1515 row-wtag_7ui7_ugvx 00000000-0000-0000-8CFB-0570B340211A 0 1706547174 None 1706547174 None { } 2024-01-24T00:00:00 1 24675
1516 row-kaag.gtts.uxmj 00000000-0000-0000-FAF3-FA09CAA6895B 0 1706547174 None 1706547174 None { } 2024-01-25T00:00:00 2 24677
1517 row-qua2_nqsi-pq44 00000000-0000-0000-578E-F61FD9B3EF33 0 1706547174 None 1706547174 None { } 2024-01-26T00:00:00 0 24677
1518 row-7t55~tn39-a8ny 00000000-0000-0000-51B8-53476D830328 0 1706547174 None 1706547174 None { } 2024-01-27T00:00:00 0 24677
1519 row-9m4f_f28k~sxr3 00000000-0000-0000-767B-A43C55856468 0 1706547174 None 1706547174 None { } 2024-01-28T00:00:00 0 24677
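
Step 2 from the list above is not shown in the cell itself; a minimal sketch of it, keeping only the columns whose metadata carries a description, might look like:

# Keep only the columns whose metadata includes a 'description' field
# (here: Date, New Cases, and Cumulative Cases)
described = [c['name'] for c in covid_json['meta']['view']['columns']
             if 'description' in c]
covid = covid[described]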




Return to Slides



Temporality¶

Let's briefly look at how we can use pandas dt accessors to work with dates/times in a dataset.

We will use the dataset from Lab 3: the Berkeley PD Calls for Service dataset.

In [22]:
calls = pd.read_csv("data/Berkeley_PD_-_Calls_for_Service.csv")
calls.head()
Out[22]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 04/19/2021 12:00:00 AM 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 02/13/2021 12:00:00 AM 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 02/08/2021 12:00:00 AM 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

Looks like there are three columns with dates/times: EVENTDT, EVENTTM, and InDbDate.

Most likely, EVENTDT stands for the date the event took place, EVENTTM for the time of day it took place (in 24-hour format), and InDbDate for the date the call was recorded into the database.

If we check the data type of these columns, we will see they are stored as strings. We can convert them to datetime objects using the pandas to_datetime function.

In [23]:
calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"],
                                 format = "%m/%d/%Y %I:%M:%S %p")
calls.head()
Out[23]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 2021-04-01 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 2021-04-01 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 2021-04-19 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 2021-02-13 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 2021-02-08 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

Now we can use the dt accessor on this column.

We can get the month:

In [24]:
calls["EVENTDT"].dt.month
Out[24]:
0        4
1        4
2        4
3        2
4        2
        ..
2627    12
2628     2
2629     3
2630     4
2631     2
Name: EVENTDT, Length: 2632, dtype: int32
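
As a quick follow-up (a sketch, not part of the original lecture output), we could tally how the calls distribute across months:

calls["EVENTDT"].dt.month.value_counts().sort_index()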

We can also get the day of the week the date falls on:

In [25]:
calls["EVENTDT"].dt.dayofweek
Out[25]:
0       3
1       3
2       0
3       5
4       0
       ..
2627    0
2628    2
2629    2
2630    5
2631    4
Name: EVENTDT, Length: 2632, dtype: int32

Check the minimum values to see if there are any suspicious-looking 1970s dates (a common sign of timestamps that were never set and defaulted to the UNIX epoch):

In [26]:
calls.sort_values("EVENTDT").head()
Out[26]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
2513 20057398 BURGLARY COMMERCIAL 2020-12-17 16:05 BURGLARY - COMMERCIAL 4 06/15/2021 12:00:00 AM 600 BLOCK GILMAN ST\nBerkeley, CA\n(37.878405,... 600 BLOCK GILMAN ST Berkeley CA
624 20057207 ASSAULT/BATTERY MISD. 2020-12-17 16:50 ASSAULT 4 06/15/2021 12:00:00 AM 2100 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.871... 2100 BLOCK SHATTUCK AVE Berkeley CA
154 20092214 THEFT FROM AUTO 2020-12-17 18:30 LARCENY - FROM VEHICLE 4 06/15/2021 12:00:00 AM 800 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.8918... 800 BLOCK SHATTUCK AVE Berkeley CA
659 20057324 THEFT MISD. (UNDER $950) 2020-12-17 15:44 LARCENY 4 06/15/2021 12:00:00 AM 1800 BLOCK 4TH ST\nBerkeley, CA\n(37.869888, -... 1800 BLOCK 4TH ST Berkeley CA
993 20057573 BURGLARY RESIDENTIAL 2020-12-17 22:15 BURGLARY - RESIDENTIAL 4 06/15/2021 12:00:00 AM 1700 BLOCK STUART ST\nBerkeley, CA\n(37.857495... 1700 BLOCK STUART ST Berkeley CA

Doesn't look like it! We are good!

We can also do many things with the dt accessor, like switching time zones and converting time back to UNIX/POSIX time. Check out the documentation on the .dt accessor and on time series/date functionality.
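
For example, here is a sketch of both operations, assuming the events are recorded in the US/Pacific time zone:

# Attach a time zone to the (naive) timestamps -- US/Pacific is an assumption
eventdt_pacific = calls["EVENTDT"].dt.tz_localize("US/Pacific")

# Convert to UNIX/POSIX time: seconds since 1970-01-01 00:00:00 UTC
posix_seconds = (eventdt_pacific - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta("1s")
posix_seconds.head()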