💽 Lecture 5 (Part 2, Data Storage Types) – Data 100, Spring 2025¶

Data 100, Spring 2025

Acknowledgments Page

In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# Use 2 decimal places instead of scientific notation in pandas
pd.set_option('display.float_format', '{:.2f}'.format)

# Silence some spurious seaborn warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)





🤹‍♀️ File Formats Other than CSV¶

There are many file types for storing structured data: CSV, TSV, JSON, XML, ASCII, SAS...

  • In lecture, we will cover TSV and JSON since pandas supports them out-of-box.

TSV stands for tab-separated values. Think CSV, but with tabs (\t) as the delimiter instead of commas.

  • TSVs can be slightly faster to import than CSVs, since tab characters rarely appear inside fields, so less quoting and escaping is needed.

  • But, historical tradition has led to CSV being the more popular file format.
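As a quick sketch of the round trip between the two formats (using a tiny made-up DataFrame, not the lecture data), pandas can both write and read TSVs by passing sep='\t':

```python
import io

import pandas as pd

# A small example DataFrame (made-up data, for illustration only)
df = pd.DataFrame({"state": ["Alabama", "Alaska"], "cases": [87, 58]})

# to_csv with sep='\t' writes TSV; omitting the path returns a string
tsv_string = df.to_csv(sep="\t", index=False)
print(repr(tsv_string))  # tabs show up as \t

# read_csv with sep='\t' parses the TSV right back into a DataFrame
df_roundtrip = pd.read_csv(io.StringIO(tsv_string), sep="\t")
print(df_roundtrip)
```

The same sep='\t' argument works symmetrically for reading and writing, which is why pandas needs no separate "read_tsv" function.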


Like we did with data/cdc_tuberculosis.csv, you can open data/cdc_tuberculosis.tsv in the JupyterLab Explorer to see its contents.



The pd.read_csv function also reads TSVs if we specify the delimiter with parameter sep='\t' (documentation).

In [3]:
# Identical data to Part 1!
tuberculosis_df_tsv = pd.read_csv("data/cdc_tuberculosis.tsv", sep='\t')
tuberculosis_df_tsv.head()
Out[3]:
Unnamed: 0 No. of TB cases Unnamed: 2 Unnamed: 3
0 U.S. jurisdiction 2019 2020 2021
1 Total 8,900 7,173 7,860
2 Alabama 87 72 92
3 Alaska 58 58 58
4 Arizona 183 136 129
In [4]:
# CSV representation
with open("data/cdc_tuberculosis.csv", "r") as f:
    for _, row in zip(range(4), f):
        print(row) # print raw strings
,No. of TB cases,,

U.S. jurisdiction,2019,2020,2021

Total,"8,900","7,173","7,860"

Alabama,87,72,92

In [5]:
# TSV representation
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for _, row in zip(range(4), f):
        print(row) # print raw strings
	No. of TB cases		

U.S. jurisdiction	2019	2020	2021

Total	"8,900"	"7,173"	"7,860"

Alabama	87	72	92

In [6]:
# Print literal \t instead of tabbed spaces:
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for _, row in zip(range(4), f):
        print(repr(row)) # print raw strings
'\tNo. of TB cases\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\n'
'Alabama\t87\t72\t92\n'



Instructor Note: Return to Slides!




🪆 JSON (JavaScript Object Notation)¶

The congress.gov API (Application Programming Interface) provides data about the activities and members of the United States Congress (i.e., the House of Representatives and the Senate).

  • Click the link above to see the kinds of information provided by the API.

To get a JSON file containing information about the current members of Congress from California, you could use the following API call:

  • https://api.congress.gov/v3/member/CA?api_key=[INSERT_KEY]&limit=250&format=json&currentMember=True

  • You can instantly sign up for a congress.gov API key here. Once you have your key, replace [INSERT_KEY] above with your key, and enter the API call as a URL in your browser. What happens?

  • Once the JSON from the API call is visible in your browser, you can click File --> Save Page As to save the JSON file to your computer.

  • Broadly speaking, API keys are used to track how much a given user engages with the API. There might be limits on the number of API calls (e.g., the congress.gov API limits users to 5,000 calls per hour), or a cost per API call (e.g., using the OpenAI API to interact with ChatGPT programmatically).
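The API call above can also be assembled programmatically. A minimal sketch, which only builds the URL string (the key below is a placeholder — substitute your own, and fetch the URL with a library like requests if you want the JSON directly):

```python
from urllib.parse import urlencode

# Placeholder -- replace with your own congress.gov API key
API_KEY = "INSERT_KEY"

params = {
    "api_key": API_KEY,
    "limit": 250,
    "format": "json",
    "currentMember": "True",
}

# Assemble the same API call shown above as a URL string
url = "https://api.congress.gov/v3/member/CA?" + urlencode(params)
print(url)
```

Building the query string with urlencode (rather than concatenating by hand) takes care of escaping any special characters in the parameter values.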


For convenience, the JSON file from the call above has already been downloaded for you and is saved at data/ca-congress-members.json.

📁 File contents¶

Let's look at data/ca-congress-members.json in the JupyterLab Explorer.

  • Right-click the file, and then click Open With --> Editor.

  • You'll notice that JSON looks a lot like a Python dictionary.

  • Berkeley, CA is in the 12th district. Can you find our representative in Congress?

Note: In general, it's a good idea to check the file size before opening a file in JupyterLab. Very large files can cause crashes. See os.path.getsize documentation.
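As a sketch of that file-size check (written against a temporary file so it runs anywhere — in the notebook you would pass data/ca-congress-members.json instead):

```python
import os
import tempfile

# Write a small temporary file so this example is self-contained;
# in practice you would check an existing file like the JSON above.
with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
    f.write('{"members": []}')
    path = f.name

# os.path.getsize returns the file size in bytes
size_bytes = os.path.getsize(path)
print(f"{path}: {size_bytes} bytes")

os.remove(path)
```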

We can programmatically view the first couple lines of the file using the same functions we used with CSVs:

In [7]:
congress_file = "data/ca-congress-members.json"

# Inspect the first five lines of the file
with open(congress_file, "r") as f:
    for i, row in enumerate(f):
        print(row)
        if i >= 4: break
{

    "members": [

        {

            "bioguideId": "T000491",

            "depiction": {

🐍 EDA: Digging into JSON with Python¶

JSON data closely matches the internal Python object model.

  • In the following cell, we import the entire JSON datafile into a Python dictionary using the json package.
In [8]:
import json

# Import the JSON file into Python as a dictionary
with open(congress_file, "rb") as f:
    congress_json = json.load(f)

type(congress_json)
Out[8]:
dict

The congress_json variable is a dictionary encoding the data in the JSON file.

Below, we access the first element of the members element of the congress_json dictionary.

  • This first element is also a dictionary (and there are more dictionaries inside of it!)
In [9]:
# Grab the list corresponding to the `members` key in the JSON dictionary, 
# and then grab the first element of this list.
# In a moment, we'll see how we knew to use the key `members`, and that
# the resulting object is a list.
congress_json['members'][0]
Out[9]:
{'bioguideId': 'T000491',
 'depiction': {'attribution': 'Image courtesy of the Member',
  'imageUrl': 'https://www.congress.gov/img/member/6774606d0b34857ecc9091a9_200.jpg'},
 'district': 45,
 'name': 'Tran, Derek',
 'partyName': 'Democratic',
 'state': 'California',
 'terms': {'item': [{'chamber': 'House of Representatives',
    'startYear': 2025}]},
 'updateDate': '2025-01-21T18:00:52Z',
 'url': 'https://api.congress.gov/v3/member/T000491?format=json'}

How should we probe a nested dictionary like congress_json?

We can start by identifying the top-level keys of the dictionary:

In [10]:
# Grab the top-level keys of the JSON dictionary
congress_json.keys()
Out[10]:
dict_keys(['members', 'pagination', 'request'])

Looks like we have three top-level keys: members, pagination, and request.

You'll often see a top-level meta key in JSON files. This does not refer to Meta (formerly Facebook). Instead, it typically refers to metadata (data about the data). Metadata are often maintained alongside the data.

Let's check the type of the members element:

In [11]:
type(congress_json['members'])
Out[11]:
list

Looks like a list! What are the first two elements?

In [12]:
congress_json['members'][:2]
Out[12]:
[{'bioguideId': 'T000491',
  'depiction': {'attribution': 'Image courtesy of the Member',
   'imageUrl': 'https://www.congress.gov/img/member/6774606d0b34857ecc9091a9_200.jpg'},
  'district': 45,
  'name': 'Tran, Derek',
  'partyName': 'Democratic',
  'state': 'California',
  'terms': {'item': [{'chamber': 'House of Representatives',
     'startYear': 2025}]},
  'updateDate': '2025-01-21T18:00:52Z',
  'url': 'https://api.congress.gov/v3/member/T000491?format=json'},
 {'bioguideId': 'M001241',
  'depiction': {'attribution': 'Image courtesy of the Member',
   'imageUrl': 'https://www.congress.gov/img/member/67744ed90b34857ecc909155_200.jpg'},
  'district': 47,
  'name': 'Min, Dave',
  'partyName': 'Democratic',
  'state': 'California',
  'terms': {'item': [{'chamber': 'House of Representatives',
     'startYear': 2025}]},
  'updateDate': '2025-01-21T18:00:52Z',
  'url': 'https://api.congress.gov/v3/member/M001241?format=json'}]

More dictionaries! You can repeat the process above to traverse the nested dictionary.

You'll notice that each record of congress_json['members'] looks like it could be a row of a DataFrame.

  • The keys look a lot like column names, and the values could be the entries in each row.

But, the two other elements of congress_json don't have the same structure as congress_json['members'].

  • So, they probably don't belong in a DataFrame containing the members of Congress from CA.

  • We'll see the implications of this inconsistency in the next section.
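One way to probe an unfamiliar nested structure without printing everything is a small recursive helper. This is a sketch, not part of the lecture code, and the sample dictionary below just mimics the shape of congress_json:

```python
def structure(obj, indent=0):
    """Return lines describing the keys and container types of nested JSON data."""
    lines = []
    prefix = "  " * indent
    if isinstance(obj, dict):
        for key, value in obj.items():
            lines.append(f"{prefix}{key}: {type(value).__name__}")
            lines.extend(structure(value, indent + 1))
    elif isinstance(obj, list) and obj:
        # Describe only the first element; JSON lists are usually homogeneous
        lines.append(f"{prefix}[list of {len(obj)} items]")
        lines.extend(structure(obj[0], indent + 1))
    return lines

# A small example mimicking the shape of congress_json
sample = {"members": [{"name": "Tran, Derek", "district": 45}],
          "pagination": {"count": 54}}
print("\n".join(structure(sample)))
```

Running structure(congress_json) would summarize the whole file's skeleton in a few lines instead of pages of output.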

In [13]:
print(congress_json['pagination'])
print(congress_json['request'])
{'count': 54}
{'contentType': 'application/json', 'format': 'json'}

🐼 JSON with pandas¶

pandas has a built-in function, pd.read_json, for reading JSON files.

  • Uncomment the code below and see what happens.
In [14]:
# pd.read_json(congress_file)

Uh oh. Using the default parameters, we got an error.

  • The code above tries to import the entire JSON file located at congress_file (congress_json), including congress_json['pagination'] and congress_json['request'].

  • We only want to make a DataFrame out of congress_json['members'].

This time, let's try converting the members element of congress_json to a DataFrame by using pd.DataFrame:

In [15]:
# Convert dictionary to DataFrame
congress_df = pd.DataFrame(congress_json['members'])
congress_df.head()
Out[15]:
bioguideId depiction district name partyName state terms updateDate url
0 T000491 {'attribution': 'Image courtesy of the Member'... 45.00 Tran, Derek Democratic California {'item': [{'chamber': 'House of Representative... 2025-01-21T18:00:52Z https://api.congress.gov/v3/member/T000491?for...
1 M001241 {'attribution': 'Image courtesy of the Member'... 47.00 Min, Dave Democratic California {'item': [{'chamber': 'House of Representative... 2025-01-21T18:00:52Z https://api.congress.gov/v3/member/M001241?for...
2 K000400 {'attribution': 'Image courtesy of the Member'... 37.00 Kamlager-Dove, Sydney Democratic California {'item': [{'chamber': 'House of Representative... 2025-01-21T18:00:52Z https://api.congress.gov/v3/member/K000400?for...
3 G000598 {'attribution': 'Image courtesy of the Member'... 42.00 Garcia, Robert Democratic California {'item': [{'chamber': 'House of Representative... 2025-01-21T18:00:52Z https://api.congress.gov/v3/member/G000598?for...
4 K000397 {'attribution': 'Image courtesy of the Member'... 40.00 Kim, Young Republican California {'item': [{'chamber': 'House of Representative... 2025-01-21T18:00:52Z https://api.congress.gov/v3/member/K000397?for...

We've successfully begun to rectangularize our JSON data!
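Notice that the depiction and terms columns still contain dictionaries. One hedged sketch of flattening them uses pd.json_normalize, shown here on two made-up records that mimic the shape of congress_json['members'] (the URLs are placeholders):

```python
import pandas as pd

# Two records mimicking the shape of congress_json['members'] (sample data)
members = [
    {"bioguideId": "T000491", "name": "Tran, Derek",
     "depiction": {"attribution": "Image courtesy of the Member",
                   "imageUrl": "https://example.com/T000491.jpg"}},
    {"bioguideId": "M001241", "name": "Min, Dave",
     "depiction": {"attribution": "Image courtesy of the Member",
                   "imageUrl": "https://example.com/M001241.jpg"}},
]

# json_normalize flattens nested dictionaries into dotted column names,
# e.g. depiction.attribution and depiction.imageUrl
flat = pd.json_normalize(members)
print(flat.columns.tolist())
```

Passing congress_json['members'] to pd.json_normalize would flatten the real data the same way, one level of nesting per dot in the column name.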




Instructor Note: Return to Slides!



🕰️ Temporality¶

Let's briefly look at how we can use pandas dt accessors to work with dates/times in a dataset.

We will use the Berkeley Police Department (PD) Calls for Service dataset.

In [16]:
calls = pd.read_csv("data/Berkeley_PD_-_Calls_for_Service.csv")
calls.head()
Out[16]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 04/19/2021 12:00:00 AM 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 02/13/2021 12:00:00 AM 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 02/08/2021 12:00:00 AM 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

Looks like there are three columns with dates/times: EVENTDT, EVENTTM, and InDbDate.

  • EVENTDT is the date the event took place

  • EVENTTM is the time of day the event took place (in 24-hour format)

  • InDbDate is the date the call was entered into the database.

We can convert these string columns to datetime objects using the pd.to_datetime function.

In [17]:
# pd.to_datetime() is smart -- It can often infer what you want based on
# the format of the datetime string.
# But, it's not always right! It's good practice to specify the format of 
# your datetimes. See the documentation and the `format` argument.

# Without format specified:
calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"])

# With format specified:
# calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"], format='%m/%d/%Y %I:%M:%S %p')

calls.head()
/tmp/ipykernel_99/2275392033.py:7: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"])
Out[17]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 2021-04-01 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 2021-04-01 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 2021-04-19 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 2021-02-13 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 2021-02-08 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

Now we can use the dt accessor on this column.

We can get the month:

In [18]:
# 1 - January, 2 - February, ..., 12 - December
calls["EVENTDT"].dt.month
Out[18]:
0        4
1        4
2        4
3        2
4        2
        ..
2627    12
2628     2
2629     3
2630     4
2631     2
Name: EVENTDT, Length: 2632, dtype: int32

Which day of the week the date is on:

In [19]:
# 0 - Monday, 1 - Tuesday, ..., 6 - Sunday
calls["EVENTDT"].dt.dayofweek
Out[19]:
0       3
1       3
2       0
3       5
4       0
       ..
2627    0
2628    2
2629    2
2630    5
2631    4
Name: EVENTDT, Length: 2632, dtype: int32

We can also sort by datetime:

In [20]:
# Sort the DataFrame by datetime to find the earliest call.
calls.sort_values("EVENTDT").head()
Out[20]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
2513 20057398 BURGLARY COMMERCIAL 2020-12-17 16:05 BURGLARY - COMMERCIAL 4 06/15/2021 12:00:00 AM 600 BLOCK GILMAN ST\nBerkeley, CA\n(37.878405,... 600 BLOCK GILMAN ST Berkeley CA
624 20057207 ASSAULT/BATTERY MISD. 2020-12-17 16:50 ASSAULT 4 06/15/2021 12:00:00 AM 2100 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.871... 2100 BLOCK SHATTUCK AVE Berkeley CA
154 20092214 THEFT FROM AUTO 2020-12-17 18:30 LARCENY - FROM VEHICLE 4 06/15/2021 12:00:00 AM 800 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.8918... 800 BLOCK SHATTUCK AVE Berkeley CA
659 20057324 THEFT MISD. (UNDER $950) 2020-12-17 15:44 LARCENY 4 06/15/2021 12:00:00 AM 1800 BLOCK 4TH ST\nBerkeley, CA\n(37.869888, -... 1800 BLOCK 4TH ST Berkeley CA
993 20057573 BURGLARY RESIDENTIAL 2020-12-17 22:15 BURGLARY - RESIDENTIAL 4 06/15/2021 12:00:00 AM 1700 BLOCK STUART ST\nBerkeley, CA\n(37.857495... 1700 BLOCK STUART ST Berkeley CA

We can also do many things with the dt accessor like switching time zones and converting time back to UNIX/POSIX time. Check out the documentation on .dt accessor and time series/date functionality.
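As a minimal sketch of the time-zone operations mentioned above (using two made-up timestamps, not the calls data): naive datetimes carry no time zone, so you localize them first with dt.tz_localize, then convert with dt.tz_convert.

```python
import pandas as pd

# A tiny example Series of naive timestamps (sample data)
times = pd.to_datetime(pd.Series(["2021-04-01 10:58", "2021-02-08 06:20"]))

# Attach a time zone to the naive timestamps, then convert to another zone
utc_times = times.dt.tz_localize("UTC")
pacific_times = utc_times.dt.tz_convert("US/Pacific")
print(pacific_times)
```

Note that the two timestamps end up with different UTC offsets (-07:00 and -08:00) because the conversion accounts for daylight saving time.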

What type are datetime objects?

In [21]:
calls["EVENTDT"].dtype
Out[21]:
dtype('<M8[ns]')

ns above stands for nanoseconds.

  • <M8 refers to the NumPy type datetime64

Under the hood, datetimes in pandas are stored as integers representing the number of nanoseconds since 1/1/1970 (the Unix epoch) UTC.

In [22]:
# datetimes in pandas are stored as integers representing number of 
# nanoseconds since 1970-01-01
calls["EVENTDT"].astype(int)
Out[22]:
0       1617235200000000000
1       1617235200000000000
2       1618790400000000000
3       1613174400000000000
4       1612742400000000000
               ...         
2627    1608508800000000000
2628    1614124800000000000
2629    1616544000000000000
2630    1619222400000000000
2631    1614297600000000000
Name: EVENTDT, Length: 2632, dtype: int64




Instructor Note: Return to Slides!



🤷 Faithfulness and missing values¶

To conclude, let's very briefly explore missingness in the Berkeley PD Calls for Service dataset.

Looking at the top of the dataframe, we can already see that there are missing values in the BLKADDR column.

In [23]:
calls.head()
Out[23]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 2021-04-01 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 2021-04-01 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 2021-04-19 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 2021-02-13 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 2021-02-08 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

We can use the .isna() method to get a sense of how often values in BLKADDR are missing.

  • The .isnull() method is functionally equivalent.
In [24]:
# isna() returns a Series of booleans indicating whether each element
# in the Series is missing.
print(calls['BLKADDR'].isna().head())

# The mean of a Series of booleans is the proportion of booleans that are True.
calls['BLKADDR'].isna().mean()
0     True
1     True
2    False
3    False
4    False
Name: BLKADDR, dtype: bool
Out[24]:
0.007598784194528876

It looks like missing values are actually quite rare: only about 0.8% of records are missing a value in BLKADDR.

Why are these values missing?

  • Again, looking at just the first few rows, we see that NaN values in BLKADDR appear to be accompanied by latitude/longitude coordinates in the Block_Location column.

  • In all likelihood, missing values in BLKADDR correspond to locations that do not have a defined address in the officer's navigation or GPS system.


The best default approach here: Leave the rows with missing BLKADDR untouched, or replace the NaN values with a MISSING indicator.

  • In the future, if we wanted to conduct an analysis of the streets where police incidents were most common, we might impute BLKADDR by using the nearest street, which we could identify with an external package.
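As a sketch of the MISSING-indicator approach (on a tiny made-up sample, not the real calls data), fillna does the replacement in one step:

```python
import numpy as np
import pandas as pd

# A small sample mimicking the BLKADDR column (made-up data)
blkaddr = pd.Series(["2100 BLOCK HASTE ST", np.nan, "1800 BLOCK 4TH ST"])

# Replace missing values with an explicit MISSING indicator
filled = blkaddr.fillna("MISSING")
print(filled)
```

Unlike silently dropping rows, an explicit indicator keeps the missingness visible in later groupbys and counts.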

For a very rough sense of missingness in each column of a DataFrame, you can use the info() method.

  • Based on the output, it looks like the only column with missing values is BLKADDR.
In [25]:
# You can see the total number of rows at the top of the .info() output.
# Compare this to the number of non-null values in the BLKADDR column.
calls.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2632 entries, 0 to 2631
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   CASENO          2632 non-null   int64         
 1   OFFENSE         2632 non-null   object        
 2   EVENTDT         2632 non-null   datetime64[ns]
 3   EVENTTM         2632 non-null   object        
 4   CVLEGEND        2632 non-null   object        
 5   CVDOW           2632 non-null   int64         
 6   InDbDate        2632 non-null   object        
 7   Block_Location  2632 non-null   object        
 8   BLKADDR         2612 non-null   object        
 9   City            2632 non-null   object        
 10  State           2632 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 226.3+ KB




Instructor Note: Return to Slides!