Lecture 5 – Data 100, Fall 2023¶

Data 100, Fall 2023

Acknowledgments Page

A demo of data cleaning and exploratory data analysis using the CDC Tuberculosis data and the Mauna Loa CO2 data.

In [1]:
import numpy as np
import pandas as pd
In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 9)

sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)

# Silence some spurious seaborn warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

Structure: Multiple Files¶

Let's continue from where we left off last time. We loaded in the CDC Tuberculosis dataset, did some wrangling by inspecting it, and ended up with something like below.

In [3]:
rename_dict = {'2019': 'TB cases 2019',
               '2020': 'TB cases 2020',
               '2021': 'TB cases 2021',
               '2019.1': 'TB incidence 2019',
               '2020.1': 'TB incidence 2020',
               '2021.1': 'TB incidence 2021'}

tb_df = (
    pd.read_csv("data/cdc_tuberculosis.csv", header=1, thousands=',')
    .rename(columns=rename_dict)
)
tb_df = tb_df[1:] #Get rid of the first summary row
tb_df.head()
Out[3]:
U.S. jurisdiction TB cases 2019 TB cases 2020 TB cases 2021 TB incidence 2019 TB incidence 2020 TB incidence 2021
1 Alabama 87 72 92 1.77 1.43 1.83
2 Alaska 58 58 58 7.91 7.92 7.92
3 Arizona 183 136 129 2.51 1.89 1.77
4 Arkansas 64 59 69 2.12 1.96 2.28
5 California 2111 1706 1750 5.35 4.32 4.46

Gather Census Data¶

U.S. Census population estimates source (2019), source (2020-2021).

Running the below cells cleans the data. We encourage you to closely explore the CSV and study these lines after lecture...

There are a few new methods here:

  • df.convert_dtypes() (documentation) conveniently converts each column to the best available (nullable) dtype; for example, float columns that hold only whole numbers become integer columns. It is out of scope for this class (a tiny illustration follows this list).
  • df.dropna() (documentation) will be explained in more detail next time.
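Here is a tiny, hedged illustration of what these two methods do (the toy DataFrame below is made up for this sketch and is not part of the lecture data):

demo = pd.DataFrame({"state": ["Alabama", "Alaska", None],
                     "pop": [4903185.0, 731545.0, np.nan]})
demo.convert_dtypes()   # "pop" becomes a nullable Int64 column, "state" becomes string
demo.dropna()           # drops the last row, which contains missing values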
In [4]:
# 2010s census data
census_2010s_df = pd.read_csv("data/nst-est2019-01.csv", header=3, thousands=",")
census_2010s_df = (
    census_2010s_df
    .reset_index()
    .drop(columns=["index", "Census", "Estimates Base"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()                 # "smart" converting of columns, use at your own risk
    .dropna()                         # we'll introduce this very soon
)
census_2010s_df['Geographic Area'] = census_2010s_df['Geographic Area'].str.strip('.')

# with pd.option_context('display.min_rows', 30): # shows more rows
#     display(census_2010s_df)
    
census_2010s_df.head(10)
Out[4]:
Geographic Area 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
0 United States 309321666 311556874 313830990 315993715 318301008 320635163 322941311 324985539 326687501 328239523
1 Northeast 55380134 55604223 55775216 55901806 56006011 56034684 56042330 56059240 56046620 55982803
2 Midwest 66974416 67157800 67336743 67560379 67745167 67860583 67987540 68126781 68236628 68329004
3 South 114866680 116006522 117241208 118364400 119624037 120997341 122351760 123542189 124569433 125580448
4 West 72100436 72788329 73477823 74167130 74925793 75742555 76559681 77257329 77834820 78347268
5 Alabama 4785437 4799069 4815588 4830081 4841799 4852347 4863525 4874486 4887681 4903185
6 Alaska 713910 722128 730443 737068 736283 737498 741456 739700 735139 731545
7 Arizona 6407172 6472643 6554978 6632764 6730413 6829676 6941072 7044008 7158024 7278717
8 Arkansas 2921964 2940667 2952164 2959400 2967392 2978048 2989918 3001345 3009733 3017804
9 California 37319502 37638369 37948800 38260787 38596972 38918045 39167117 39358497 39461588 39512223
In [5]:
# census 2020s data
census_2020s_df = pd.read_csv("data/NST-EST2022-POP.csv", header=3, thousands=",")
census_2020s_df = (
    census_2020s_df
    .reset_index()
    .drop(columns=["index", "Unnamed: 1"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()    # "smart" converting of columns, use at your own risk
    .dropna()            # we'll introduce this next time
)
census_2020s_df['Geographic Area'] = census_2020s_df['Geographic Area'].str.strip('.')

census_2020s_df
Out[5]:
Geographic Area 2020 2021 2022
0 United States 331511512 332031554 333287557
1 Northeast 57448898 57259257 57040406
2 Midwest 68961043 68836505 68787595
3 South 126450613 127346029 128716192
4 West 78650958 78589763 78743364
... ... ... ... ...
52 Washington 7724031 7740745 7785786
53 West Virginia 1791420 1785526 1775156
54 Wisconsin 5896271 5880101 5892539
55 Wyoming 577605 579483 581381
57 Puerto Rico 3281557 3262693 3221789

57 rows × 4 columns

Join Data (Merge DataFrames)¶

Time to merge!

In [6]:
# merge TB dataframe with two US census dataframes
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .merge(right=census_2020s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
)
tb_census_df
Out[6]:
U.S. jurisdiction TB cases 2019 TB cases 2020 TB cases 2021 TB incidence 2019 TB incidence 2020 TB incidence 2021 Geographic Area_x 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 Geographic Area_y 2020 2021 2022
0 Alabama 87 72 92 1.77 1.43 1.83 Alabama 4785437 4799069 4815588 4830081 4841799 4852347 4863525 4874486 4887681 4903185 Alabama 5031362 5049846 5074296
1 Alaska 58 58 58 7.91 7.92 7.92 Alaska 713910 722128 730443 737068 736283 737498 741456 739700 735139 731545 Alaska 732923 734182 733583
2 Arizona 183 136 129 2.51 1.89 1.77 Arizona 6407172 6472643 6554978 6632764 6730413 6829676 6941072 7044008 7158024 7278717 Arizona 7179943 7264877 7359197
3 Arkansas 64 59 69 2.12 1.96 2.28 Arkansas 2921964 2940667 2952164 2959400 2967392 2978048 2989918 3001345 3009733 3017804 Arkansas 3014195 3028122 3045637
4 California 2111 1706 1750 5.35 4.32 4.46 California 37319502 37638369 37948800 38260787 38596972 38918045 39167117 39358497 39461588 39512223 California 39501653 39142991 39029342
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
46 Virginia 191 169 161 2.23 1.96 1.86 Virginia 8023699 8101155 8185080 8252427 8310993 8361808 8410106 8463587 8501286 8535519 Virginia 8636471 8657365 8683619
47 Washington 221 163 199 2.90 2.11 2.57 Washington 6742830 6826627 6897058 6963985 7054655 7163657 7294771 7423362 7523869 7614893 Washington 7724031 7740745 7785786
48 West Virginia 9 13 7 0.50 0.73 0.39 West Virginia 1854239 1856301 1856872 1853914 1849489 1842050 1831023 1817004 1804291 1792147 West Virginia 1791420 1785526 1775156
49 Wisconsin 51 35 66 0.88 0.59 1.12 Wisconsin 5690475 5705288 5719960 5736754 5751525 5760940 5772628 5790186 5807406 5822434 Wisconsin 5896271 5880101 5892539
50 Wyoming 1 0 3 0.17 0.00 0.52 Wyoming 564487 567299 576305 582122 582531 585613 584215 578931 577601 578759 Wyoming 577605 579483 581381

51 rows × 22 columns

This is a little unwieldy. We could either drop the unneeded columns now, or just merge on smaller census DataFrames. Let's do the latter.

In [7]:
# try merging again, but cleaner this time
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df[["Geographic Area", "2019"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
    .merge(right=census_2020s_df[["Geographic Area", "2020", "2021"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
)
tb_census_df
Out[7]:
U.S. jurisdiction TB cases 2019 TB cases 2020 TB cases 2021 TB incidence 2019 TB incidence 2020 TB incidence 2021 2019 2020 2021
0 Alabama 87 72 92 1.77 1.43 1.83 4903185 5031362 5049846
1 Alaska 58 58 58 7.91 7.92 7.92 731545 732923 734182
2 Arizona 183 136 129 2.51 1.89 1.77 7278717 7179943 7264877
3 Arkansas 64 59 69 2.12 1.96 2.28 3017804 3014195 3028122
4 California 2111 1706 1750 5.35 4.32 4.46 39512223 39501653 39142991
... ... ... ... ... ... ... ... ... ... ...
46 Virginia 191 169 161 2.23 1.96 1.86 8535519 8636471 8657365
47 Washington 221 163 199 2.90 2.11 2.57 7614893 7724031 7740745
48 West Virginia 9 13 7 0.50 0.73 0.39 1792147 1791420 1785526
49 Wisconsin 51 35 66 0.88 0.59 1.12 5822434 5896271 5880101
50 Wyoming 1 0 3 0.17 0.00 0.52 578759 577605 579483

51 rows × 10 columns

Reproduce incidence¶

Let's recompute incidence to make sure we know where the original CDC numbers came from.

From the CDC report: TB incidence is computed as “Cases per 100,000 persons using mid-year population estimates from the U.S. Census Bureau.”

If we define a group as 100,000 people, then we can compute the TB incidence for a given state population as

$$\text{TB incidence} = \frac{\text{\# TB cases in population}}{\text{\# groups in population}} = \frac{\text{\# TB cases in population}}{\text{population}/100000} $$

$$= \frac{\text{\# TB cases in population}}{\text{population}} \times 100000$$

Let's try this for 2019:

In [8]:
tb_census_df["recompute incidence 2019"] = tb_census_df["TB cases 2019"]/tb_census_df["2019"]*100000
tb_census_df
Out[8]:
U.S. jurisdiction TB cases 2019 TB cases 2020 TB cases 2021 TB incidence 2019 TB incidence 2020 TB incidence 2021 2019 2020 2021 recompute incidence 2019
0 Alabama 87 72 92 1.77 1.43 1.83 4903185 5031362 5049846 1.77
1 Alaska 58 58 58 7.91 7.92 7.92 731545 732923 734182 7.93
2 Arizona 183 136 129 2.51 1.89 1.77 7278717 7179943 7264877 2.51
3 Arkansas 64 59 69 2.12 1.96 2.28 3017804 3014195 3028122 2.12
4 California 2111 1706 1750 5.35 4.32 4.46 39512223 39501653 39142991 5.34
... ... ... ... ... ... ... ... ... ... ... ...
46 Virginia 191 169 161 2.23 1.96 1.86 8535519 8636471 8657365 2.24
47 Washington 221 163 199 2.90 2.11 2.57 7614893 7724031 7740745 2.90
48 West Virginia 9 13 7 0.50 0.73 0.39 1792147 1791420 1785526 0.50
49 Wisconsin 51 35 66 0.88 0.59 1.12 5822434 5896271 5880101 0.88
50 Wyoming 1 0 3 0.17 0.00 0.52 578759 577605 579483 0.17

51 rows × 11 columns

Awesome!!!

Let's use a for-loop and Python format strings to compute TB incidence for all years. Python f-strings are used here just for the purposes of this demo, but they're handy to know when you explore data beyond this course (Python documentation).

In [9]:
# recompute incidence for all years
for year in [2019, 2020, 2021]:
    tb_census_df[f"recompute incidence {year}"] = tb_census_df[f"TB cases {year}"]/tb_census_df[f"{year}"]*100000
tb_census_df
Out[9]:
U.S. jurisdiction TB cases 2019 TB cases 2020 TB cases 2021 TB incidence 2019 TB incidence 2020 TB incidence 2021 2019 2020 2021 recompute incidence 2019 recompute incidence 2020 recompute incidence 2021
0 Alabama 87 72 92 1.77 1.43 1.83 4903185 5031362 5049846 1.77 1.43 1.82
1 Alaska 58 58 58 7.91 7.92 7.92 731545 732923 734182 7.93 7.91 7.90
2 Arizona 183 136 129 2.51 1.89 1.77 7278717 7179943 7264877 2.51 1.89 1.78
3 Arkansas 64 59 69 2.12 1.96 2.28 3017804 3014195 3028122 2.12 1.96 2.28
4 California 2111 1706 1750 5.35 4.32 4.46 39512223 39501653 39142991 5.34 4.32 4.47
... ... ... ... ... ... ... ... ... ... ... ... ... ...
46 Virginia 191 169 161 2.23 1.96 1.86 8535519 8636471 8657365 2.24 1.96 1.86
47 Washington 221 163 199 2.90 2.11 2.57 7614893 7724031 7740745 2.90 2.11 2.57
48 West Virginia 9 13 7 0.50 0.73 0.39 1792147 1791420 1785526 0.50 0.73 0.39
49 Wisconsin 51 35 66 0.88 0.59 1.12 5822434 5896271 5880101 0.88 0.59 1.12
50 Wyoming 1 0 3 0.17 0.00 0.52 578759 577605 579483 0.17 0.00 0.52

51 rows × 13 columns

These numbers look pretty close!!! There are a few discrepancies in the hundredths place, particularly in 2021. It may be useful to explore the reasons behind them further; we'll leave that to you!
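As a starting point, here is a small sketch (not part of the original demo) that quantifies how far the recomputed values are from the CDC-reported ones:

for year in [2019, 2020, 2021]:
    diff = (tb_census_df[f"recompute incidence {year}"]
            - tb_census_df[f"TB incidence {year}"]).abs()
    print(year, "largest absolute difference:", diff.max())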

I'll also leave reproducing the "9.4%" increase to you! Or, you can check the bonus section of Lecture 4's demo notebook to see how we did it.





Structure: Different File Formats¶

There are many file types for storing structured data: CSV, TSV, JSON, XML, ASCII, SAS...

  • Documentation will be your best friend to understand how to process many of these file types.
  • In lecture, we will cover TSV and JSON since pandas supports them out of the box.

TSV¶

TSV (Tab-Separated Values) files are very similar to CSVs, but values are delimited by tabs instead of commas.

Let's check out cdc_tuberculosis.tsv, which is the same data but now in a TSV.

  1. To the Jupyter view!

  2. To the Python view!

    Quick Python reminders:

    • Python's print() renders the string (so tabs and newlines are displayed rather than shown as escape characters) and then adds a newline of its own.
    • We use the repr() function to get the raw string representation, with special characters like \t and \n written out explicitly.
    • The enumerate(x) function pairs a running counter with the elements of x. (A short refresher follows below.)
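As a quick refresher (a toy example, not part of the TB data):

line = "Alabama\t87\n"
print(line)          # the tab and newline are rendered, plus print's own trailing newline
print(repr(line))    # shows 'Alabama\t87\n' with special characters written out explicitly
for i, value in enumerate(["a", "b", "c"]):
    print(i, value)  # enumerate pairs a counter (0, 1, 2, ...) with each element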
In [10]:
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for i, row in enumerate(f):
        print(repr(row)) # print raw strings
        if i >= 3: break
'\tNo. of TB cases\t\t\tTB incidence\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n'
'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'

A quick note: the above is a very explicit way to loop over the first 4 lines of the file by controlling a line counter. We can do the same with more concise code by letting Python read the lines in the file for us and grabbing the first four:

In [11]:
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for row in f.readlines()[:4]:
        print(repr(row)) # print raw strings    
'\tNo. of TB cases\t\t\tTB incidence\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n'
'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'

The only drawback is that we read the entire file when we only want the first few lines, which can be wasteful. Python's built-in zip function (docs here) is a useful thing to know about. The code below may look a little odd at first, but it does the same thing as the first example, only more concisely; once you get used to thinking in terms of zip, it becomes a very natural tool for expressing various iteration strategies:

In [12]:
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for _, row in zip(range(4), f):
        print(repr(row)) # print raw strings
'\tNo. of TB cases\t\t\tTB incidence\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n'
'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'

The pd.read_csv function also reads in TSVs if we specify the delimiter with parameter sep='\t' (documentation).

In [13]:
tuberculosis_df_tsv = pd.read_csv("data/cdc_tuberculosis.tsv", sep='\t')
tuberculosis_df_tsv.head()
Out[13]:
Unnamed: 0 No. of TB cases Unnamed: 2 Unnamed: 3 TB incidence Unnamed: 5 Unnamed: 6
0 U.S. jurisdiction 2019 2020 2021 2019.00 2020.00 2021.00
1 Total 8,900 7,173 7,860 2.71 2.16 2.37
2 Alabama 87 72 92 1.77 1.43 1.83
3 Alaska 58 58 58 7.91 7.92 7.92
4 Arizona 183 136 129 2.51 1.89 1.77

Side note: there was a question last time about how pandas differentiates a comma delimiter from a comma within a field itself, e.g., 8,900. Check out the documentation for the quotechar parameter.
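A minimal sketch of the idea (the two-row CSV string here is hypothetical, not taken from the dataset): fields wrapped in the quote character can contain the delimiter, and thousands=',' then parses the quoted number.

from io import StringIO

csv_text = 'U.S. jurisdiction,TB cases 2019\nTotal,"8,900"\n'
pd.read_csv(StringIO(csv_text), quotechar='"', thousands=',')
# The comma inside the quotes is part of the value, not a delimiter,
# so thousands=',' parses "8,900" as the integer 8900.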

JSON¶

The City of Berkeley Open Data website has a dataset with COVID-19 Confirmed Cases among Berkeley residents by date.

Let's first check out this website.

Next, let's download this file, saving it as a JSON (note the source URL file type).

In the interest of reproducible data science, we will download the data programmatically. We have defined some helper functions in the ds100_utils.py file, which we can then reuse in many different notebooks.

In [14]:
# just run this cell
from ds100_utils import fetch_and_cache

Occasionally, you will want to modify code that you have imported from a local Python library. To reimport those modifications you can either use Python's importlib library:

import ds100_utils
from importlib import reload
reload(ds100_utils)

or use the IPython autoreload magic, which automatically reimports modules when their files change:

%load_ext autoreload
%autoreload 2
In [15]:
covid_file = fetch_and_cache(
    "https://data.cityofberkeley.info/api/views/xn6j-b766/rows.json?accessType=DOWNLOAD",
    "confirmed-cases.json",
    force=False)
covid_file          # a file path wrapper object
Using cached version that was downloaded (UTC): Mon Aug 28 16:48:43 2023
Out[15]:
PosixPath('data/confirmed-cases.json')

File size¶

Often, I like to start my analysis by getting a rough estimate of the size of the data. This will help inform the tools I use and how I view the data. If it is relatively small, I might use a text editor or a spreadsheet to look at the data. If it is larger, I might jump to more programmatic exploration or even use distributed computing tools.

However, here we will use Python tools to probe the file.

Since these seem to be text files, I might also want to investigate the number of lines, which often corresponds to the number of records.

In [16]:
import os

print(covid_file, "is", os.path.getsize(covid_file) / 1e6, "MB")

with open(covid_file, "r") as f:
    print(covid_file, "is", sum(1 for l in f), "lines.")
data/confirmed-cases.json is 0.205583 MB
data/confirmed-cases.json is 1707 lines.

As part of your workflow, you should also learn some basic Unix commands, as these are often very handy (in fact, there's an entire book called "Data Science at the Command Line" that explores this idea in depth!).

In Jupyter/IPython, you can prefix lines with ! to execute arbitrary Unix commands, and within those lines, you can refer to Python variables and expressions with the syntax {expr}.

Here, we use the ls command to list files, using the -lh flags, which request "long format with information in human-readable form". We also use the wc command for "word count", but with the -l flag, which asks for line counts instead of words.

These two give us the same information as the code above, albeit in a slightly different form:

In [18]:
!ls -lh {covid_file}
!wc -l {covid_file}
-rw-r--r--  1 fperez  staff   201K Aug 28 16:48 data/confirmed-cases.json
    1706 data/confirmed-cases.json

File contents¶

Because we have a text file in a visual IDE like Jupyter/DataHub, I'm going to visually explore the data via the built-in file explorer.

  1. To the Jupyter view!

  2. To the Python view...?

In [18]:
with open(covid_file, "r") as f:
    for i, row in enumerate(f):
        print(repr(row)) # print raw strings
        if i >= 4: break
'{\n'
'  "meta" : {\n'
'    "view" : {\n'
'      "id" : "xn6j-b766",\n'
'      "name" : "COVID-19 Confirmed Cases",\n'

In the same vein, we can use the head Unix command (which is where Pandas' head method comes from!) to see the first few lines of the file:

In [19]:
!head -5 {covid_file}
{
  "meta" : {
    "view" : {
      "id" : "xn6j-b766",
      "name" : "COVID-19 Confirmed Cases",
  1. Back to the Python view.

    In order to load the JSON file into pandas, let's first do some EDA with Python's json package to understand the particular structure of this JSON file, so that we can decide what (if anything) to load into pandas.

EDA: Digging into JSON¶

Python has relatively good support for JSON data since it closely matches Python's internal object model. In the following cell we load the entire JSON data file into a Python dictionary using the json package.

In [19]:
import json

with open(covid_file, "rb") as f:
    covid_json = json.load(f)

The covid_json variable is now a dictionary encoding the data in the file:

In [20]:
type(covid_json)
Out[20]:
dict

Examine what keys are in the top level json object¶

We can list the keys to determine what data is stored in the object.

In [21]:
covid_json.keys()
Out[21]:
dict_keys(['meta', 'data'])

Observation: The JSON dictionary contains a meta key, which likely refers to metadata (data about the data). Metadata is often maintained alongside the data and can be a good source of additional information.


We can investigate the metadata further by examining its keys.

In [23]:
covid_json['meta'].keys()
Out[23]:
dict_keys(['view'])

The meta key maps to a dictionary with a single key, view. This likely holds metadata about a particular "view" of some underlying database. We will learn more about views when we study SQL later in the class.

In [24]:
covid_json['meta']['view'].keys()
Out[24]:
dict_keys(['id', 'name', 'assetType', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'clientContext', 'columns', 'grants', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])

Notice that this is a nested/recursive data structure. As we dig deeper we reveal more and more keys and the corresponding data:

covid_json
|-> meta
|   |-> view
|       |-> id
|       |-> name
|       |-> attribution
|       ...
|       |-> description
|       ...
|       |-> columns
|       ...
|-> data
    | ... (haven't explored yet)
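One way to see this nesting programmatically is with a small recursive helper (a sketch, not part of the original notebook):

def show_keys(obj, depth=0, max_depth=1):
    # Recursively print dictionary keys down to max_depth levels of nesting.
    if isinstance(obj, dict) and depth <= max_depth:
        for key in obj:
            print("    " * depth + "|-> " + str(key))
            show_keys(obj[key], depth + 1, max_depth)

show_keys(covid_json)   # prints meta (and its view sub-key) and data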

There is a key called description in the view sub-dictionary. This likely contains a description of the data:

In [25]:
print(covid_json['meta']['view']['description'])
Counts of confirmed COVID-19 cases among Berkeley residents by date. As of 6/21/22, this dataset will be updated weekly instead of daily. As of 11/14/22, this dataset only includes PCR cases.

Examining the Data Field for Records¶

We can look at a few entries in the data field. This is what we'll load into Pandas.

In [27]:
for i in range(3):
    print(f"{i:03} | {covid_json['data'][i]}")
000 | ['row-yeww~gvnm.g68w', '00000000-0000-0000-386C-533081785A3A', 0, 1687370769, None, 1687370769, None, '{ }', '2019-12-01T00:00:00', '0', '0']
001 | ['row-fsx2.86py_bzpc', '00000000-0000-0000-FF2C-E5BC42CD0EE9', 0, 1687370769, None, 1687370769, None, '{ }', '2019-12-02T00:00:00', '0', '0']
002 | ['row-ggah~5w7b~ce8n', '00000000-0000-0000-6EFB-E68400EBBD1E', 0, 1687370769, None, 1687370769, None, '{ }', '2019-12-03T00:00:00', '0', '0']

Observations:

  • These look like equal-length records, so maybe data is a table!
  • But what does each value in a record mean? Where can we find the column headers?

Back to the metadata.

Columns Metadata¶

Another potentially useful key in the metadata dictionary is columns. Its value is a list:

In [28]:
type(covid_json['meta']['view']['columns'])
Out[28]:
list

Let's go back to the file explorer.

Based on the contents of this key, what are reasonable names for each column in the data table?
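Since each entry in that list is itself a dictionary, a quick sketch (not in the original notebook) is to pull out the name recorded for each column; these are the headers we will reuse shortly:

[col['name'] for col in covid_json['meta']['view']['columns']]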

You can also get the view that Jupyter provides in the file explorer by using Python. This displays our JSON object as an interactive graphical object with a built-in search box:

In [29]:
from IPython.display import JSON
JSON(covid_json)
Out[29]:
<IPython.core.display.JSON object>

Summary of exploring the JSON file¶

  1. The above metadata tells us a lot about the columns in the data, including column names, potential data anomalies, and basic statistics.
  2. Because of its non-tabular structure, JSON makes it easier (than CSV) to create self-documenting data, meaning that information about the data is stored in the same file as the data.
  3. Self-documenting data can be helpful since the data maintains its own description, and that description is more likely to be updated as the data changes.

JSON with pandas¶

After our above EDA, let's finally go about loading the data (not the metadata) into a pandas dataframe.

In the following block of code we:

  1. Translate the JSON records into a dataframe:

    • fields: covid_json['meta']['view']['columns']
    • records: covid_json['data']
  2. (We could also remove columns that have no metadata description. That would be a bad idea in general, but the above analysis suggests such columns are unlikely to contain useful information; here we keep all of them.)

  3. Examine the tail of the table.

In [30]:
# Load the data from JSON and assign column titles
covid = pd.DataFrame(
    covid_json['data'],
    columns=[c['name'] for c in covid_json['meta']['view']['columns']])

covid.tail()
Out[30]:
sid id position created_at created_meta updated_at updated_meta meta Date New Cases Cumulative Cases
1292 row-zg3v~6bpx~tp8j 00000000-0000-0000-7710-AECCCEB67A5A 0 1687370769 None 1687370769 None { } 2023-06-15T00:00:00 2 23384
1293 row-wean-73e3.fixn 00000000-0000-0000-27C0-D21340C3F93F 0 1687370769 None 1687370769 None { } 2023-06-16T00:00:00 4 23388
1294 row-cvez.6nyd.3f6y 00000000-0000-0000-C1AE-C76310119F30 0 1687370769 None 1687370769 None { } 2023-06-17T00:00:00 2 23390
1295 row-hzcg-24ra_7ipd 00000000-0000-0000-D048-D04E71C55E3A 0 1687370769 None 1687370769 None { } 2023-06-18T00:00:00 0 23390
1296 row-8tdn_tuw8_hujn 00000000-0000-0000-3181-E8EECFBF2249 0 1687370769 None 1687370769 None { } 2023-06-19T00:00:00 0 23390



Temporality¶

Let's briefly look at how we can use pandas dt accessors to work with dates/times in a dataset.

We will use the dataset from Lab 3: the Berkeley PD Calls for Service dataset.

In [31]:
calls = pd.read_csv("data/Berkeley_PD_-_Calls_for_Service.csv")
calls.head()
Out[31]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 04/01/2021 12:00:00 AM 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 04/19/2021 12:00:00 AM 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 02/13/2021 12:00:00 AM 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 02/08/2021 12:00:00 AM 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA

Looks like there are three columns with dates/times: EVENTDT, EVENTTM, and InDbDate.

Most likely, EVENTDT is the date the event took place, EVENTTM is the time of day the event took place (in 24-hour format), and InDbDate is the date the call was recorded in the database.

If we check the data type of these columns, we will see they are stored as strings. We can convert them to datetime objects using the pandas to_datetime function.

In [32]:
calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"])
calls.head()
/var/folders/j1/n8kn9ftd7257n2rvkkzlj3mc0010dw/T/ipykernel_58311/874729699.py:1: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
  calls["EVENTDT"] = pd.to_datetime(calls["EVENTDT"])
Out[32]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
0 21014296 THEFT MISD. (UNDER $950) 2021-04-01 10:58 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
1 21014391 THEFT MISD. (UNDER $950) 2021-04-01 10:38 LARCENY 4 06/15/2021 12:00:00 AM Berkeley, CA\n(37.869058, -122.270455) NaN Berkeley CA
2 21090494 THEFT MISD. (UNDER $950) 2021-04-19 12:15 LARCENY 1 06/15/2021 12:00:00 AM 2100 BLOCK HASTE ST\nBerkeley, CA\n(37.864908,... 2100 BLOCK HASTE ST Berkeley CA
3 21090204 THEFT FELONY (OVER $950) 2021-02-13 17:00 LARCENY 6 06/15/2021 12:00:00 AM 2600 BLOCK WARRING ST\nBerkeley, CA\n(37.86393... 2600 BLOCK WARRING ST Berkeley CA
4 21090179 BURGLARY AUTO 2021-02-08 6:20 BURGLARY - VEHICLE 1 06/15/2021 12:00:00 AM 2700 BLOCK GARBER ST\nBerkeley, CA\n(37.86066,... 2700 BLOCK GARBER ST Berkeley CA
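As an aside, the warning above appears because pandas had to infer the date format element by element. A small sketch (not in the original demo), assuming the EVENTDT strings follow the "MM/DD/YYYY hh:mm:ss AM/PM" pattern visible in the raw data, passes the format explicitly:

calls_raw = pd.read_csv("data/Berkeley_PD_-_Calls_for_Service.csv")
calls_raw["EVENTDT"] = pd.to_datetime(calls_raw["EVENTDT"],
                                      format="%m/%d/%Y %I:%M:%S %p")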

Now we can use the dt accessor on this column.

We can get the month:

In [33]:
calls["EVENTDT"].dt.month
Out[33]:
0        4
1        4
2        4
3        2
4        2
        ..
2627    12
2628     2
2629     3
2630     4
2631     2
Name: EVENTDT, Length: 2632, dtype: int32

Which day of the week the date is on:

In [34]:
calls["EVENTDT"].dt.dayofweek
Out[34]:
0       3
1       3
2       0
3       5
4       0
       ..
2627    0
2628    2
2629    2
2630    5
2631    4
Name: EVENTDT, Length: 2632, dtype: int32

Check the minimum values to see if there are any suspicious-looking 1970s dates (the UNIX epoch, January 1, 1970, is a common default for malformed timestamps):

In [35]:
calls.sort_values("EVENTDT").head()
Out[35]:
CASENO OFFENSE EVENTDT EVENTTM CVLEGEND CVDOW InDbDate Block_Location BLKADDR City State
2513 20057398 BURGLARY COMMERCIAL 2020-12-17 16:05 BURGLARY - COMMERCIAL 4 06/15/2021 12:00:00 AM 600 BLOCK GILMAN ST\nBerkeley, CA\n(37.878405,... 600 BLOCK GILMAN ST Berkeley CA
624 20057207 ASSAULT/BATTERY MISD. 2020-12-17 16:50 ASSAULT 4 06/15/2021 12:00:00 AM 2100 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.871... 2100 BLOCK SHATTUCK AVE Berkeley CA
154 20092214 THEFT FROM AUTO 2020-12-17 18:30 LARCENY - FROM VEHICLE 4 06/15/2021 12:00:00 AM 800 BLOCK SHATTUCK AVE\nBerkeley, CA\n(37.8918... 800 BLOCK SHATTUCK AVE Berkeley CA
659 20057324 THEFT MISD. (UNDER $950) 2020-12-17 15:44 LARCENY 4 06/15/2021 12:00:00 AM 1800 BLOCK 4TH ST\nBerkeley, CA\n(37.869888, -... 1800 BLOCK 4TH ST Berkeley CA
993 20057573 BURGLARY RESIDENTIAL 2020-12-17 22:15 BURGLARY - RESIDENTIAL 4 06/15/2021 12:00:00 AM 1700 BLOCK STUART ST\nBerkeley, CA\n(37.857495... 1700 BLOCK STUART ST Berkeley CA

Doesn't look like it! We are good!

We can also do many other things with the .dt accessor, like switching time zones and converting times back to UNIX/POSIX timestamps, as sketched below. Check out the documentation on the .dt accessor and time series/date functionality.
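A minimal sketch of those two operations (not in the original demo), assuming the EVENTDT values should be interpreted as local Pacific time:

event_local = calls["EVENTDT"].dt.tz_localize("US/Pacific")   # attach a time zone
event_utc = event_local.dt.tz_convert("UTC")                  # same instants, viewed in UTC
# Seconds since the UNIX epoch:
unix_seconds = (event_utc - pd.Timestamp("1970-01-01", tz="UTC")) // pd.Timedelta("1s")
unix_seconds.head()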




Data Faithfulness: Mauna Loa CO2 data¶

CO2 concentrations have been monitored at Mauna Loa Observatory since 1958 (website link).

In [36]:
co2_file = "data/co2_mm_mlo.txt"

Let's do some EDA!!

How do we read the file into Pandas?¶

Let's instead check out this file with JupyterLab.

  • Note it's a .txt file.
  • Do we trust this file extension?
  • What structure is it?






Looking at the first few lines of the data, we spot some relevant characteristics:

  • The values are separated by white space, possibly tabs.
  • The values line up in fixed columns down the rows; for example, the month appears in the 7th and 8th character positions of each line.
  • The 71st and 72nd lines in the file contain column headings split over two lines.

We can use read_csv to read the data into a pandas DataFrame, providing several arguments to specify that the separator is whitespace, that there is no header (we will set our own column names), and that the first 72 rows of the file should be skipped.

In [37]:
co2 = pd.read_csv(
    co2_file, header = None, skiprows = 72,
    sep = r'\s+'       # delimiter for continuous whitespace (stay tuned for regex next lecture)
)
co2.head()
Out[37]:
0 1 2 3 4 5 6
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1

Congratulations! You've wrangled the data!


...But our columns aren't named. We need to do more EDA.

Exploring Variable Feature Types¶

The NOAA webpage might have some useful tidbits (in this case it doesn't). Let's go back to the raw data file to identify each feature.

We'll rerun pd.read_csv, but this time with some custom column names.

In [38]:
co2 = pd.read_csv(
    co2_file, header = None, skiprows = 72,
    sep = r'\s+',      # regex for continuous whitespace (next lecture)
    names = ['Yr', 'Mo', 'DecDate', 'Avg', 'Int', 'Trend', 'Days']
)
co2.head()
Out[38]:
Yr Mo DecDate Avg Int Trend Days
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1

Visualizing CO2¶

Scientific studies tend to have very clean data, right...? Let's jump right in and make a time series plot of CO2 monthly averages.

In [39]:
sns.lineplot(x='DecDate', y='Avg', data=co2);
[figure: line plot of monthly Avg CO2 vs. DecDate]

The code above uses the seaborn plotting library (abbreviated sns). We will cover it on Thursday; for now you don't need to worry about how it works!

Yikes! Plotting the data uncovered a problem. It looks like we have some missing values. What happened here?

In [40]:
co2.head()
Out[40]:
Yr Mo DecDate Avg Int Trend Days
0 1958 3 1958.21 315.71 315.71 314.62 -1
1 1958 4 1958.29 317.45 317.45 315.29 -1
2 1958 5 1958.38 317.50 317.50 314.71 -1
3 1958 6 1958.46 -99.99 317.10 314.85 -1
4 1958 7 1958.54 315.86 315.86 314.98 -1
In [41]:
co2.tail()
Out[41]:
Yr Mo DecDate Avg Int Trend Days
733 2019 4 2019.29 413.32 413.32 410.49 26
734 2019 5 2019.38 414.66 414.66 411.20 28
735 2019 6 2019.46 413.92 413.92 411.58 27
736 2019 7 2019.54 411.77 411.77 411.43 23
737 2019 8 2019.62 409.95 409.95 411.84 29

Some data have unusual values like -1 and -99.99.

Let's check the description at the top of the file again.

  1. -1 signifies a missing value for Days, the number of days the equipment was in operation that month.
  2. -99.99 denotes a missing monthly average (Avg).

How can we fix this? First, let's explore other aspects of our data. Understanding our data will help us decide what to do with the missing values.


Sanity Checks: Reasoning about the data¶

First, we consider the shape of the data. How many rows should we have?

  • If the data are in chronological order, we should have one record per month.
  • The data run from March 1958 to August 2019.
  • So we should have $ 12 \times (2019-1957) - 2 - 4 = 738 $ records: 62 calendar years, minus the two months before March 1958 and the four months after August 2019. (The sketch below double-checks this count.)
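A quick sanity check (a sketch, not in the original notebook): count the months in that range directly.

len(pd.period_range("1958-03", "2019-08", freq="M"))   # 738 monthly periods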
In [42]:
co2.shape
Out[42]:
(738, 7)

Nice!! The number of rows (i.e., records) matches our expectations.




Let's now check the quality of each feature.

Understanding Missing Value 1: Days¶

Days is a time field, so let's analyze the other time fields to see if they offer an explanation for the missing values in the days of operation.

Let's start with months Mo.

Are we missing any records? Each month should appear 61 or 62 times (March 1958 to August 2019).

In [43]:
co2["Mo"].value_counts().sort_index()
Out[43]:
Mo
1     61
2     61
3     62
4     62
5     62
6     62
7     62
8     62
9     61
10    61
11    61
12    61
Name: count, dtype: int64

As expected, Jan, Feb, Sep, Oct, Nov, and Dec have 61 occurrences each, and the rest have 62.



Next, let's explore Days itself, which is the number of days that month on which the measurement equipment worked.

In [45]:
sns.displot(co2['Days']);
plt.title("Distribution of days feature"); # suppresses unneeded plotting output
/Users/fperez/local/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
[figure: Distribution of days feature (histogram of Days)]

In terms of data quality, a handful of months have averages based on measurements taken on fewer than half the days. In addition, there are nearly 200 missing values, which is about 27% of the data!
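A quick tally backing up these numbers (a sketch, not in the original notebook; recall that -1 marks a missing Days value):

n_missing = (co2["Days"] == -1).sum()
print(f"{n_missing} missing Days values ({n_missing / len(co2):.0%} of the data)")
print((co2["Days"].between(1, 15)).sum(), "months measured on fewer than half the days")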



Finally, let's check the last time feature, year Yr.

Let's check to see if there is any connection between missingness and the year of the recording.

In [46]:
sns.scatterplot(x="Yr", y="Days", data=co2);
plt.title("Day field by Year"); # the ; suppresses output
[figure: Day field by Year (scatter plot of Days vs. Yr)]

Observations:

  • All of the missing data are in the early years of operation.
  • It appears there may have been problems with equipment in the mid to late 80s.

Potential Next Steps:

  • Confirm these explanations through documentation about the historical readings.
  • Maybe drop the earliest recordings? However, we would want to delay such action until after we have examined the time trends and assessed whether there are any potential problems.



Understanding Missing Value 2: Avg¶

Next, let's return to the -99.99 values in Avg to analyze the overall quality of the CO2 measurements.

In [47]:
# Histograms of average CO2 measurements
sns.displot(co2['Avg']);
/Users/fperez/local/conda/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
[figure: histogram of average CO2 measurements (Avg)]

The non-missing values fall in the 300-400 range, a typical range of atmospheric CO2 levels.

We also see that there are only a few missing Avg values (<1% of values). Let's examine all of them:

In [48]:
co2[co2["Avg"] < 0]
Out[48]:
Yr Mo DecDate Avg Int Trend Days
3 1958 6 1958.46 -99.99 317.10 314.85 -1
7 1958 10 1958.79 -99.99 312.66 315.61 -1
71 1964 2 1964.12 -99.99 320.07 319.61 -1
72 1964 3 1964.21 -99.99 320.73 319.55 -1
73 1964 4 1964.29 -99.99 321.77 319.48 -1
213 1975 12 1975.96 -99.99 330.59 331.60 0
313 1984 4 1984.29 -99.99 346.84 344.27 2

There doesn't seem to be a pattern to these values, other than that most of these records were also missing Days data.

Drop, NaN, or Impute Missing Avg Data?¶

How should we address the invalid Avg data?

A. Drop records

B. Set to NaN

C. Impute using some strategy

Remember we want to fix the following plot:

In [49]:
sns.lineplot(x='DecDate', y='Avg', data=co2)
plt.title("CO2 Average By Month");
[figure: CO2 Average By Month (line plot of Avg vs. DecDate)]

Since we are plotting Avg vs DecDate, we should just focus on dealing with missing values for Avg.

Let's consider a few options:

  1. Drop those records
  2. Replace -99.99 with NaN
  3. Substitute it with a likely value for the average CO2?

What do you think are the pros and cons of each possible action?




Let's examine each of these three options.

In [50]:
# 1. Drop missing values
co2_drop = co2[co2['Avg'] > 0]

# 2. Replace -99.99 with NaN
co2_NA = co2.replace(-99.99, np.nan)

We'll also use a third version of the data. First, we note that the dataset already comes with a substitute value for the -99.99.

From the file description:

The interpolated column includes average values from the preceding column (average) and interpolated values where data are missing. Interpolated values are computed in two steps...

The Int feature exactly matches Avg, except where Avg is -99.99, in which case a reasonable estimate is used instead. So the third version of our data will use the Int feature in place of Avg. (The sketch below verifies this.)
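A one-line check of that claim (a sketch, not in the original notebook):

valid = co2["Avg"] > 0
(co2.loc[valid, "Avg"] == co2.loc[valid, "Int"]).all()   # True if Int matches Avg wherever Avg is valid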

In [51]:
# 3. Use interpolated column which estimates missing Avg values
co2_impute = co2.copy()
co2_impute['Avg'] = co2['Int']



What's a reasonable estimate?

To answer this question, let's zoom in on a short time period, say the measurements in 1958 (where we know we have two missing values).

In [52]:
# plot the 1958 data under each of the three missing-value strategies

def line_and_points(data, ax, title):
    # assumes a single year of data, hence plotting against the month Mo
    ax.plot('Mo', 'Avg', data=data)
    ax.scatter('Mo', 'Avg', data=data)
    ax.set_xlim(2, 13)
    ax.set_title(title)
    ax.set_xticks(np.arange(3, 13))

def data_year(data, year):
    # select the records for the given year
    return data[data["Yr"] == year]
    
# uses matplotlib subplots
# you may see more next week; focus on output for now
fig, axes = plt.subplots(ncols = 3, figsize=(12, 4), sharey=True)

year = 1958
line_and_points(data_year(co2_drop, year), axes[0], title="1. Drop Missing")
line_and_points(data_year(co2_NA, year), axes[1], title="2. Missing Set to NaN")
line_and_points(data_year(co2_impute, year), axes[2], title="3. Missing Interpolated")

fig.suptitle(f"Monthly Averages for {year}")
plt.tight_layout()
[figure: Monthly Averages for 1958; three panels: 1. Drop Missing, 2. Missing Set to NaN, 3. Missing Interpolated]

In the big picture, since only 7 Avg values are missing (less than 1% of the 738 months), any of these approaches would work.

However, there is some appeal to option 3, imputing:

  • It preserves the seasonal trends in CO2.
  • We are plotting all months in our data as a connected line, so dropped or NaN months would distort the curve.



Let's replot our original figure with option 3:

In [53]:
sns.lineplot(x='DecDate', y='Avg', data=co2_impute)
plt.title("CO2 Average By Month, Imputed");
[figure: CO2 Average By Month, Imputed (line plot of Avg vs. DecDate using co2_impute)]

Looks pretty close to what we see on the NOAA website!

Presenting the data: A Discussion on Data Granularity¶

From the description:

  • Monthly measurements are averages of daily average measurements.
  • The NOAA GML website has datasets for daily/hourly measurements too.

The data you present depends on your research question.

How do CO2 levels vary by season?

  • You might want to keep average monthly data.

Are CO2 levels rising over the past 50+ years, consistent with global warming predictions?

  • You might be happier with a coarser granularity, such as yearly averages!
In [54]:
co2_year = co2_impute.groupby('Yr').mean()
sns.lineplot(x='Yr', y='Avg', data=co2_year)
plt.title("CO2 Average By Year");
[figure: CO2 Average By Year (line plot of yearly mean Avg vs. Yr)]

Indeed, we see a rise by nearly 100 ppm of CO2 since Mauna Loa began recording in 1958.