import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
# Default size for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (12, 9)

# Seaborn defaults with the larger "talk" context.
sns.set()
sns.set_context('talk')

# NumPy printing: two decimals, no scientific notation, and arrays with
# more than 20 elements are summarized.
np.set_printoptions(precision=2, suppress=True, threshold=20)

# pandas display settings; the float formatter stops scientific notation.
for option_name, option_value in [
    ('display.max_rows', 30),
    ('display.max_columns', None),
    ('display.precision', 2),
    ('display.float_format', '{:.2f}'.format),
]:
    pd.set_option(option_name, option_value)


# Peek at the first four raw lines of the TSV file.
with open("data/cdc_tuberculosis.tsv", "r") as f:
    for line_number, raw_line in enumerate(f, start=1):
        print(repr(raw_line))  # repr() makes tabs and newlines visible
        if line_number > 3:
            break

'\tNo. of TB cases\t\t\tTB incidence\t\t\n'
'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n'
'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n'
'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'


tuberculosis_df_tsv = pd.read_csv("data/cdc_tuberculosis.tsv", sep='\t')
tuberculosis_df_tsv.head()


# just run this cell
from importlib import reload

# Import the module object itself so it can be reloaded; the original cell
# called reload(utils), but no name `utils` was ever bound (NameError).
import ds100_utils
reload(ds100_utils)

# Bind the helper after the reload so it refers to the freshly loaded code.
from ds100_utils import fetch_and_cache


covid_file = fetch_and_cache(
    "https://data.cityofberkeley.info/api/views/xn6j-b766/rows.json?accessType=DOWNLOAD",
    "confirmed-cases.json",
    force=False)
covid_file          # a file path wrapper object

Using cached version that was downloaded (UTC): Tue Jan 31 14:33:04 2023

PosixPath('data/confirmed-cases.json')


import os

# Report the size on disk in megabytes (bytes / 1e6).
size_mb = os.path.getsize(covid_file) / 1e6
print(covid_file, "is", size_mb, "MB")

# Count lines by streaming through the file once.
with open(covid_file, "r") as f:
    line_count = sum(1 for _ in f)
    print(covid_file, "is", line_count, "lines.")

data/confirmed-cases.json is 0.183341 MB
data/confirmed-cases.json is 1559 lines.


# Peek at the first six raw lines of the JSON file.
with open(covid_file, "r") as f:
    for line_number, raw_line in enumerate(f, start=1):
        print(repr(raw_line))  # repr() makes whitespace and newlines visible
        if line_number > 5:
            break

'{\n'
'  "meta" : {\n'
'    "view" : {\n'
'      "id" : "xn6j-b766",\n'
'      "name" : "COVID-19 Confirmed Cases",\n'
'      "assetType" : "dataset",\n'


import json

with open(covid_file, "rb") as f:
    covid_json = json.load(f)


type(covid_json)

dict


covid_json.keys()

dict_keys(['meta', 'data'])


covid_json['meta'].keys()

dict_keys(['view'])


covid_json['meta']['view'].keys()

dict_keys(['id', 'name', 'assetType', 'attribution', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'clientContext', 'columns', 'grants', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'tags', 'flags'])


print(covid_json['meta']['view']['description'])

Counts of confirmed COVID-19 cases among Berkeley residents by date. As of 6/21/22, this dataset will be updated weekly instead of daily. As of 11/14/22, this dataset only includes PCR cases.


for i in range(3):
    print(f"{i:03} | {covid_json['data'][i]}")

000 | ['row-2yyp~r8a3~phgq', '00000000-0000-0000-F944-963C2BD56F87', 0, 1674521616, None, 1674521616, None, '{ }', '2019-12-01T00:00:00', '0', '0']
001 | ['row-svsf_gzh2~cz9t', '00000000-0000-0000-BA8E-0D8297E66451', 0, 1674521616, None, 1674521616, None, '{ }', '2019-12-02T00:00:00', '0', '0']
002 | ['row-w244_ivf6-rdcu', '00000000-0000-0000-A608-DB4DF9DB1B16', 0, 1674521616, None, 1674521616, None, '{ }', '2019-12-03T00:00:00', '0', '0']


type(covid_json['meta']['view']['columns'])

list


# Build the table from the JSON records, taking each column title from the
# 'name' field of the entries under meta -> view -> columns.
column_names = [
    column['name'] for column in covid_json['meta']['view']['columns']
]
covid = pd.DataFrame(covid_json['data'], columns=column_names)

covid.tail()


co2_file = "data/co2_mm_mlo.txt"


# Skip the first 72 lines of the file, treat no row as a header, and split
# columns on runs of whitespace (the separator is a regular expression;
# stay tuned for regex next lecture).
co2 = pd.read_csv(co2_file, sep=r'\s+', skiprows=72, header=None)
co2.head()


co2.head()


# Re-read the file, this time supplying column names. The first 72 lines are
# skipped and no row is treated as a header.
co2 = pd.read_csv(
    co2_file, header = None, skiprows = 72,
    # Raw string: '\s' in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    sep = r'\s+', #regex for continuous whitespace (next lecture)
    names = ['Yr', 'Mo', 'DecDate', 'Avg', 'Int', 'Trend', 'Days']
)
co2.head()


sns.lineplot(x='DecDate', y='Avg', data=co2);


co2.head()


co2.tail()


co2.shape

(738, 7)


co2["Mo"].value_counts().sort_index()

1     61
2     61
3     62
4     62
5     62
6     62
7     62
8     62
9     61
10    61
11    61
12    61
Name: Mo, dtype: int64


sns.displot(co2['Days']);
plt.title("Distribution of days feature")
plt.show() # suppresses unneeded plotting output


sns.scatterplot(x="Yr", y="Days", data=co2);
plt.title("Day field by Year"); # the ; suppresses output


# Histograms of average CO2 measurements
sns.displot(co2['Avg']);


co2[co2["Avg"] < 0]


sns.lineplot(x='DecDate', y='Avg', data=co2)
plt.title("CO2 Average By Month");


# Three candidate treatments of the -99.99 placeholder found in `Avg`.

# 1. Drop missing values: keep only rows with a positive average
co2_drop = co2[co2['Avg'] > 0]

# 2. Replace the -99.99 placeholder with NaN
#    (np.nan is used because the np.NaN alias was removed in NumPy 2.0;
#    note that replace() acts on every column, not only `Avg`)
co2_NA = co2.replace(-99.99, np.nan)


# 3. Use interpolated column which estimates missing Avg values
co2_impute = co2.copy()
co2_impute['Avg'] = co2['Int']


# results of plotting data in 1958

def line_and_points(data, ax, title):
    """Draw ``Avg`` against ``Mo`` on ``ax`` as a line plus point markers.

    Assumes ``data`` covers a single year, which is why the month column
    ``Mo`` serves as the x-axis. The x-axis is fixed to the range 2-13 with
    ticks at 3 through 12, and ``title`` becomes the axes title.
    """
    # Same columns drawn twice: first the connecting line, then the markers.
    for draw in (ax.plot, ax.scatter):
        draw('Mo', 'Avg', data=data)

    month_ticks = np.arange(3, 13)
    ax.set_xlim(2, 13)
    ax.set_title(title)
    ax.set_xticks(month_ticks)

def data_year(data, year):
    """Return the rows of ``data`` whose ``Yr`` value equals ``year``.

    The previous version ignored ``year`` and always filtered on 1958.
    """
    return data[data["Yr"] == year]
    
# uses matplotlib subplots
# you may see more next week; focus on output for now
fig, axes = plt.subplots(ncols = 3, figsize=(12, 4), sharey=True)

year = 1958
line_and_points(data_year(co2_drop, year), axes[0], title="1. Drop Missing")
line_and_points(data_year(co2_NA, year), axes[1], title="2. Missing Set to NaN")
line_and_points(data_year(co2_impute, year), axes[2], title="3. Missing Interpolated")

fig.suptitle(f"Monthly Averages for {year}")
plt.tight_layout()


sns.lineplot(x='DecDate', y='Avg', data=co2_impute)
plt.title("CO2 Average By Month, Imputed");


co2_year = co2_impute.groupby('Yr').mean()
sns.lineplot(x='Yr', y='Avg', data=co2_year)
plt.title("CO2 Average By Year");

	sid	id	created_at	created_meta	updated_at	updated_meta	meta	Date	New Cases	Cumulative Cases
1144	row-vvwh-m4y7.mr6x	00000000-0000-0000-8DE9-36E51F431D12	1674521616	None	1674521616	None	{ }	2023-01-18T00:00:00	6	22182
1145	row-zex5.4t7g-ehhz	00000000-0000-0000-7A55-165F57F862F6	1674521616	None	1674521616	None	{ }	2023-01-19T00:00:00	0	22182
1146	row-gh6h_y9cb_6knb	00000000-0000-0000-E039-77F6B2766B82	1674521616	None	1674521616	None	{ }	2023-01-20T00:00:00	0	22182
1147	row-cb4r~rtkn.xg9x	00000000-0000-0000-CCC4-D0C4B5D12422	1674521616	None	1674521616	None	{ }	2023-01-21T00:00:00	0	22182
1148	row-4en9-p4vi.fq5m	00000000-0000-0000-7F40-89071F462EE8	1674521616	None	1674521616	None	{ }	2023-01-22T00:00:00	0	22182

	0	1	2	3	4	5	6
0	1958	3	1958.21	315.71	315.71	314.62	-1
1	1958	4	1958.29	317.45	317.45	315.29	-1
2	1958	5	1958.38	317.50	317.50	314.71	-1
3	1958	6	1958.46	-99.99	317.10	314.85	-1
4	1958	7	1958.54	315.86	315.86	314.98	-1

	0	1	2	3	4	5	6
0	1958	3	1958.21	315.71	315.71	314.62	-1
1	1958	4	1958.29	317.45	317.45	315.29	-1
2	1958	5	1958.38	317.50	317.50	314.71	-1
3	1958	6	1958.46	-99.99	317.10	314.85	-1
4	1958	7	1958.54	315.86	315.86	314.98	-1

	Yr	Mo	DecDate	Avg	Int	Trend	Days
0	1958	3	1958.21	315.71	315.71	314.62	-1
1	1958	4	1958.29	317.45	317.45	315.29	-1
2	1958	5	1958.38	317.50	317.50	314.71	-1
3	1958	6	1958.46	-99.99	317.10	314.85	-1
4	1958	7	1958.54	315.86	315.86	314.98	-1

	Yr	Mo	DecDate	Avg	Int	Trend	Days
0	1958	3	1958.21	315.71	315.71	314.62	-1
1	1958	4	1958.29	317.45	317.45	315.29	-1
2	1958	5	1958.38	317.50	317.50	314.71	-1
3	1958	6	1958.46	-99.99	317.10	314.85	-1
4	1958	7	1958.54	315.86	315.86	314.98	-1

Lecture 5 – Data 100, Spring 2023¶

Different File Types¶

TSV¶

JSON¶

Reproducible Data Science¶

File size¶

File contents¶

EDA: Digging into JSON¶

Examine what keys are in the top level json object¶

Examining the Data Field for Records¶

Columns Metadata¶

Summary of exploring the JSON file¶

3. Finally, read data into a pandas DataFrame¶

Data Faithfulness: Mauna Loa CO2 data¶

How do we read the file into Pandas?¶

¶

Real World Example: Wrangling CO2 Measurements¶

Exploring Variable Feature Types¶

Let's start exploring!!¶

Quality Checks: Reasoning about the data¶

Understanding Missing Value 1: `Days`¶

Understanding Missing Value 2: `Avg`¶

Drop, NaN, or Impute Missing `Avg` Data?¶

Presenting the data: A Discussion on Data Granularity¶

	Unnamed: 0	No. of TB cases	Unnamed: 2	Unnamed: 3	TB incidence	Unnamed: 5	Unnamed: 6
0	U.S. jurisdiction	2019	2020	2021	2019.00	2020.00	2021.00
1	Total	8,900	7,173	7,860	2.71	2.16	2.37
2	Alabama	87	72	92	1.77	1.43	1.83
3	Alaska	58	58	58	7.91	7.92	7.92
4	Arizona	183	136	129	2.51	1.89	1.77

	Yr	Mo	DecDate	Avg	Int	Trend	Days
733	2019	4	2019.29	413.32	413.32	410.49	26
734	2019	5	2019.38	414.66	414.66	411.20	28
735	2019	6	2019.46	413.92	413.92	411.58	27
736	2019	7	2019.54	411.77	411.77	411.43	23
737	2019	8	2019.62	409.95	409.95	411.84	29

	Yr	Mo	DecDate	Avg	Int	Trend	Days
3	1958	6	1958.46	-99.99	317.10	314.85	-1
7	1958	10	1958.79	-99.99	312.66	315.61	-1
71	1964	2	1964.12	-99.99	320.07	319.61	-1
72	1964	3	1964.21	-99.99	320.73	319.55	-1
73	1964	4	1964.29	-99.99	321.77	319.48	-1
213	1975	12	1975.96	-99.99	330.59	331.60	0
313	1984	4	1984.29	-99.99	346.84	344.27	2

Lecture 5 – Data 100, Spring 2023¶

Different File Types¶

TSV¶

JSON¶

Reproducible Data Science¶

File size¶

File contents¶

EDA: Digging into JSON¶

Examine what keys are in the top level json object¶

Examining the Data Field for Records¶

Columns Metadata¶

Summary of exploring the JSON file¶

3. Finally, read data into a pandas DataFrame¶

Data Faithfulness: Mauna Loa CO2 data¶

How do we read the file into Pandas?¶

¶

Real World Example: Wrangling CO2 Measurements¶

Exploring Variable Feature Types¶

Let's start exploring!!¶

Quality Checks: Reasoning about the data¶

Understanding Missing Value 1: Days¶

Understanding Missing Value 2: Avg¶

Drop, NaN, or Impute Missing Avg Data?¶

Presenting the data: A Discussion on Data Granularity¶

Understanding Missing Value 1: `Days`¶

Understanding Missing Value 2: `Avg`¶

Drop, NaN, or Impute Missing `Avg` Data?¶