import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 9)
sns.set()
sns.set_context('talk')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
# This option stops scientific notation for pandas
pd.set_option('display.float_format', '{:.2f}'.format)
# Silence some spurious seaborn warnings
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
What can we say about the presence of Tuberculosis in the United States?
Let's look at the data included in the original CDC article, which reports TB cases and incidence for 2019–2021.
You could download the data directly from the web using Pandas:
# !pip install lxml
# tbls = pd.read_html("https://www.cdc.gov/mmwr/volumes/71/wr/mm7112a1.htm?s_cid=mm7112a1_w#T1_down")
# df = tbls[0] # First table on the website
# df
However, it is good practice to make a snapshot of the data for your analysis, so we will work from a local copy.
Someone already downloaded Table 1 and saved it as a CSV file located at data/cdc_tuberculosis.csv.
We can then explore the CSV (which is a text file, and does not contain binary-encoded data) in many ways: with a text editor, with a spreadsheet program, with the Python file object, or with pandas via pd.read_csv().
Let's start with the Python file object so we really solidify the idea of a CSV as rectangular data (i.e., tabular data) stored as comma-separated values.
with open("data/cdc_tuberculosis.csv", "r") as f:
    for i, row in enumerate(f):
        print(row)
        if i >= 3: break
,No. of TB cases,,,TB incidence,,

U.S. jurisdiction,2019,2020,2021,2019,2020,2021

Total,"8,900","7,173","7,860",2.71,2.16,2.37

Alabama,87,72,92,1.77,1.43,1.83
Whoa, why are there blank lines interspersed between the lines of the CSV?
You may recall that all line breaks in text files are encoded as the special newline character \n. Each row we read from the file already ends in \n, and Python's print() appends an additional newline on top of that.
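To see the effect in isolation, here is a minimal sketch with a standalone string (made up for illustration):
# a string that, like a line read from a file, already ends in '\n'
line = "Alabama,87,72,92\n"
print(line)           # the string's own newline plus print's default end='\n' -> a blank line follows
print(line, end="")   # suppressing print's extra newline removes the blank line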
If you're curious, we can use the repr()
function to return the raw string with all special characters:
with open("data/cdc_tuberculosis.csv", "r") as f:
    for i, row in enumerate(f):
        print(repr(row))  # print raw strings
        if i >= 3: break
',No. of TB cases,,,TB incidence,,\n'
'U.S. jurisdiction,2019,2020,2021,2019,2020,2021\n'
'Total,"8,900","7,173","7,860",2.71,2.16,2.37\n'
'Alabama,87,72,92,1.77,1.43,1.83\n'
Here is a shorter way to read the first few lines. It has a problem ...
with open("data/cdc_tuberculosis.csv", "r") as f:
    for row in f.readlines()[:4]:
        print(repr(row))  # print raw strings
',No. of TB cases,,,TB incidence,,\n'
'U.S. jurisdiction,2019,2020,2021,2019,2020,2021\n'
'Total,"8,900","7,173","7,860",2.71,2.16,2.37\n'
'Alabama,87,72,92,1.77,1.43,1.83\n'
The main drawback here is that f.readlines() reads the entire file when we only want the first few lines; that can be wasteful. The Python zip built-in function (docs here) is a useful thing to know about. This code may look a little odd at first, but it does the same as the first example above much more concisely, and once you get used to thinking about zip, it becomes a very natural tool for expressing various iteration strategies:
with open("data/cdc_tuberculosis.tsv", "r") as f:
for _, row in zip(range(4), f):
print(repr(row)) # print raw strings
'\tNo. of TB cases\t\t\tTB incidence\t\t\n' 'U.S. jurisdiction\t2019\t2020\t2021\t2019\t2020\t2021\n' 'Total\t"8,900"\t"7,173"\t"7,860"\t2.71\t2.16\t2.37\n' 'Alabama\t87\t72\t92\t1.77\t1.43\t1.83\n'
As data gets bigger, it will be important to read only the parts you need into the notebook.
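As a sketch of that idea using tools we can rely on: itertools.islice truncates any iterator without consuming the rest, and pd.read_csv has an nrows parameter that limits how many data rows are parsed.
from itertools import islice

with open("data/cdc_tuberculosis.csv", "r") as f:
    for row in islice(f, 4):  # stops after 4 lines; the rest of the file is never read
        print(repr(row))

# or let pandas do the limiting itself
pd.read_csv("data/cdc_tuberculosis.csv", nrows=4)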
tb_df = pd.read_csv("data/cdc_tuberculosis.csv")
tb_df
Unnamed: 0 | No. of TB cases | Unnamed: 2 | Unnamed: 3 | TB incidence | Unnamed: 5 | Unnamed: 6 | |
---|---|---|---|---|---|---|---|
0 | U.S. jurisdiction | 2019 | 2020 | 2021 | 2019.00 | 2020.00 | 2021.00 |
1 | Total | 8,900 | 7,173 | 7,860 | 2.71 | 2.16 | 2.37 |
2 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
3 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
4 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
... | ... | ... | ... | ... | ... | ... | ... |
48 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
49 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
50 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
51 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
52 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
53 rows × 7 columns
Wait, what's up with the "Unnamed" column names? And the first row, for that matter?
Congratulations -- you're ready to wrangle your data. Because of how things are stored, we'll need to clean the data a bit to name our columns better.
A reasonable first step is to identify the row with the right header. The pd.read_csv()
function (documentation) has the convenient header
parameter.
You could also try Cmd+i or Ctrl+i to get contextual help.
tb_df = pd.read_csv("data/cdc_tuberculosis.csv", header=1)  # the header is at row index 1
tb_df
U.S. jurisdiction | 2019 | 2020 | 2021 | 2019.1 | 2020.1 | 2021.1 | |
---|---|---|---|---|---|---|---|
0 | Total | 8,900 | 7,173 | 7,860 | 2.71 | 2.16 | 2.37 |
1 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
2 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
3 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
4 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... | ... |
47 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
48 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
49 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
50 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
51 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
52 rows × 7 columns
Wait...but now we can't differentiate between the "Number of TB cases" and "TB incidence" year columns. pandas has tried to make our lives easier by automatically adding ".1" to the duplicate column names, but this doesn't help us as humans understand the data.
We can rename the columns manually with df.rename() (documentation):
rename_dict = {'2019': 'TB cases 2019',
               '2020': 'TB cases 2020',
               '2021': 'TB cases 2021',
               '2019.1': 'TB incidence 2019',
               '2020.1': 'TB incidence 2020',
               '2021.1': 'TB incidence 2021'}
tb_df = tb_df.rename(columns=rename_dict)
tb_df
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
0 | Total | 8,900 | 7,173 | 7,860 | 2.71 | 2.16 | 2.37 |
1 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
2 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
3 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
4 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... | ... |
47 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
48 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
49 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
50 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
51 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
52 rows × 7 columns
Return to slides!
You might already be wondering: What's up with that first record?
Row 0 is what we call a rollup record, or summary record. It's often useful when displaying tables to humans. The granularity of record 0 (Totals) vs the rest of the records (States) is different.
tb_df.head()
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
0 | Total | 8,900 | 7,173 | 7,860 | 2.71 | 2.16 | 2.37 |
1 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
2 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
3 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
4 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
Okay, EDA step two. How was the rollup record aggregated?
Let's check if Total TB cases is the sum of all state TB cases. We can drop it and try to sum up all the remaining rows.
tb_df.drop(0)
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
1 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
2 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
3 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
4 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
5 | California | 2,111 | 1,706 | 1,750 | 5.35 | 4.32 | 4.46 |
... | ... | ... | ... | ... | ... | ... | ... |
47 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
48 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
49 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
50 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
51 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
51 rows × 7 columns
tb_df.drop(0).sum()
U.S. jurisdiction    AlabamaAlaskaArizonaArkansasCaliforniaColorado...
TB cases 2019        8758183642,11166671824558302997326108523766881...
TB cases 2020        7258136591,70652541719412221928216923937679917...
TB cases 2021        9258129691,75058544319499228106425512749435786...
TB incidence 2019    107.23
TB incidence 2020    90.93
TB incidence 2021    100.57
dtype: object
Whoa, what's going on? Check out the column types:
tb_df.dtypes
U.S. jurisdiction     object
TB cases 2019         object
TB cases 2020         object
TB cases 2021         object
TB incidence 2019    float64
TB incidence 2020    float64
TB incidence 2021    float64
dtype: object
Looks like those commas are causing all the TB cases columns to be read as the object datatype, or storage type (close to the Python string datatype), so pandas is concatenating strings instead of adding integers.
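Here is a minimal illustration of the failure mode, using standalone values rather than the DataFrame itself:
# summing strings concatenates them...
"8,900" + "7,173"   # -> '8,9007,173'
# ...whereas summing integers adds them
8900 + 7173         # -> 16073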
Fortunately read_csv
also has a thousands
parameter (for what it's worth, I didn't know this beforehand--I googled this):
# improve readability: chaining method calls with outer parentheses/line breaks
tb_df = (
    pd.read_csv("data/cdc_tuberculosis.csv", header=1, thousands=',')
    .rename(columns=rename_dict)
)
tb_df
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
0 | Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 |
1 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
2 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
3 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
4 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... | ... |
47 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
48 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
49 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
50 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
51 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
52 rows × 7 columns
tb_df.drop(0).sum()
U.S. jurisdiction    AlabamaAlaskaArizonaArkansasCaliforniaColorado...
TB cases 2019        8900
TB cases 2020        7173
TB cases 2021        7860
TB incidence 2019    107.23
TB incidence 2020    90.93
TB incidence 2021    100.57
dtype: object
tb_df.head(1)
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
0 | Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 |
The Total TB cases look right. Phew!
(We'll leave it to your own EDA to figure out how the TB incidence "Totals" were aggregated...you may want to check out the bonus section first, though.)
In order to compute incidence we are going to need more data: population information!!
Return to the lecture!
U.S. Census population estimates source (2019), source (2020-2021).
Running the cells below cleans the data. We encourage you to closely explore the CSV and study these lines after lecture...
There are a few new methods here:
- df.convert_dtypes() (documentation) conveniently converts all float dtypes into ints; it is out of scope for the class.
- df.dropna() (documentation) will be explained in more detail next time.
census_2010s_df = pd.read_csv("data/nst-est2019-01.csv", header=3, thousands=",")
census_2010s_df
Unnamed: 0 | Census | Estimates Base | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | United States | 308745538.00 | 308758105.00 | 309321666.00 | 311556874.00 | 313830990.00 | 315993715.00 | 318301008.00 | 320635163.00 | 322941311.00 | 324985539.00 | 326687501.00 | 328239523.00 |
1 | Northeast | 55317240.00 | 55318443.00 | 55380134.00 | 55604223.00 | 55775216.00 | 55901806.00 | 56006011.00 | 56034684.00 | 56042330.00 | 56059240.00 | 56046620.00 | 55982803.00 |
2 | Midwest | 66927001.00 | 66929725.00 | 66974416.00 | 67157800.00 | 67336743.00 | 67560379.00 | 67745167.00 | 67860583.00 | 67987540.00 | 68126781.00 | 68236628.00 | 68329004.00 |
3 | South | 114555744.00 | 114563030.00 | 114866680.00 | 116006522.00 | 117241208.00 | 118364400.00 | 119624037.00 | 120997341.00 | 122351760.00 | 123542189.00 | 124569433.00 | 125580448.00 |
4 | West | 71945553.00 | 71946907.00 | 72100436.00 | 72788329.00 | 73477823.00 | 74167130.00 | 74925793.00 | 75742555.00 | 76559681.00 | 77257329.00 | 77834820.00 | 78347268.00 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
58 | Note: The estimates are based on the 2010 Cens... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
59 | Suggested Citation: | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
60 | Table 1. Annual Estimates of the Resident Popu... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
61 | Source: U.S. Census Bureau, Population Division | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
62 | Release Date: December 2019 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
63 rows × 13 columns
Now let's do some basic data cleaning:
census_2010s_df = (
    census_2010s_df
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .drop(columns=["Census", "Estimates Base"])
    .convert_dtypes()  # "smart" converting of columns to int, use at your own risk
    .dropna()          # we'll introduce this very soon
)
census_2010s_df
Geographic Area | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | United States | 309321666 | 311556874 | 313830990 | 315993715 | 318301008 | 320635163 | 322941311 | 324985539 | 326687501 | 328239523 |
1 | Northeast | 55380134 | 55604223 | 55775216 | 55901806 | 56006011 | 56034684 | 56042330 | 56059240 | 56046620 | 55982803 |
2 | Midwest | 66974416 | 67157800 | 67336743 | 67560379 | 67745167 | 67860583 | 67987540 | 68126781 | 68236628 | 68329004 |
3 | South | 114866680 | 116006522 | 117241208 | 118364400 | 119624037 | 120997341 | 122351760 | 123542189 | 124569433 | 125580448 |
4 | West | 72100436 | 72788329 | 73477823 | 74167130 | 74925793 | 75742555 | 76559681 | 77257329 | 77834820 | 78347268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
52 | .Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 |
53 | .West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 |
54 | .Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 |
55 | .Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
57 | Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
57 rows × 11 columns
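If you'd like to see those two new methods in isolation first, here is a toy sketch; the demo frame below is made up purely for illustration:
demo = pd.DataFrame({"pop": [1.0, 2.0, None], "name": ["a", "b", None]})
demo.convert_dtypes()  # integral floats become the nullable Int64 dtype; missing values become <NA>
demo.dropna()          # drops every row that contains a missing value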
What is the granularity of each row in this table?
Notice there is a '.' at the beginning of all the state names. We need to remove it.
census_2010s_df['Geographic Area'] = census_2010s_df['Geographic Area'].str.strip('.')
census_2010s_df
Geographic Area | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | United States | 309321666 | 311556874 | 313830990 | 315993715 | 318301008 | 320635163 | 322941311 | 324985539 | 326687501 | 328239523 |
1 | Northeast | 55380134 | 55604223 | 55775216 | 55901806 | 56006011 | 56034684 | 56042330 | 56059240 | 56046620 | 55982803 |
2 | Midwest | 66974416 | 67157800 | 67336743 | 67560379 | 67745167 | 67860583 | 67987540 | 68126781 | 68236628 | 68329004 |
3 | South | 114866680 | 116006522 | 117241208 | 118364400 | 119624037 | 120997341 | 122351760 | 123542189 | 124569433 | 125580448 |
4 | West | 72100436 | 72788329 | 73477823 | 74167130 | 74925793 | 75742555 | 76559681 | 77257329 | 77834820 | 78347268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
52 | Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 |
53 | West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 |
54 | Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 |
55 | Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
57 | Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
57 rows × 11 columns
The 2020s data is in a separate file, so we will repeat the same data cleaning process.
# census 2020s data
census_2020s_df = pd.read_csv("data/NST-EST2022-POP.csv", header=3, thousands=",")
census_2020s_df = (
    census_2020s_df
    .drop(columns=["Unnamed: 1"])
    .rename(columns={"Unnamed: 0": "Geographic Area"})
    .convert_dtypes()
    .dropna()
)
census_2020s_df['Geographic Area'] = census_2020s_df['Geographic Area'].str.strip('.')
census_2020s_df
Geographic Area | 2020 | 2021 | 2022 | |
---|---|---|---|---|
0 | United States | 331511512 | 332031554 | 333287557 |
1 | Northeast | 57448898 | 57259257 | 57040406 |
2 | Midwest | 68961043 | 68836505 | 68787595 |
3 | South | 126450613 | 127346029 | 128716192 |
4 | West | 78650958 | 78589763 | 78743364 |
... | ... | ... | ... | ... |
52 | Washington | 7724031 | 7740745 | 7785786 |
53 | West Virginia | 1791420 | 1785526 | 1775156 |
54 | Wisconsin | 5896271 | 5880101 | 5892539 |
55 | Wyoming | 577605 | 579483 | 581381 |
57 | Puerto Rico | 3281557 | 3262693 | 3221789 |
57 rows × 4 columns
Return to Slides
Time to merge
! Here I use the DataFrame method df1.merge(right=df2, ...)
on DataFrame df1
(documentation). Contrast this with the function pd.merge(left=df1, right=df2, ...)
(documentation). Feel free to use either.
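To convince yourself the two spellings agree, here is a quick sketch using the DataFrames we already have:
m1 = tb_df.merge(right=census_2010s_df, left_on="U.S. jurisdiction", right_on="Geographic Area")
m2 = pd.merge(left=tb_df, right=census_2010s_df, left_on="U.S. jurisdiction", right_on="Geographic Area")
m1.equals(m2)  # True: the method form simply fixes tb_df as the left table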
# Show the three tables that we are going to join
display(tb_df.tail(2))
display(census_2010s_df.tail(2))
display(census_2020s_df.tail(2))
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|---|
50 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
51 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
Geographic Area | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|
55 | Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
57 | Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
Geographic Area | 2020 | 2021 | 2022 | |
---|---|---|---|---|
55 | Wyoming | 577605 | 579483 | 581381 |
57 | Puerto Rico | 3281557 | 3262693 | 3221789 |
# merge TB dataframe with two US census dataframes
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .merge(right=census_2020s_df,
           left_on="U.S. jurisdiction", right_on="Geographic Area")
)
tb_census_df.tail()
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | Geographic Area_x | 2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | Geographic Area_y | 2020 | 2021 | 2022 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
46 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | Virginia | 8023699 | 8101155 | 8185080 | 8252427 | 8310993 | 8361808 | 8410106 | 8463587 | 8501286 | 8535519 | Virginia | 8636471 | 8657365 | 8683619 |
47 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 | Washington | 7724031 | 7740745 | 7785786 |
48 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 | West Virginia | 1791420 | 1785526 | 1775156 |
49 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 | Wisconsin | 5896271 | 5880101 | 5892539 |
50 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 | Wyoming | 577605 | 579483 | 581381 |
This is a little unwieldy. We could either drop the unneeded columns now, or just merge on smaller census DataFrames. Let's do the latter.
# try merging again, but cleaner this time
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df[["Geographic Area", "2019"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
    .merge(right=census_2020s_df[["Geographic Area", "2020", "2021"]],
           left_on="U.S. jurisdiction", right_on="Geographic Area")
    .drop(columns="Geographic Area")
)
tb_census_df.tail()
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | |
---|---|---|---|---|---|---|---|---|---|---|
46 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 |
47 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 |
48 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 |
49 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 |
50 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 |
Let's recompute incidence to make sure we know where the original CDC numbers came from.
From the CDC report: TB incidence is computed as “Cases per 100,000 persons using mid-year population estimates from the U.S. Census Bureau.”
If we define a group as 100,000 people, then we can compute the TB incidence for a given state population as
$$\text{TB incidence} = \frac{\text{\# TB cases in population}}{\text{\# groups in population}} = \frac{\text{\# TB cases in population}}{\text{population}/100000} = \frac{\text{\# TB cases in population}}{\text{population}} \times 100000$$

Let's try this for 2019:
tb_census_df["recompute incidence 2019"] = (
tb_census_df["TB cases 2019"]/tb_census_df["2019"] * 100_000
)
tb_census_df
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | recompute incidence 2019 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 | 4903185 | 5031362 | 5049846 | 1.77 |
1 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 | 731545 | 732923 | 734182 | 7.93 |
2 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 | 7278717 | 7179943 | 7264877 | 2.51 |
3 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 | 3017804 | 3014195 | 3028122 | 2.12 |
4 | California | 2111 | 1706 | 1750 | 5.35 | 4.32 | 4.46 | 39512223 | 39501653 | 39142991 | 5.34 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
46 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 | 2.24 |
47 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 | 2.90 |
48 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 | 0.50 |
49 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 | 0.88 |
50 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 | 0.17 |
51 rows × 11 columns
Awesome!!!
Let's use a for-loop and Python format strings to compute TB incidence for all years. Python f-strings are just used for the purposes of this demo, but they're handy to know when you explore data beyond this course (Python documentation).
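As a one-line sketch of the syntax before we rely on it:
year = 2019
f"TB cases {year}"  # -> 'TB cases 2019': the {year} placeholder is filled in at runtime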
# recompute incidence for all years
for year in [2019, 2020, 2021]:
    tb_census_df[f"recompute incidence {year}"] = (
        tb_census_df[f"TB cases {year}"]/tb_census_df[f"{year}"]*100_000
    )
tb_census_df
U.S. jurisdiction | TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | recompute incidence 2019 | recompute incidence 2020 | recompute incidence 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 | 4903185 | 5031362 | 5049846 | 1.77 | 1.43 | 1.82 |
1 | Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 | 731545 | 732923 | 734182 | 7.93 | 7.91 | 7.90 |
2 | Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 | 7278717 | 7179943 | 7264877 | 2.51 | 1.89 | 1.78 |
3 | Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 | 3017804 | 3014195 | 3028122 | 2.12 | 1.96 | 2.28 |
4 | California | 2111 | 1706 | 1750 | 5.35 | 4.32 | 4.46 | 39512223 | 39501653 | 39142991 | 5.34 | 4.32 | 4.47 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
46 | Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 | 2.24 | 1.96 | 1.86 |
47 | Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 | 2.90 | 2.11 | 2.57 |
48 | West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 | 0.50 | 0.73 | 0.39 |
49 | Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 | 0.88 | 0.59 | 1.12 |
50 | Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 | 0.17 | 0.00 | 0.52 |
51 rows × 13 columns
These numbers look pretty close!!! There are a few discrepancies in the hundredths place, particularly in 2021. It may be useful to further explore the reasons behind them. We'll leave it to you!
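If you do take up that exploration, one possible starting point (a sketch, not part of the original analysis) is to quantify the largest reported-vs-recomputed gap per year:
# largest absolute gap between reported and recomputed incidence, per year
for year in [2019, 2020, 2021]:
    gap = (tb_census_df[f"TB incidence {year}"]
           - tb_census_df[f"recompute incidence {year}"]).abs()
    print(year, gap.max())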
tb_census_df.describe()
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | recompute incidence 2019 | recompute incidence 2020 | recompute incidence 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 | 51.00 |
mean | 174.51 | 140.65 | 154.12 | 2.10 | 1.78 | 1.97 | 6436069.08 | 6500225.73 | 6510422.63 | 2.10 | 1.78 | 1.97 |
std | 341.74 | 271.06 | 286.78 | 1.50 | 1.34 | 1.48 | 7360660.47 | 7408168.46 | 7394300.08 | 1.50 | 1.34 | 1.47 |
min | 1.00 | 0.00 | 2.00 | 0.17 | 0.00 | 0.21 | 578759.00 | 577605.00 | 579483.00 | 0.17 | 0.00 | 0.21 |
25% | 25.50 | 29.00 | 23.00 | 1.29 | 1.21 | 1.23 | 1789606.00 | 1820311.00 | 1844920.00 | 1.30 | 1.21 | 1.23 |
50% | 70.00 | 67.00 | 69.00 | 1.80 | 1.52 | 1.70 | 4467673.00 | 4507445.00 | 4506589.00 | 1.81 | 1.52 | 1.69 |
75% | 180.50 | 139.00 | 150.00 | 2.58 | 1.99 | 2.22 | 7446805.00 | 7451987.00 | 7502811.00 | 2.58 | 1.99 | 2.22 |
max | 2111.00 | 1706.00 | 1750.00 | 7.91 | 7.92 | 7.92 | 39512223.00 | 39501653.00 | 39142991.00 | 7.93 | 7.91 | 7.90 |
Return to slides!
We likely won't get to this part, so a tutorial is provided for your own curiosity.
How do we reproduce that reported statistic in the original CDC report?
Reported TB incidence (cases per 100,000 persons) increased 9.4%, from 2.2 during 2020 to 2.4 during 2021 but was lower than incidence during 2019 (2.7). Increases occurred among both U.S.-born and non–U.S.-born persons.
This is TB incidence computed across the entire U.S. population! How do we reproduce this? Our tb_census_df only has 51 entries (50 states plus Washington, D.C.); there is no rollup record. Let's get exploring!
Before we keep exploring, I'm going to set all indexes to more meaningful values, instead of just numbers that pertained to some row at some point. This will make our cleaning slightly easier.
tb_df = tb_df.set_index("U.S. jurisdiction")
tb_df
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|
U.S. jurisdiction | ||||||
Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 |
Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... |
Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 |
Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 |
West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 |
Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 |
Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 |
52 rows × 6 columns
census_2010s_df = census_2010s_df.set_index("Geographic Area")
census_2010s_df
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ||||||||||
United States | 309321666 | 311556874 | 313830990 | 315993715 | 318301008 | 320635163 | 322941311 | 324985539 | 326687501 | 328239523 |
Northeast | 55380134 | 55604223 | 55775216 | 55901806 | 56006011 | 56034684 | 56042330 | 56059240 | 56046620 | 55982803 |
Midwest | 66974416 | 67157800 | 67336743 | 67560379 | 67745167 | 67860583 | 67987540 | 68126781 | 68236628 | 68329004 |
South | 114866680 | 116006522 | 117241208 | 118364400 | 119624037 | 120997341 | 122351760 | 123542189 | 124569433 | 125580448 |
West | 72100436 | 72788329 | 73477823 | 74167130 | 74925793 | 75742555 | 76559681 | 77257329 | 77834820 | 78347268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 |
West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 |
Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 |
Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
57 rows × 10 columns
census_2020s_df = census_2020s_df.set_index("Geographic Area")
census_2020s_df
2020 | 2021 | 2022 | |
---|---|---|---|
Geographic Area | |||
United States | 331511512 | 332031554 | 333287557 |
Northeast | 57448898 | 57259257 | 57040406 |
Midwest | 68961043 | 68836505 | 68787595 |
South | 126450613 | 127346029 | 128716192 |
West | 78650958 | 78589763 | 78743364 |
... | ... | ... | ... |
Washington | 7724031 | 7740745 | 7785786 |
West Virginia | 1791420 | 1785526 | 1775156 |
Wisconsin | 5896271 | 5880101 | 5892539 |
Wyoming | 577605 | 579483 | 581381 |
Puerto Rico | 3281557 | 3262693 | 3221789 |
57 rows × 3 columns
It turns out that our merge above only kept the state records, even though our original tb_df had the "Total" rollup record:
tb_df.head()
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | |
---|---|---|---|---|---|---|
U.S. jurisdiction | ||||||
Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 |
Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 |
Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 |
Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 |
Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 |
Recall that merge does an inner merge by default, meaning that it only preserves keys that are present in both DataFrames.
The rollup records in our census DataFrames have different Geographic Area values, which is the key we merged on:
census_2010s_df
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ||||||||||
United States | 309321666 | 311556874 | 313830990 | 315993715 | 318301008 | 320635163 | 322941311 | 324985539 | 326687501 | 328239523 |
Northeast | 55380134 | 55604223 | 55775216 | 55901806 | 56006011 | 56034684 | 56042330 | 56059240 | 56046620 | 55982803 |
Midwest | 66974416 | 67157800 | 67336743 | 67560379 | 67745167 | 67860583 | 67987540 | 68126781 | 68236628 | 68329004 |
South | 114866680 | 116006522 | 117241208 | 118364400 | 119624037 | 120997341 | 122351760 | 123542189 | 124569433 | 125580448 |
West | 72100436 | 72788329 | 73477823 | 74167130 | 74925793 | 75742555 | 76559681 | 77257329 | 77834820 | 78347268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 |
West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 |
Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 |
Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
57 rows × 10 columns
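One way to see exactly which keys failed to match, as a sketch: redo the merge as an outer merge with indicator=True, which adds a _merge column recording where each row came from.
unmatched = tb_df.merge(census_2010s_df[["2019"]],
                        how="outer", left_index=True, right_index=True,
                        indicator=True)
unmatched[unmatched["_merge"] != "both"]  # "Total" only on the left; the regions, "United States", and Puerto Rico only on the right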
The census DataFrames have several rollup records. The aggregate record we are looking for actually has the Geographic Area value "United States".
One straightforward way to get the right merge is to rename the value itself. Because we now have the Geographic Area index, we'll use df.rename()
(documentation):
# rename the rollup record for the 2010s
census_2010s_df.rename(index={'United States':'Total'}, inplace=True)
census_2010s_df
2010 | 2011 | 2012 | 2013 | 2014 | 2015 | 2016 | 2017 | 2018 | 2019 | |
---|---|---|---|---|---|---|---|---|---|---|
Geographic Area | ||||||||||
Total | 309321666 | 311556874 | 313830990 | 315993715 | 318301008 | 320635163 | 322941311 | 324985539 | 326687501 | 328239523 |
Northeast | 55380134 | 55604223 | 55775216 | 55901806 | 56006011 | 56034684 | 56042330 | 56059240 | 56046620 | 55982803 |
Midwest | 66974416 | 67157800 | 67336743 | 67560379 | 67745167 | 67860583 | 67987540 | 68126781 | 68236628 | 68329004 |
South | 114866680 | 116006522 | 117241208 | 118364400 | 119624037 | 120997341 | 122351760 | 123542189 | 124569433 | 125580448 |
West | 72100436 | 72788329 | 73477823 | 74167130 | 74925793 | 75742555 | 76559681 | 77257329 | 77834820 | 78347268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Washington | 6742830 | 6826627 | 6897058 | 6963985 | 7054655 | 7163657 | 7294771 | 7423362 | 7523869 | 7614893 |
West Virginia | 1854239 | 1856301 | 1856872 | 1853914 | 1849489 | 1842050 | 1831023 | 1817004 | 1804291 | 1792147 |
Wisconsin | 5690475 | 5705288 | 5719960 | 5736754 | 5751525 | 5760940 | 5772628 | 5790186 | 5807406 | 5822434 |
Wyoming | 564487 | 567299 | 576305 | 582122 | 582531 | 585613 | 584215 | 578931 | 577601 | 578759 |
Puerto Rico | 3721525 | 3678732 | 3634488 | 3593077 | 3534874 | 3473232 | 3406672 | 3325286 | 3193354 | 3193694 |
57 rows × 10 columns
# same, but for the 2020s rollup record
census_2020s_df.rename(index={'United States':'Total'}, inplace=True)
census_2020s_df
2020 | 2021 | 2022 | |
---|---|---|---|
Geographic Area | |||
Total | 331511512 | 332031554 | 333287557 |
Northeast | 57448898 | 57259257 | 57040406 |
Midwest | 68961043 | 68836505 | 68787595 |
South | 126450613 | 127346029 | 128716192 |
West | 78650958 | 78589763 | 78743364 |
... | ... | ... | ... |
Washington | 7724031 | 7740745 | 7785786 |
West Virginia | 1791420 | 1785526 | 1775156 |
Wisconsin | 5896271 | 5880101 | 5892539 |
Wyoming | 577605 | 579483 | 581381 |
Puerto Rico | 3281557 | 3262693 | 3221789 |
57 rows × 3 columns
Next let's rerun our merge. Note the different chaining, because we are now merging on indexes (df.merge()
documentation).
tb_census_df = (
    tb_df
    .merge(right=census_2010s_df[["2019"]],
           left_index=True, right_index=True)
    .merge(right=census_2020s_df[["2020", "2021"]],
           left_index=True, right_index=True)
)
tb_census_df
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | |
---|---|---|---|---|---|---|---|---|---|
Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 | 328239523 | 331511512 | 332031554 |
Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 | 4903185 | 5031362 | 5049846 |
Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 | 731545 | 732923 | 734182 |
Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 | 7278717 | 7179943 | 7264877 |
Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 | 3017804 | 3014195 | 3028122 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 |
Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 |
West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 |
Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 |
Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 |
52 rows × 9 columns
Finally, let's recompute our incidences:
# recompute incidence for all years
for year in [2019, 2020, 2021]:
    tb_census_df[f"recompute incidence {year}"] = tb_census_df[f"TB cases {year}"]/tb_census_df[f"{year}"]*100000
tb_census_df
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | recompute incidence 2019 | recompute incidence 2020 | recompute incidence 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 | 328239523 | 331511512 | 332031554 | 2.71 | 2.16 | 2.37 |
Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 | 4903185 | 5031362 | 5049846 | 1.77 | 1.43 | 1.82 |
Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 | 731545 | 732923 | 734182 | 7.93 | 7.91 | 7.90 |
Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 | 7278717 | 7179943 | 7264877 | 2.51 | 1.89 | 1.78 |
Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 | 3017804 | 3014195 | 3028122 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 | 2.24 | 1.96 | 1.86 |
Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 | 2.90 | 2.11 | 2.57 |
West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 | 0.50 | 0.73 | 0.39 |
Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 | 0.88 | 0.59 | 1.12 |
Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 | 0.17 | 0.00 | 0.52 |
52 rows × 12 columns
We reproduced the total U.S. incidences correctly!
We're almost there. Let's revisit the quote:
Reported TB incidence (cases per 100,000 persons) increased 9.4%, from 2.2 during 2020 to 2.4 during 2021 but was lower than incidence during 2019 (2.7). Increases occurred among both U.S.-born and non–U.S.-born persons.
Recall that the percent change from $A$ to $B$ is computed as $$\text{percent change} = \frac{B - A}{A} \times 100.$$
tb_census_df
TB cases 2019 | TB cases 2020 | TB cases 2021 | TB incidence 2019 | TB incidence 2020 | TB incidence 2021 | 2019 | 2020 | 2021 | recompute incidence 2019 | recompute incidence 2020 | recompute incidence 2021 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
Total | 8900 | 7173 | 7860 | 2.71 | 2.16 | 2.37 | 328239523 | 331511512 | 332031554 | 2.71 | 2.16 | 2.37 |
Alabama | 87 | 72 | 92 | 1.77 | 1.43 | 1.83 | 4903185 | 5031362 | 5049846 | 1.77 | 1.43 | 1.82 |
Alaska | 58 | 58 | 58 | 7.91 | 7.92 | 7.92 | 731545 | 732923 | 734182 | 7.93 | 7.91 | 7.90 |
Arizona | 183 | 136 | 129 | 2.51 | 1.89 | 1.77 | 7278717 | 7179943 | 7264877 | 2.51 | 1.89 | 1.78 |
Arkansas | 64 | 59 | 69 | 2.12 | 1.96 | 2.28 | 3017804 | 3014195 | 3028122 | 2.12 | 1.96 | 2.28 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Virginia | 191 | 169 | 161 | 2.23 | 1.96 | 1.86 | 8535519 | 8636471 | 8657365 | 2.24 | 1.96 | 1.86 |
Washington | 221 | 163 | 199 | 2.90 | 2.11 | 2.57 | 7614893 | 7724031 | 7740745 | 2.90 | 2.11 | 2.57 |
West Virginia | 9 | 13 | 7 | 0.50 | 0.73 | 0.39 | 1792147 | 1791420 | 1785526 | 0.50 | 0.73 | 0.39 |
Wisconsin | 51 | 35 | 66 | 0.88 | 0.59 | 1.12 | 5822434 | 5896271 | 5880101 | 0.88 | 0.59 | 1.12 |
Wyoming | 1 | 0 | 3 | 0.17 | 0.00 | 0.52 | 578759 | 577605 | 579483 | 0.17 | 0.00 | 0.52 |
52 rows × 12 columns
incidence_2020 = tb_census_df.loc['Total', 'recompute incidence 2020']
incidence_2020
2.1637257652759883
incidence_2021 = tb_census_df.loc['Total', 'recompute incidence 2021']
incidence_2021
2.3672448914298068
difference = (incidence_2021 - incidence_2020)/incidence_2020 * 100
difference
9.405957511804143
We did it!!!