import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate",
                       'Gross national income per capita, Atlas method: $: 2016':'gni'})
wb.head()


# The `edgecolor` argument controls the color of the bin edges
plt.hist(wb['gni'], density=True, edgecolor="white")

# Add labels
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("Distribution of gross national income per capita");


sns.histplot(data=wb, x="gni", stat="density")
plt.title("Distribution of gross national income per capita");


# With 5 bins, it seems that there is only one peak
sns.histplot(data=wb, x="HIV rate", stat="density", bins=5)
plt.title("5 histogram bins");


# With 10 bins, there seem to be two peaks
sns.histplot(data=wb, x='HIV rate', stat="density", bins=10)
plt.title("10 histogram bins");


# And with 20 bins, it becomes hard to say what counts as a "peak"!
sns.histplot(data=wb, x='HIV rate', stat="density", bins=20)
plt.title("20 histogram bins");


# Create a new variable to store the hemisphere in which each country is located
north = ["Asia", "Europe", "N. America"]
south = ["Africa", "Oceania", "S. America"]
wb.loc[wb["Continent"].isin(north), "Hemisphere"] = "Northern"
wb.loc[wb["Continent"].isin(south), "Hemisphere"] = "Southern"


sns.histplot(data=wb, x="gni", 
             hue="Hemisphere", stat="density")
plt.title("Distribution of gross national income per capita");


points = [2.2, 2.8, 3.7, 5.3, 5.7]


plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);


def gaussian(x, z, a):
    # Gaussian kernel
    return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))

def boxcar_basic(x, z, a):
    # Boxcar kernel
    if np.abs(x - z) <= a/2:
        return 1/a
    return 0

def boxcar(x, z, a):
    # Boxcar kernel
    cond = np.abs(x - z)
    return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )


def create_kde(kernel, pts, a):
    # Takes in a kernel, set of points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f

def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);
    
def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        y = kernel(x, pt, a)
        if norm:
            y /= len(pts)
        plt.plot(x, y)
    
    plt.show();


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height=0.5);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)


sns.kdeplot(points, bw_method=0.65)  # magic value!
sns.histplot(points, stat='density', bins=2);


sns.histplot(points, bins=2, kde=True, stat='density', 
             kde_kws=dict(cut=4, bw_method=0.65));


sns.kdeplot(points, bw_adjust=1)
sns.histplot(points, stat='density');


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)


vals = wb['HIV rate'].dropna()
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.25)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.05)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.04)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.04)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)


sns.displot(data=wb, 
            x="gni", 
            kind="hist", 
            stat="density") # default: stat=count and density integrates to 1
plt.title("Distribution of gross national income per capita");


sns.displot(data=wb, 
            x="gni", 
            kind='kde')
plt.title("Distribution of gross national income per capita");


sns.displot(data=wb, 
            x="gni", 
            kind='ecdf')
plt.title("Cumulative Distribution of gross national income per capita");


plt.scatter(wb['per capita: % growth: 2016'], wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'])
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");


sns.scatterplot(data=wb, x='per capita: % growth: 2016', \
                y='Adult literacy rate: Female: % ages 15 and older: 2005-14', hue="Continent")
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");


random_x_noise = np.random.uniform(-1, 1, len(wb))
random_y_noise = np.random.uniform(-5, 5, len(wb))

plt.scatter(wb['per capita: % growth: 2016']+random_x_noise, \
            wb['Adult literacy rate: Female: % ages 15 and older: 2005-14']+random_y_noise, s=15)

plt.xlabel("% growth per capita (jittered)")
plt.ylabel("Female adult literacy rate (jittered)");


sns.lmplot(data=wb, x='per capita: % growth: 2016', \
           y='Adult literacy rate: Female: % ages 15 and older: 2005-14');


sns.jointplot(data=wb, x='per capita: % growth: 2016', \
              y='Adult literacy rate: Female: % ages 15 and older: 2005-14');


sns.jointplot(data=wb, x='per capita: % growth: 2016', \
              y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
              kind='hex');


sns.kdeplot(data=wb, x='per capita: % growth: 2016', \
              y='Adult literacy rate: Female: % ages 15 and older: 2005-14', fill=True);


sns.jointplot(data=wb, x='per capita: % growth: 2016', \
              y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
              kind='kde');


# Some data cleaning to help with the next example

df = pd.DataFrame(index=wb.index)
df['lit'] = wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'] \
            + wb["Adult literacy rate: Male: % ages 15 and older: 2005-14"]
df['inc'] = wb['gni']
df.dropna(inplace=True)

plt.scatter(df["inc"], df["lit"])
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate");


# np.log compute the natural (base e) logarithm
plt.scatter(np.log(df["inc"]), df["lit"])
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate");


plt.scatter(np.log(df["inc"]), df["lit"]**4)
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)");


# The code below fits a linear regression model. We'll discuss it at length in a future lecture
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(np.log(df[["inc"]]), df["lit"]**4)
m, b = model.coef_[0], model.intercept_

print(f"The slope, m, of the transformed data is: {m}")
print(f"The intercept, b, of the transformed data is: {b}")

df = df.sort_values("inc")
plt.scatter(np.log(df["inc"]), df["lit"]**4, label="Transformed data")
plt.plot(np.log(df["inc"]), m*np.log(df["inc"])+b, c="red", label="Linear regression")
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.legend();

The slope, m, of the transformed data is: 336400693.43172705
The intercept, b, of the transformed data is: -1802204836.0479987


# Now, plug the values for m and b into the relationship between the untransformed x and y
plt.scatter(df["inc"], df["lit"], label="Untransformed data")
plt.plot(df["inc"], (m*np.log(df["inc"])+b)**(1/4), c="red", label="Modeled relationship")
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.legend();


ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]), 
                    index=pd.Series([2006, 2013], 
                    name="Year"))
ppdf


ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");


rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change

Cancer     -53.39312
Abortion    12.85591
Name: Percent Change, dtype: float64


ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");


cps = pd.read_csv("data/edInc2.csv")
cps


cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps


# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
    ax = sns.pointplot(data=cps, x="Education", y="Income", hue="Gender")

ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");


cps.head()


cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", axis="columns")
women = cg.get_group("Women").drop("Gender", axis="columns")
display(men, women)


mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio


ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");


fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio


ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");


co2 = pd.read_csv("data/CAITcountryCO2.csv", skiprows = 2,
                  names = ["Country", "Year", "CO2"], encoding = "ISO-8859-1")
co2.tail()


last_year = co2.Year.iloc[-1]
last_year

2012


q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty


top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()

14


from cycler import cycler

linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)

x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
    ax.semilogy(df[x], df[y], label=name)

ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);

	Continent	Country	Primary completion rate: Male: % of relevant age group: 2015	Primary completion rate: Female: % of relevant age group: 2015	Lower secondary completion rate: Male: % of relevant age group: 2015	Lower secondary completion rate: Female: % of relevant age group: 2015	Youth literacy rate: Male: % of ages 15-24: 2005-14	Youth literacy rate: Female: % of ages 15-24: 2005-14	Adult literacy rate: Male: % ages 15 and older: 2005-14	Adult literacy rate: Female: % ages 15 and older: 2005-14	...	Access to improved sanitation facilities: % of population: 1990	Access to improved sanitation facilities: % of population: 2015	Child immunization rate: Measles: % of children ages 12-23 months: 2015	Child immunization rate: DTP3: % of children ages 12-23 months: 2015	Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016	Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016	Children sleeping under treated bed nets: % of children under age 5: 2009-2016	Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016	Tuberculosis: Treatment success rate: % of new cases: 2014	Tuberculosis: Cases detection rate: % of new estimated cases: 2015
0	Africa	Algeria	106.0	105.0	68.0	85.0	96.0	92.0	83.0	68.0	...	80.0	88.0	95.0	95.0	66.0	42.0	NaN	NaN	88.0	80.0
1	Africa	Angola	NaN	NaN	NaN	NaN	79.0	67.0	82.0	60.0	...	22.0	52.0	55.0	64.0	NaN	NaN	25.9	28.3	34.0	64.0
2	Africa	Benin	83.0	73.0	50.0	37.0	55.0	31.0	41.0	18.0	...	7.0	20.0	75.0	79.0	23.0	33.0	72.7	25.9	89.0	61.0
3	Africa	Botswana	98.0	101.0	86.0	87.0	96.0	99.0	87.0	89.0	...	39.0	63.0	97.0	95.0	NaN	NaN	NaN	NaN	77.0	62.0
5	Africa	Burundi	58.0	66.0	35.0	30.0	90.0	88.0	89.0	85.0	...	42.0	48.0	93.0	94.0	55.0	43.0	53.8	25.4	91.0	51.0

	Country	Year	CO2
30639	Vietnam	2012	173.0497
30640	World	2012	33843.0497
30641	Yemen	2012	20.5386
30642	Zambia	2012	2.7600
30643	Zimbabwe	2012	9.9800

	Country	Year	CO2
30490	China	2012	9312.5329
30634	United States	2012	5122.9094
30514	European Union (28)	2012	3610.5137
30533	India	2012	2075.1808
30596	Russian Federation	2012	1721.5376
30541	Japan	2012	1249.2135
30521	Germany	2012	773.9585
30547	Korea, Rep. (South)	2012	617.2418
30535	Iran	2012	593.8195
30485	Canada	2012	543.0242
30603	Saudi Arabia	2012	480.2278
30478	Brazil	2012	477.7701
30633	United Kingdom	2012	463.4556
30567	Mexico	2012	460.4782

	Country	Year	CO2
18822	Brazil	1950	19.6574
18829	Canada	1950	154.1408
18834	China	1950	78.6478
18858	European Union (28)	1950	1773.6864
18865	Germany	1950	510.7323

Lecture 8 – Data 100, Fall 2023¶

Histograms¶

Toy KDE Example¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Effect of bandwidth hyperparameter $\alpha$¶

KDE Formula¶

Diving into `displot`¶

Scatter Plots¶

Hex Plots¶

Contour Plots¶

Transformations¶

Scale¶

Current Population Survey¶

CO2 Emissions¶

	educ	gender	income
0	1	Men	517
1	1	Women	409
2	2	Men	751
3	2	Women	578
4	3	Men	872
5	3	Women	661
6	4	Men	1249
7	4	Women	965
8	5	Men	1385
9	5	Women	1049

	Income Ratio (M/F)
Education
<HS	1.264059
HS	1.299308
<BA	1.319213
BA	1.294301
>BA	1.320305

	Income Ratio (F/M)
Education
<HS	0.791103
HS	0.769640
<BA	0.758028
BA	0.772618
>BA	0.757401

	Cancer	Abortion
Year
2006	2007371	289750
2013	935573	327000

Lecture 8 – Data 100, Fall 2023¶

Histograms¶

Toy KDE Example¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Effect of bandwidth hyperparameter $\alpha$¶

KDE Formula¶

Diving into displot¶

Scatter Plots¶

Hex Plots¶

Contour Plots¶

Transformations¶

Scale¶

Current Population Survey¶

CO2 Emissions¶

Diving into `displot`¶