Lecture 7 – Data 100, Summer 2023¶

Adapted from Ani Adhikari, Suraj Rampure, Fernando Pérez, Josh Hug, and Narges Norouzi

Updated by Bella Crouch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In this lecture, we will demonstrate visualization techniques on the World Bank dataset. This dataset includes information about countries and development statistics from around the world.

In [2]:
wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb.head()
Out[2]:
[First five rows of the DataFrame: the countries Algeria, Angola, Benin, Botswana, and Burundi (all in Africa), each with 47 columns of development statistics, ranging from education measures (primary and lower secondary completion rates, youth and adult literacy rates, broken down by sex) to health measures (sanitation access, child immunization rates, and tuberculosis treatment and detection rates). The table is too wide to render legibly here.]

5 rows × 47 columns

In [3]:
wb.shape
Out[3]:
(166, 47)

Bar Plots¶

We often use bar plots to display the distribution of a categorical variable.

In the examples below, we plot the distribution of the "Continent" column. The cell below uses .value_counts() to determine the number of countries corresponding to each continent in the dataset.

In [4]:
wb["Continent"].value_counts()
Out[4]:
Africa        47
Europe        43
Asia          34
N. America    18
Oceania       13
S. America    11
Name: Continent, dtype: int64

In Data 8, you used the datascience library to generate plots. The code to plot the distribution of the "Continent" column may have looked like this:

from datascience import Table
t = Table.from_df(wb["Continent"].value_counts().reset_index())
t.barh("index", "Continent")

In Data 100, we will use the Matplotlib and Seaborn plotting libraries to create visualizations. First, let's generate a bar plot using the Matplotlib function plt.bar.

In [5]:
continents = wb["Continent"].value_counts()
plt.bar(continents.index, continents);

Note that we concluded our call to plt.bar with a semicolon (;). This suppresses any unnecessary output other than the plot. If we do not include a semicolon, the plot will still be generated; however, we will also see extraneous text:

In [6]:
plt.bar(continents.index, continents)
Out[6]:
<BarContainer object of 6 artists>

Equivalently, we could use the countplot function of the Seaborn library to create our bar plot.

In [7]:
sns.countplot(data = wb, x = 'Continent');

A third visualization library that we will use occasionally in Data 100 is called Plotly. Plotly adds interactivity to plots! Try hovering over the bars in the plot below.

In [8]:
import plotly.express as px
px.histogram(wb, x = 'Continent', color = 'Continent')

Above, we noted that bar plots are used to visualize the distribution of a qualitative (categorical) variable. Why shouldn't they be used for quantitative variables? Consider what happens when we try to use sns.countplot to visualize a quantitative variable, gross national income per capita.

In [9]:
sns.countplot(data = wb, x = 'Gross national income per capita, Atlas method: $: 2016');

What happened? A bar plot (either plt.bar or sns.countplot) creates a separate bar for each unique value of a variable. A continuous variable typically takes on a huge number of distinct values, so we would need many, many bars to display each one.
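To see the scale of the problem, we can count the distinct values directly (a quick check; this line is not part of the original output):

# Nearly every country reports its own distinct income value, so a bar
# plot would need close to one bar per row of the DataFrame
wb['Gross national income per capita, Atlas method: $: 2016'].nunique()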

To visualize the distribution of a continuous variable, we use a different type of plot:

  • Histogram
  • Box plot
  • Violin plot

Box Plots and Violin Plots¶

Box plots and violin plots are two very similar kinds of visualizations. Both display the distribution of a variable using information about quartiles.

In a box plot, the width of the box at any point does not encode meaning. In a violin plot, the width of the plot indicates the density of the distribution at each possible value.

In [10]:
sns.boxplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016");
In [11]:
sns.violinplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016");

The quartiles split the data into four portions, each containing 25% of the data. We say that:

  • The first quartile (Q1) represents the 25th percentile – 25% of the data lies below the first quartile.
  • The second quartile (Q2) represents the 50th percentile, also known as the median – 50% of the data lies below the second quartile.
  • The third quartile (Q3) represents the 75th percentile – 75% of the data lies below the third quartile.

This means that the middle 50% of the data lies between the first and third quartiles. This is demonstrated in the histogram below. The three quartiles are marked with red vertical bars.

In [12]:
gdp = wb['Gross domestic product: % growth : 2016']
gdp = gdp[~gdp.isna()]

q1, q2, q3 = np.percentile(gdp, [25, 50, 75])

wb_quartiles = wb.copy()
wb_quartiles['category'] = None
wb_quartiles.loc[(wb_quartiles['Gross domestic product: % growth : 2016'] < q1) | (wb_quartiles['Gross domestic product: % growth : 2016'] > q3), 'category'] = 'Outside of the middle 50%'
wb_quartiles.loc[(wb_quartiles['Gross domestic product: % growth : 2016'] > q1) & (wb_quartiles['Gross domestic product: % growth : 2016'] < q3), 'category'] = 'In the middle 50%'

sns.histplot(wb_quartiles, x="Gross domestic product: % growth : 2016", hue="category")
sns.rugplot([q1, q2, q3], c="firebrick", lw=6, height=0.1);

In a box plot, the lower extent of the box lies at Q1, while the upper extent of the box lies at Q3. The horizontal line in the middle of the box corresponds to Q2 (equivalently, the median).

In [13]:
sns.boxplot(data=wb, y='Gross domestic product: % growth : 2016');
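As an aside, the whiskers in the box plot above follow the standard matplotlib/seaborn convention: they extend to the most extreme datapoints within 1.5 times the interquartile range (IQR) of the box, and anything beyond is drawn as an individual outlier point. A minimal sketch of those fences, reusing the quartiles computed above:

# Whisker fences under the 1.5 * IQR convention (the seaborn default)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
print(f"IQR = {iqr:.2f}; fences at [{lower_fence:.2f}, {upper_fence:.2f}]")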

A violin plot displays quartile information, albeit a bit more subtly. Look closely at the center vertical bar of the violin plot below!

In [14]:
sns.violinplot(data=wb, y='Gross domestic product: % growth : 2016');

Plotting side-by-side box or violin plots allows us to compare distributions across different categories. In other words, they enable us to plot both a qualitative variable and a quantitative continuous variable in one visualization.

Seaborn allows us to easily create side-by-side plots by specifying both an x and a y column.

In [15]:
sns.boxplot(data=wb, x="Continent", y='Gross domestic product: % growth : 2016');

Histograms¶

You are likely familiar with histograms from Data 8. A histogram collects continuous data into bins, then plots this binned data. The height of each bar reflects the density of datapoints with values that lie between the left and right edges of its bin.

In [16]:
# The `edgecolor` argument controls the color of the bin edges
gni = wb["Gross national income per capita, Atlas method: $: 2016"]
plt.hist(gni, density=True, edgecolor="white")

# Add labels
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("Distribution of gross national income per capita");
In [17]:
sns.histplot(data = wb, x = "Gross national income per capita, Atlas method: $: 2016", stat="density")
plt.title("Distribution of gross national income per capita");

We can overlay histograms (or density curves) to compare distributions across qualitative categories.

The hue parameter of sns.histplot specifies the column that should be used to determine the color of each category. hue can be used in many Seaborn plotting functions.

Notice that the resulting plot includes a legend describing which color corresponds to each hemisphere – a legend should always be included if color is used to encode information in a visualization!

In [18]:
# Create a new variable to store the hemisphere in which each country is located
north = ["Asia", "Europe", "N. America"]
south = ["Africa", "Oceania", "S. America"]
wb.loc[wb["Continent"].isin(north), "Hemisphere"] = "Northern"
wb.loc[wb["Continent"].isin(south), "Hemisphere"] = "Southern"
In [19]:
sns.histplot(data = wb, x = "Gross national income per capita, Atlas method: $: 2016", hue="Hemisphere", stat="density")
plt.title("Distribution of gross national income per capita");

Each bin of a histogram is scaled such that its area is equal to the proportion of all datapoints that it contains.

In [20]:
densities, bins, _ = plt.hist(gni, density=True, edgecolor="white", bins=5)
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")

print(f"First bin has width {bins[1]-bins[0]} and height {densities[0]}")
print(f"This corresponds to {bins[1]-bins[0]} * {densities[0]} = {(bins[1]-bins[0])*densities[0]*100}% of the data")
First bin has width 16410.0 and height 4.7741589911386953e-05
This corresponds to 16410.0 * 4.7741589911386953e-05 = 78.343949044586% of the data
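As a quick sanity check (an addition, not in the original notebook), the areas of all the bins should sum to 1:

# With density=True, the total area of the bars (width * height) is 1
print(np.sum(densities * np.diff(bins)))  # expect 1.0, up to floating point error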

In Data 100, we describe a "mode" of a histogram as a peak in the distribution. Often, however, it is difficult to determine what counts as a distinct "peak." For example, the number of peaks in the distribution of HIV rates across different countries varies depending on the number of histogram bins we plot.

In [21]:
# Rename the very long column name for convenience
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate"})

# With 5 bins, it seems that there is only one peak
sns.histplot(data = wb, x = "HIV rate", stat="density", bins=5)
plt.title("5 histogram bins");
In [22]:
# With 10 bins, there seem to be two peaks

sns.histplot(data = wb, x = 'HIV rate', stat="density", bins=10)
plt.title("10 histogram bins");
In [23]:
# And with 20 bins, it becomes hard to say what counts as a "peak"!

sns.histplot(data = wb, x = 'HIV rate', stat="density", bins=20)
plt.title("20 histogram bins");

As this example illustrates, it is sometimes more useful to understand the general structure of our data, rather than focus on individual observations. Kernel density estimation helps with this goal.

Kernel Density Estimation¶

Kernel density estimation (KDE) allows us to "smooth" a distribution to display general trends and eliminate noisy, distracting detail.

In [24]:
# The smooth curve overlaid on the histogram is a KDE
sns.displot(data = wb, x = "HIV rate", kde=True, stat="density");

To illustrate the process of constructing a KDE curve, we'll use a fake dataset of just five datapoints, contained in the list points.

In [25]:
points = [2.2, 2.8, 3.7, 5.3, 5.7]
In [26]:
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);

Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.

In [27]:
def gaussian(x, z, a):
    # Gaussian kernel centered at z with bandwidth alpha = a
    return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))

def boxcar_basic(x, z, a):
    # Boxcar kernel (this version works on scalar x only)
    if np.abs(x - z) <= a/2:
        return 1/a
    return 0

def boxcar(x, z, a):
    # Boxcar kernel, vectorized over x using np.piecewise
    cond = np.abs(x - z)
    return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0])
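Before using these kernels, we can numerically verify (an aside, not in the original notebook) that each one integrates to 1; this is what makes the normalization step in the KDE construction below work out to a simple division by the number of points:

# Check that each kernel has total area 1, via numerical integration
xs = np.linspace(-50, 50, 100_001)
print(np.trapz(gaussian(xs, z=0, a=1), xs))  # approximately 1.0
print(np.trapz(boxcar(xs, z=0, a=1), xs))    # approximately 1.0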
In [28]:
def create_kde(kernel, pts, a):
    # Takes in a kernel, set of points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f

def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);
    
def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        y = kernel(x, pt, a)
        if norm:
            y /= len(pts)
        plt.plot(x, y)
    
    plt.show();

Here are our five points represented as vertical bars.

In [29]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);

Step 1: Place a kernel at each point¶

We'll start with the Gaussian kernel.

In [30]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);

Step 2: Normalize kernels so that total area is 1¶

Each kernel individually has area 1, so after placing n kernels we divide each by n; the sum of the normalized kernels then has total area 1.

In [31]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);

Step 3: Sum all kernels together¶

In [32]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)

This looks identical to the smooth curve that sns.kdeplot gives us (when we set the appropriate bandwidth parameter):

In [33]:
sns.kdeplot(points, bw_method=0.65)  # magic value!
sns.histplot(points, stat='density', bins=2);

You can also get a very similar result in a single call by requesting the KDE be added to the histogram, with kde=True and some extra keywords:

In [34]:
sns.histplot(points, bins=2, kde=True, stat='density', 
             kde_kws=dict(cut=3, bw_method=0.65));
In [35]:
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');
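For reference, Seaborn computes its Gaussian KDE much like scipy.stats.gaussian_kde, which we can call directly if we want the estimated density values themselves (a sketch, assuming scipy is available):

from scipy import stats

# Evaluate a Gaussian KDE directly to get density values as numbers
kde = stats.gaussian_kde(points, bw_method=0.65)
xs = np.linspace(-3, 10, 1000)
plt.plot(xs, kde(xs));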

Kernels¶

Gaussian

$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$

Boxcar

$$K_{\alpha}(x, x_i) = \begin {cases} \frac{1}{\alpha}, \: \: \: |x - x_i| \leq \frac{\alpha}{2}\\ 0, \: \: \: \text{else} \end{cases}$$
In [36]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)
In [37]:
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)

Effect of bandwidth hyperparameter $\alpha$¶

Let's bring in some (different) toy data.

In [38]:
tips = sns.load_dataset('tips')
In [39]:
tips.head()
Out[39]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [40]:
vals = tips['total_bill']
In [41]:
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);
In [42]:
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)
In [43]:
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)
In [44]:
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)
In [45]:
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 5')
plot_kde(gaussian, vals, a = 5)

KDE Formula¶

$$f_{\alpha}(x) = \sum_{i = 1}^n \frac{1}{n} \cdot K_{\alpha}(x, x_i) = \frac{1}{n} \sum_{i = 1}^n K_{\alpha}(x, x_i)$$
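As a sketch, the formula above can be written directly in vectorized NumPy, equivalent to the create_kde loop defined earlier (using the gaussian kernel from this notebook):

# Vectorized KDE: evaluate every kernel at every x via broadcasting,
# then average over the n datapoints
def kde_vectorized(x, pts, a):
    x, pts = np.asarray(x), np.asarray(pts)
    return gaussian(x[None, :], pts[:, None], a).mean(axis=0)

xs = np.linspace(-3, 10, 1000)
plt.plot(xs, kde_vectorized(xs, points, a=1));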

Scatter Plots¶

Scatter plots are used to visualize the relationship between two quantitative continuous variables.

In [46]:
plt.scatter(wb['per capita: % growth: 2016'], wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'])
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");
In [47]:
sns.scatterplot(data = wb, hue = "Continent", x = 'per capita: % growth: 2016', \
                y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14')
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");

The plots above suffer from overplotting – many scatter points are stacked on top of one another (particularly in the upper right region of the plot).

Jittering is a process used to address overplotting: a small amount of random noise is added to the x and y values of all datapoints.

Decreasing the size of each scatter point using the s parameter of plt.scatter also helps.

In [48]:
random_x_noise = np.random.uniform(-1, 1, len(wb))
random_y_noise = np.random.uniform(-5, 5, len(wb))

plt.scatter(wb['per capita: % growth: 2016']+random_x_noise, \
            wb['Adult literacy rate: Female: % ages 15 and older: 2005-14']+random_y_noise, s=15)

plt.xlabel("% growth per capita (jittered)")
plt.ylabel("Female adult literacy rate (jittered)");
In [49]:
sns.lmplot(data = wb, x = 'per capita: % growth: 2016', \
           y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14');
In [50]:
sns.jointplot(data = wb, x = 'per capita: % growth: 2016', \
              y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14');

Hex Plots¶

Rather than plotting individual datapoints, a hex plot displays the density of datapoints across the 2D plane. A darker hexagon means that more datapoints lie in that region.

In [51]:
sns.jointplot(data = wb, x = 'per capita: % growth: 2016', \
              y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14',
              kind = 'hex');

Contour Plots¶

Contour plots are similar to topographic maps: points along a given contour line have the same estimated density of datapoints, and the region with the darkest color contains the most datapoints. We can think of a contour plot as the 2D equivalent of a KDE curve.

In [52]:
sns.kdeplot(data = wb, x = 'per capita: % growth: 2016', \
              y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14', fill=True);
In [53]:
sns.jointplot(data = wb, x = 'per capita: % growth: 2016', \
              y = 'Adult literacy rate: Female: % ages 15 and older: 2005-14',
              kind = 'kde');

Transformations¶

Often, our reason for visualizing relationships as we did above is because we then want to model them. We will start talking about the theory and math underlying modeling processes next week.

We will focus a lot on linear modeling in Data 100. This means that it is often helpful to transform and linearize our data such that it shows roughly a linear relationship. There are a few reasons for this:

  • Transforming data makes visualizations easier to interpret
  • Linear relationships are straightforward to understand – we have ideas of what slopes and intercepts mean
  • Later on in the course, the ability to linearize data will help us make more effective models
In [54]:
# Some data cleaning to help with the next example

df = pd.DataFrame(index=wb.index)
df['lit'] = wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'] \
            + wb["Adult literacy rate: Male: % ages 15 and older: 2005-14"]
df['inc'] = wb['Gross national income per capita, Atlas method: $: 2016']
df.dropna(inplace=True)

plt.scatter(df["inc"], df["lit"])
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate");

What is making this plot non-linear?

  • There are a few extremely large values for gross national income that are distorting the horizontal scale of the plot. If we rescaled the x-values such that these large values become proportionally smaller, the plot would be more linear.
  • There are too many large values of adult literacy rate clumped together at the top of the plot. If we rescaled the y-axis such that large values of y are more spread out, the plot would be more linear.

First, we can transform the x-values such that very large values of x become smaller. This can be achieved by performing a log transformation of the gross national income data. The logarithm compresses large values proportionally much more than small ones: taking the log of a very large number shrinks it dramatically relative to its original value, while a small number changes far less.

In [55]:
# np.log computes the natural (base e) logarithm

plt.scatter(np.log(df["inc"]), df["lit"])
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate");

Already, the relationship is starting to look more linear! Now, we'll address the vertical scaling.

To reduce the clumping of datapoints near the top of the plot, we want to spread out large values of y without substantially changing small values of y. We can do this by applying a power transformation – that is, by raising the y-values to a power. Below, we raise all y-values to the power of 4.

In [56]:
plt.scatter(np.log(df["inc"]), df["lit"]**4)
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)");

Our transformed variables now seem to follow a linear relationship!

$$y^4 = m(\log{x}) + b$$

We can use this fact to uncover new information about the original, untransformed variables.

$$y = [m(\log{x}) + b]^{1/4}$$

In the cell below, we first fit a regression line to the transformed data to find values for the slope ($m$) and intercept ($b$). Then, we plug these values into the relationship we derived for the untransformed variables. We find a mathematical relationship relating the gross national income and the adult literacy rate.

In [57]:
# The code below fits a linear regression model. We'll discuss it at length in a future lecture
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(np.log(df[["inc"]]), df["lit"]**4)
m, b = model.coef_[0], model.intercept_

print(f"The slope, m, of the transformed data is: {m}")
print(f"The intercept, b, of the transformed data is: {b}")

df = df.sort_values("inc")
plt.scatter(np.log(df["inc"]), df["lit"]**4, label="Transformed data")
plt.plot(np.log(df["inc"]), m*np.log(df["inc"])+b, c="red", label="Linear regression")
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.legend();
The slope, m, of the transformed data is: 336400693.43172705
The intercept, b, of the transformed data is: -1802204836.0479987
In [58]:
# Now, plug the values for m and b into the relationship between the untransformed x and y
plt.scatter(df["inc"], df["lit"], label="Untransformed data")
plt.plot(df["inc"], (m*np.log(df["inc"])+b)**(1/4), c="red", label="Modeled relationship")
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.legend();

We've been able to find a fairly close approximation for the relationship between the original variables!
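To put a number on "fairly close," we can check the correlation between the transformed variables (a quick check, not from the original notebook):

# The correlation between log(income) and literacy^4 measures how
# linear the transformed relationship is
r = np.corrcoef(np.log(df["inc"]), df["lit"]**4)[0, 1]
print(f"Correlation of the transformed variables: r = {r:.3f}")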