import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Load the World Bank dataset, using the first CSV column as the row index
wb = pd.read_csv("data/world_bank.csv", index_col=0)
# Preview the first rows (shown as the cell output when run in a notebook)
wb.head()

# Number of rows (one per country) recorded for each continent
wb["Continent"].value_counts()

Continent
Africa        47
Europe        43
Asia          34
N. America    18
Oceania       13
S. America    11
Name: count, dtype: int64

# Horizontal bar chart of countries per continent via the `datascience` Table API.
# `value_counts().reset_index()` labels its columns differently across pandas
# versions ("index"/"Continent" before 2.0, "Continent"/"count" from 2.0 on),
# so both column labels are pinned explicitly before handing the frame to Table.
from datascience import Table
continent_counts = (
    wb["Continent"]
    .value_counts()
    .rename_axis("Continent")      # label column: the continent names
    .reset_index(name="count")     # value column: number of countries
)
t = Table.from_df(continent_counts)
t.barh("Continent", "count")

# --- matplotlib: tally the countries per continent, then draw the bars by hand ---
continents = wb["Continent"].value_counts()
plt.bar(x=continents.index, height=continents.values)
plt.xlabel("Continents")
plt.ylabel("Counts")
plt.title("Distribution of countries across the continents");

# --- pandas: the counts Series plots itself; only the y-label and title are added ---
wb["Continent"].value_counts().plot.bar()
plt.ylabel("Counts")
plt.title("Distribution of countries across the continents");

# --- seaborn: countplot does the counting itself from the raw column ---
sns.countplot(x='Continent', hue='Continent', data=wb)
plt.title("Distribution of countries across the continents");

# Count plot of GNI per capita: one bar per distinct value in the column.
# NOTE(review): GNI per capita looks like a continuous quantity, so this is
# presumably shown as a counter-example of a poor plot choice — confirm.
sns.countplot(data=wb, x='Gross national income per capita, Atlas method: $: 2016')
plt.title("GNI distribution for different countries");

# Box plot of the same column
sns.boxplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016")
plt.title("The distribution of GNI per capita in different countries");

# Violin plot of the same column
sns.violinplot(data=wb, y="Gross national income per capita, Atlas method: $: 2016")
plt.title("The distribution of GNI per capita in different countries");

# Split countries by whether their 2016 GDP growth lies inside the
# interquartile range, then colour a histogram of GDP growth by that split.
growth_col = 'Gross domestic product: % growth : 2016'

# np.percentile returns NaN if any input is NaN, so drop missing values first
gdp = wb[growth_col].dropna()

q1, q2, q3 = np.percentile(gdp, [25, 50, 75])

wb_quartiles = wb.copy()
growth = wb_quartiles[growth_col]
# Rows with a missing growth value keep category None
wb_quartiles['category'] = None
wb_quartiles.loc[growth.notna(), 'category'] = 'Outside of the middle 50%'
# `between` is inclusive on both ends, so countries sitting exactly on q1 or q3
# are counted in the middle 50% instead of being left uncategorised
wb_quartiles.loc[growth.between(q1, q3), 'category'] = 'In the middle 50%'

# Fix the hue order so the middle 50% always takes the first palette colour
# (blue in the default palette), which is what the title promises
sns.histplot(wb_quartiles, x=growth_col, hue="category",
             hue_order=['In the middle 50%', 'Outside of the middle 50%'])
# Thick rug marks at the three quartiles
sns.rugplot([q1, q2, q3], c="firebrick", lw=6, height=0.1)
plt.title("The distribution of GDP growth with the middle 50% highlighted in blue");

# Box plot of 2016 GDP growth across all countries
sns.boxplot(data=wb, y='Gross domestic product: % growth : 2016')
plt.title("The distribution of gross domestic product: % growth");

# Violin plot of the same column
sns.violinplot(data=wb, y='Gross domestic product: % growth : 2016')
plt.title("The distribution of gross domestic product: % growth");

# One box per continent; the title names the growth rate that is actually
# plotted, matching the two titles above
sns.boxplot(data=wb, x="Continent", y='Gross domestic product: % growth : 2016', hue="Continent")
plt.title("The distribution of gross domestic product: % growth for different continents");

# Pull out the GNI per capita column once; it is reused by the histograms below
gni = wb["Gross national income per capita, Atlas method: $: 2016"]
# The `edgecolor` argument controls the color of the bin edges;
# `density=True` puts densities rather than raw counts on the y-axis
plt.hist(gni, density=True, edgecolor="white")

# Add labels
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("Distribution of gross national income per capita");

# seaborn version of the same histogram; `stat="density"` requests density bars
sns.histplot(data=wb, x="Gross national income per capita, Atlas method: $: 2016", stat="density")
plt.title("Distribution of gross national income per capita");

# Tag every country with a hemisphere derived from its continent
north = ["Asia", "Europe", "N. America"]
south = ["Africa", "Oceania", "S. America"]
for hemisphere, members in (("Northern", north), ("Southern", south)):
    # Rows whose continent is in neither list are left untouched
    wb.loc[wb["Continent"].isin(members), "Hemisphere"] = hemisphere

# Density histogram of GNI per capita, coloured by the new Hemisphere column
sns.histplot(x="Gross national income per capita, Atlas method: $: 2016", hue="Hemisphere", stat="density", data=wb)
plt.title("Distribution of gross national income per capita highlighted for different hemispheres");

# Re-draw the GNI histogram with exactly 5 bins, keeping the bar heights
# (`densities`) and the bin edges (`bins`) that `plt.hist` returns
densities, bins, _ = plt.hist(gni, density=True, edgecolor="white", bins=5)
plt.xlabel("Gross national income per capita")
plt.ylabel("Density")
plt.title("A histogram of the distribution of GNI per capita");

# On a density histogram a bar's area (width * height) is the proportion of
# the data in that bin; multiply by 100 to report it as a percentage
print(f"First bin has width {bins[1]-bins[0]} and height {densities[0]}")
print(f"This corresponds to {bins[1]-bins[0]} * {densities[0]} = {(bins[1]-bins[0])*densities[0]*100}% of the data")

First bin has width 16410.0 and height 4.7741589911386953e-05
This corresponds to 16410.0 * 4.7741589911386953e-05 = 78.343949044586% of the data

# Rename the very long column name for convenience
# NOTE(review): the source column is antiretroviral therapy *coverage*, so
# "HIV rate" is a loose label — confirm the intended name
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate"})

# With 5 bins, it seems that there is only one peak
sns.histplot(data=wb, x="HIV rate", stat="density", bins=5)
plt.title("5 histogram bins");

# With 10 bins, there seem to be two peaks

sns.histplot(data=wb, x="HIV rate", stat="density", bins=10)
plt.title("10 histogram bins");

# And with 20 bins, it becomes hard to say what counts as a "peak"!

sns.histplot(data=wb, x ="HIV rate", stat="density", bins=20)
plt.title("20 histogram bins");

# The smooth curve overlaid on the histogram is a KDE
# (`kde=True` asks seaborn to draw it on top of the density histogram)
sns.displot(data=wb, x="HIV rate", kde=True, stat="density")
plt.title("Histogram and overlaid KDE on HIV rate distribution");

# Toy dataset used below to build a KDE step by step
points = [2.2, 2.8, 3.7, 5.3, 5.7]

# Density histogram with bin edges 0, 2, 4, 6, 8 and white bar outlines
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);

def gaussian(x, z, a):
    """Gaussian kernel centred at `z` with bandwidth `a`, evaluated at `x`.

    Computes 1 / sqrt(2*pi*a^2) * exp(-(x - z)^2 / (2*a^2)). `x` may be a
    scalar or a NumPy array; the result has the matching shape.
    """
    variance = a**2
    scale = 1/np.sqrt(2*np.pi*variance)
    exponent = -(x - z)**2 / (2 * variance)
    return scale * np.exp(exponent)

def boxcar_basic(x, z, a):
    """Scalar boxcar kernel of width `a` centred at `z`.

    Returns 1/a when `x` lies within a/2 of `z` (boundary included) and 0
    otherwise. Works on a single value only, not on arrays.
    """
    return 1/a if np.abs(x - z) <= a/2 else 0

def boxcar(x, z, a):
    """Vectorised boxcar kernel of width `a` centred at `z`.

    Returns an array shaped like `x` holding 1/a wherever `x` lies within a/2
    of `z` (boundary included) and 0.0 elsewhere. A scalar `x` yields a 0-d
    array.

    `np.where` is used rather than `np.piecewise` because `piecewise` writes
    into an array of `x`'s dtype, which truncates 1/a to 0 for integer `x`.
    """
    distance = np.abs(np.asarray(x) - z)
    return np.where(distance <= a / 2, 1 / a, 0.0)

def create_kde(kernel, pts, a):
    """Build a kernel density estimate and return it as a function of x.

    `kernel(x, z, a)` is evaluated once per data point `z` in `pts` with
    bandwidth `a`; the returned function averages those kernel values.
    """
    def f(x):
        # Sum one kernel per data point, then divide by the number of points
        return sum(kernel(x, center, a) for center in pts) / len(pts)
    return f

def plot_kde(kernel, pts, a):
    """Plot the KDE of `pts` built from `kernel` with bandwidth `a`.

    The estimate comes from `create_kde` and is evaluated one x-value at a
    time on 1000 evenly spaced points spanning 5 units beyond the smallest
    and largest data point. Returns the new (figure, axes) pair.
    """
    density = create_kde(kernel, pts, a)
    grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    heights = list(map(density, grid))
    figure, axes = plt.subplots()
    axes.plot(grid, heights)
    return figure, axes
    
def plot_separate_kernels(kernel, pts, a, norm=False):
    """Draw one kernel curve per data point on a shared set of axes.

    Each curve is `kernel` evaluated on 1000 evenly spaced x-values spanning
    5 units beyond the smallest and largest point. With `norm=True` every
    curve is divided by the number of points. Returns the (figure, axes) pair.
    """
    figure, axes = plt.subplots()
    grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for center in pts:
        curve = kernel(grid, center, a)
        if norm:
            curve /= len(pts)
        axes.plot(grid, curve)
    return figure, axes

# The raw sample drawn as a rug, on the same axis limits as the plots below
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height=0.5)
plt.title("sample dataset");

# One Gaussian kernel (a=1) centred on each data point
fig, ax = plot_separate_kernels(gaussian, points, a=1)
ax.set_title("Overlaid Gaussians on each data point")
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

# The same kernels, each divided by the number of points (norm=True)
fig, ax = plot_separate_kernels(gaussian, points, a=1, norm=True)
ax.set_title("Normalized overlaid Gaussians on each data point")
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

# The resulting KDE curve from plot_kde
fig, ax = plot_kde(gaussian, points, a=1)
ax.set_title("KDE estimate")
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

# seaborn's KDE drawn together with a 2-bin density histogram of the toy data
sns.kdeplot(points, bw_method=0.65)  # magic value!
sns.histplot(points, stat='density', bins=2);

# Histogram and KDE in a single call; `kde_kws` carries the KDE settings
sns.histplot(points, bins=2, kde=True, stat='density', 
             kde_kws=dict(cut=3, bw_method=0.65));

# `bw_adjust=2` scales seaborn's chosen bandwidth by a factor of 2
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');

# KDE of the toy data with the Gaussian kernel, a=1
fig, ax = plot_kde(gaussian, points, a=1)
ax.set_title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

# Same data and bandwidth with the boxcar kernel, on identical axis limits
fig, ax = plot_kde(boxcar, points, a=1)
ax.set_title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
ax.set_xlim(-3, 10)
ax.set_ylim(0, 0.5);

# Load the `tips` example dataset through seaborn
tips = sns.load_dataset('tips')

# Preview the first rows
tips.head()

# The column whose distribution is estimated below
vals = tips['total_bill']

# Histogram of the bills with a rug of the individual values on the same axes
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);

# Gaussian KDE of `vals` at four bandwidths (a = 0.1, 1, 2, 5) for comparison.
# The y-limit is always set through the axes returned by plot_kde, so every
# cell addresses its own plot instead of pyplot's implicit current axes.
fig, ax = plot_kde(gaussian, vals, a=0.1)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
ax.set_ylim(0, 0.15);

fig, ax = plot_kde(gaussian, vals, a=1)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
ax.set_ylim(0, 0.1);

fig, ax = plot_kde(gaussian, vals, a=2)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
ax.set_ylim(0, 0.1);

fig, ax = plot_kde(gaussian, vals, a=5)
ax.set_title(r'KDE of tips with Gaussian kernel and $\alpha$ = 5')
ax.set_ylim(0, 0.1);

	Continent	Country	Primary completion rate: Male: % of relevant age group: 2015	Primary completion rate: Female: % of relevant age group: 2015	Lower secondary completion rate: Male: % of relevant age group: 2015	Lower secondary completion rate: Female: % of relevant age group: 2015	Youth literacy rate: Male: % of ages 15-24: 2005-14	Youth literacy rate: Female: % of ages 15-24: 2005-14	Adult literacy rate: Male: % ages 15 and older: 2005-14	Adult literacy rate: Female: % ages 15 and older: 2005-14	...	Access to improved sanitation facilities: % of population: 1990	Access to improved sanitation facilities: % of population: 2015	Child immunization rate: Measles: % of children ages 12-23 months: 2015	Child immunization rate: DTP3: % of children ages 12-23 months: 2015	Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016	Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016	Children sleeping under treated bed nets: % of children under age 5: 2009-2016	Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016	Tuberculosis: Treatment success rate: % of new cases: 2014	Tuberculosis: Cases detection rate: % of new estimated cases: 2015
0	Africa	Algeria	106.0	105.0	68.0	85.0	96.0	92.0	83.0	68.0	...	80.0	88.0	95.0	95.0	66.0	42.0	NaN	NaN	88.0	80.0
1	Africa	Angola	NaN	NaN	NaN	NaN	79.0	67.0	82.0	60.0	...	22.0	52.0	55.0	64.0	NaN	NaN	25.9	28.3	34.0	64.0
2	Africa	Benin	83.0	73.0	50.0	37.0	55.0	31.0	41.0	18.0	...	7.0	20.0	75.0	79.0	23.0	33.0	72.7	25.9	89.0	61.0
3	Africa	Botswana	98.0	101.0	86.0	87.0	96.0	99.0	87.0	89.0	...	39.0	63.0	97.0	95.0	NaN	NaN	NaN	NaN	77.0	62.0
5	Africa	Burundi	58.0	66.0	35.0	30.0	90.0	88.0	89.0	85.0	...	42.0	48.0	93.0	94.0	55.0	43.0	53.8	25.4	91.0	51.0

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 7 – Data 100, Spring 2025¶

Bar Plots¶

Bar plot using `matplotlib`¶

`pandas` native bar plot¶

Bar plot using `seaborn`¶

Box Plots and Violin Plots¶

Box Plots¶

Violin Plots¶

Histograms¶

Mode in Histograms¶

Kernel Density Estimation¶

How KDE Works¶

Steps to Construct a KDE¶

KDE Formula¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Gaussian¶

Boxcar¶

Effect of Bandwidth Hyperparameter $\alpha$¶

Lecture 7 – Data 100, Spring 2025¶

Bar Plots¶

Bar plot using matplotlib¶

pandas native bar plot¶

Bar plot using seaborn¶

Box Plots and Violin Plots¶

Box Plots¶

Violin Plots¶

Histograms¶

Mode in Histograms¶

Kernel Density Estimation¶

How KDE Works¶

Steps to Construct a KDE¶

KDE Formula¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Gaussian¶

Boxcar¶

Effect of Bandwidth Hyperparameter $\alpha$¶

Bar plot using `matplotlib`¶

`pandas` native bar plot¶

Bar plot using `seaborn`¶