import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
wb = pd.read_csv("data/world_bank.csv", index_col=0)
wb = wb.rename(columns={'Antiretroviral therapy coverage: % of people living with HIV: 2015':"HIV rate",
'Gross national income per capita, Atlas method: $: 2016':'gni'})
wb.head()
Continent | Country | Primary completion rate: Male: % of relevant age group: 2015 | Primary completion rate: Female: % of relevant age group: 2015 | Lower secondary completion rate: Male: % of relevant age group: 2015 | Lower secondary completion rate: Female: % of relevant age group: 2015 | Youth literacy rate: Male: % of ages 15-24: 2005-14 | Youth literacy rate: Female: % of ages 15-24: 2005-14 | Adult literacy rate: Male: % ages 15 and older: 2005-14 | Adult literacy rate: Female: % ages 15 and older: 2005-14 | ... | Access to improved sanitation facilities: % of population: 1990 | Access to improved sanitation facilities: % of population: 2015 | Child immunization rate: Measles: % of children ages 12-23 months: 2015 | Child immunization rate: DTP3: % of children ages 12-23 months: 2015 | Children with acute respiratory infection taken to health provider: % of children under age 5 with ARI: 2009-2016 | Children with diarrhea who received oral rehydration and continuous feeding: % of children under age 5 with diarrhea: 2009-2016 | Children sleeping under treated bed nets: % of children under age 5: 2009-2016 | Children with fever receiving antimalarial drugs: % of children under age 5 with fever: 2009-2016 | Tuberculosis: Treatment success rate: % of new cases: 2014 | Tuberculosis: Cases detection rate: % of new estimated cases: 2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Africa | Algeria | 106.0 | 105.0 | 68.0 | 85.0 | 96.0 | 92.0 | 83.0 | 68.0 | ... | 80.0 | 88.0 | 95.0 | 95.0 | 66.0 | 42.0 | NaN | NaN | 88.0 | 80.0 |
1 | Africa | Angola | NaN | NaN | NaN | NaN | 79.0 | 67.0 | 82.0 | 60.0 | ... | 22.0 | 52.0 | 55.0 | 64.0 | NaN | NaN | 25.9 | 28.3 | 34.0 | 64.0 |
2 | Africa | Benin | 83.0 | 73.0 | 50.0 | 37.0 | 55.0 | 31.0 | 41.0 | 18.0 | ... | 7.0 | 20.0 | 75.0 | 79.0 | 23.0 | 33.0 | 72.7 | 25.9 | 89.0 | 61.0 |
3 | Africa | Botswana | 98.0 | 101.0 | 86.0 | 87.0 | 96.0 | 99.0 | 87.0 | 89.0 | ... | 39.0 | 63.0 | 97.0 | 95.0 | NaN | NaN | NaN | NaN | 77.0 | 62.0 |
5 | Africa | Burundi | 58.0 | 66.0 | 35.0 | 30.0 | 90.0 | 88.0 | 89.0 | 85.0 | ... | 42.0 | 48.0 | 93.0 | 94.0 | 55.0 | 43.0 | 53.8 | 25.4 | 91.0 | 51.0 |
5 rows × 47 columns
Often, we want to identify general trends across a distribution, rather than focus on detail. Smoothing a distribution helps generalize the structure of the data and eliminate noise.
Approximate the probability distribution that generated the data by:
points = [2.2, 2.8, 3.7, 5.3, 5.7]
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
# Gaussian kernel
return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))
def boxcar_basic(x, z, a):
# Boxcar kernel
if np.abs(x - z) <= a/2:
return 1/a
return 0
def boxcar(x, z, a):
# Boxcar kernel
cond = np.abs(x - z)
return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )
def create_kde(kernel, pts, a):
# Takes in a kernel, set of points, and alpha
# Returns the KDE as a function
def f(x):
output = 0
for pt in pts:
output += kernel(x, pt, a)
return output / len(pts) # Normalization factor
return f
def plot_kde(kernel, pts, a):
# Calls create_kde and plots the corresponding KDE
f = create_kde(kernel, pts, a)
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
y = [f(xi) for xi in x]
plt.plot(x, y);
def plot_separate_kernels(kernel, pts, a, norm=False):
# Plots individual kernels, which are then summed to create the KDE
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
for pt in pts:
y = kernel(x, pt, a)
if norm:
y /= len(pts)
plt.plot(x, y)
plt.show();
Here are our five points.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height=0.5);
We'll start with the Gaussian kernel.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a=1);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a=1, norm=True);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a=1)
sns.kdeplot(points, bw_method=0.65) # Magic value!
sns.histplot(points, stat='density', bins=2);
You can also get a very similar result in a single call by requesting the KDE be added to the histogram, with kde=True
and some extra keywords:
sns.histplot(points, bins=2, kde=True, stat='density',
kde_kws=dict(cut=4, bw_method=0.65));
sns.kdeplot(points)
sns.histplot(points, stat='density');
Gaussian
$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$Boxcar
$$K_{\alpha}(x, x_i) = \begin {cases} \frac{1}{\alpha}, \: \: \: |x - x_i| \leq \frac{\alpha}{2}\\ 0, \: \: \: \text{else} \end{cases}$$plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a=1)
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a=1)
vals = wb['HIV rate'].dropna()
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.25)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a=0.1)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.05)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a=1)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.04)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a=2)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.04)
plt.xlim(-1, 100)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a=5)
displot
¶Seaborn documentation for sns.displot
lets you specify the kind
of plot.
When plotting a histogram, you can pass in histplot
(link) parameters to displot
to specify histogram-specific features.
For example, stat=density
normalizes the histogram such that the area under the histogram is 1.
sns.displot(data=wb,
x="gni",
kind="hist",
stat="density") # default: stat=count and density integrates to 1
plt.title("Distribution of gross national income per capita");
What does it mean to specify kind=kde
? We will explore this!
sns.displot(data=wb,
x="gni",
kind='kde')
plt.title("Distribution of gross national income per capita");
sns.displot(data=wb,
x="gni",
kind='ecdf')
plt.title("Cumulative Distribution of gross national income per capita");
Scatter plots are used to visualize the relationship between two quantitative continuous variables.
plt.scatter(wb['per capita: % growth: 2016'], wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'])
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");
sns.scatterplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14', hue="Continent")
plt.xlabel("% growth per capita")
plt.ylabel("Female adult literacy rate");
The plots above suffer from overplotting – many scatter points are stacked on top of one another (particularly in the upper right region of the plot).
Jittering is a processed used to address overplotting. A small amount of random noise is added to the x and y values of all datapoints.
Decreasing the size of each scatter point using the s
parameter of plt.scatter
also helps.
random_x_noise = np.random.uniform(-1, 1, len(wb))
random_y_noise = np.random.uniform(-5, 5, len(wb))
plt.scatter(wb['per capita: % growth: 2016']+random_x_noise, \
wb['Adult literacy rate: Female: % ages 15 and older: 2005-14']+random_y_noise, s=15)
plt.xlabel("% growth per capita (jittered)")
plt.ylabel("Female adult literacy rate (jittered)");
sns.lmplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14');
sns.jointplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14');
Rather than plot individual datapoints, plot the density of how datapoints are distributed in 2D. A darker hexagon means that more datapoints lie in that region.
sns.jointplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
kind='hex');
Contour plots are similar to topographic maps. Contour lines of the same color have the same density of datapoints. The region with the darkest color contains the most datapoints of all regions.We can think of a contour plot as the 2D equivalent of a KDE curve.
sns.kdeplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14', fill=True);
sns.jointplot(data=wb, x='per capita: % growth: 2016', \
y='Adult literacy rate: Female: % ages 15 and older: 2005-14',
kind='kde');
Often, our reason for visualizing relationships like we did above is beause we then want to model these relationships. We will start talking about the theory and math underlying modeling processes next week.
We will focus a lot on linear modeling in Data 100. This means that it is often helpful to transform and linearize our data such that it shows roughly a linear relationship. There are a few reasons for this:
# Some data cleaning to help with the next example
df = pd.DataFrame(index=wb.index)
df['lit'] = wb['Adult literacy rate: Female: % ages 15 and older: 2005-14'] \
+ wb["Adult literacy rate: Male: % ages 15 and older: 2005-14"]
df['inc'] = wb['gni']
df.dropna(inplace=True)
plt.scatter(df["inc"], df["lit"])
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate");
What is making this plot non-linear?
First, we can transform the x-values such that very large values of x become smaller. This can be achieved by performing a log transformation of the gross national income data. When we take the logarithm of a large number, this number becomes proportionally much smaller relative to its original value. When we take the log of a small number, the number does not change very significantly relative to its starting value.
# np.log compute the natural (base e) logarithm
plt.scatter(np.log(df["inc"]), df["lit"])
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate");
Already, the relationship is starting to look more linear! Now, we'll address the vertical scaling.
To reduce the clumping of datapoints near the top of the plot, we want to spread out large values of y without substantially changing small values of y. We can do this by applying a power transformation – that is, by raising the y-values to a power. Below, we raise all y-values to the power of 4.
plt.scatter(np.log(df["inc"]), df["lit"]**4)
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)");
Our transformed variables now seem to follow a linear relationship!
$$y^4 = m(\log{x}) + b$$We can use this fact to uncover new information about the original, untransformed variables.
$$y = [m(\log{x}) + b]^{1/4}$$In the cell below, we first fit a regression line to the transformed data to find values for the slope ($m$) and intercept ($b$). Then, we plug these values into the relationship we derived for the untransformed variables. We find a mathematical relationship relating the gross national income and the adult literacy rate.
# The code below fits a linear regression model. We'll discuss it at length in a future lecture
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(np.log(df[["inc"]]), df["lit"]**4)
m, b = model.coef_[0], model.intercept_
print(f"The slope, m, of the transformed data is: {m}")
print(f"The intercept, b, of the transformed data is: {b}")
df = df.sort_values("inc")
plt.scatter(np.log(df["inc"]), df["lit"]**4, label="Transformed data")
plt.plot(np.log(df["inc"]), m*np.log(df["inc"])+b, c="red", label="Linear regression")
plt.xlabel("Log(gross national income per capita)")
plt.ylabel("Adult literacy rate (4th power)")
plt.legend();
The slope, m, of the transformed data is: 336400693.43172693 The intercept, b, of the transformed data is: -1802204836.0479977
# Now, plug the values for m and b into the relationship between the untransformed x and y
plt.scatter(df["inc"], df["lit"], label="Untransformed data")
plt.plot(df["inc"], (m*np.log(df["inc"])+b)**(1/4), c="red", label="Modeled relationship")
plt.xlabel("Gross national income per capita")
plt.ylabel("Adult literacy rate")
plt.legend();
We've been able to find a fairly close approximation for the relationship between the original variables!
ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
index=pd.Series([2006, 2013],
name="Year"))
ppdf
Cancer | Abortion | |
---|---|---|
Year | ||
2006 | 2007371 | 289750 |
2013 | 935573 | 327000 |
ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");
Let's now compute the relative change between the two years...
rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change
Cancer -53.39312 Abortion 12.85591 Name: Percent Change, dtype: float64
ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");
cps = pd.read_csv("data/edInc2.csv")
cps
educ | gender | income | |
---|---|---|---|
0 | 1 | Men | 517 |
1 | 1 | Women | 409 |
2 | 2 | Men | 751 |
3 | 2 | Women | 578 |
4 | 3 | Men | 872 |
5 | 3 | Women | 661 |
6 | 4 | Men | 1249 |
7 | 4 | Women | 965 |
8 | 5 | Men | 1385 |
9 | 5 | Women | 1049 |
cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
5 | <BA | Women | 661 |
6 | BA | Men | 1249 |
7 | BA | Women | 965 |
8 | >BA | Men | 1385 |
9 | >BA | Women | 1049 |
# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
ax = sns.pointplot(data=cps, x="Education", y="Income", hue="Gender")
ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");
Now, let's compute the income gap as a relative quantity between men and women. Recall that the structure of the dataframe is as follows:
cps.head()
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
This calls for using groupby
by Gender, so that we can separate the data for both genders, and then compute the ratio:
cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", axis="columns")
women = cg.get_group("Women").drop("Gender", axis="columns")
display(men, women)
Income | |
---|---|
Education | |
<HS | 517 |
HS | 751 |
<BA | 872 |
BA | 1249 |
>BA | 1385 |
Income | |
---|---|
Education | |
<HS | 409 |
HS | 578 |
<BA | 661 |
BA | 965 |
>BA | 1049 |
mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio
Income Ratio (M/F) | |
---|---|
Education | |
<HS | 1.264059 |
HS | 1.299308 |
<BA | 1.319213 |
BA | 1.294301 |
>BA | 1.320305 |
ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");
Let's now compute the alternate ratio, F/M instead:
fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio
Income Ratio (F/M) | |
---|---|
Education | |
<HS | 0.791103 |
HS | 0.769640 |
<BA | 0.758028 |
BA | 0.772618 |
>BA | 0.757401 |
ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");
co2 = pd.read_csv("data/CAITcountryCO2.csv", skiprows=2,
names=["Country", "Year", "CO2"], encoding="ISO-8859-1")
co2.tail()
Country | Year | CO2 | |
---|---|---|---|
30639 | Vietnam | 2012 | 173.0497 |
30640 | World | 2012 | 33843.0497 |
30641 | Yemen | 2012 | 20.5386 |
30642 | Zambia | 2012 | 2.7600 |
30643 | Zimbabwe | 2012 | 9.9800 |
last_year = co2.Year.iloc[-1]
last_year
2012
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty
Country | Year | CO2 | |
---|---|---|---|
30490 | China | 2012 | 9312.5329 |
30634 | United States | 2012 | 5122.9094 |
30514 | European Union (28) | 2012 | 3610.5137 |
30533 | India | 2012 | 2075.1808 |
30596 | Russian Federation | 2012 | 1721.5376 |
30541 | Japan | 2012 | 1249.2135 |
30521 | Germany | 2012 | 773.9585 |
30547 | Korea, Rep. (South) | 2012 | 617.2418 |
30535 | Iran | 2012 | 593.8195 |
30485 | Canada | 2012 | 543.0242 |
30603 | Saudi Arabia | 2012 | 480.2278 |
30478 | Brazil | 2012 | 477.7701 |
30633 | United Kingdom | 2012 | 463.4556 |
30567 | Mexico | 2012 | 460.4782 |
top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()
14
Country | Year | CO2 | |
---|---|---|---|
18822 | Brazil | 1950 | 19.6574 |
18829 | Canada | 1950 | 154.1408 |
18834 | China | 1950 | 78.6478 |
18858 | European Union (28) | 1950 | 1773.6864 |
18865 | Germany | 1950 | 510.7323 |
from cycler import cycler
linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)
x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
ax.semilogy(df[x], df[y], label=name)
ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);