Data 100, Summer 2021
Suraj Rampure, with updates by Fernando Pérez.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Apply seaborn's dark-grid theme with 1.5x font scaling and a (7, 5) default figure size.
sns.set_theme(style='darkgrid', font_scale = 1.5,
rc={'figure.figsize':(7,5)})
# Earlier matplotlib-only configuration, left disabled:
#plt.rc('figure', dpi=100, figsize=(7, 5))
#plt.rc('font', size=12)
# Random generator created without a seed; used further down to jitter a scatter plot.
rng = np.random.default_rng()
# Two-row table of procedure counts (columns Cancer and Abortion), indexed by Year.
ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
index=pd.Series([2006, 2013],
name="Year"))
# Bare expression: shows the frame when run as a notebook cell.
ppdf
Cancer | Abortion | |
---|---|---|
Year | ||
2006 | 2007371 | 289750 |
2013 | 935573 | 327000 |
# Plot the raw counts from ppdf as lines, with a marker at each data point.
ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
# Only two years exist in the data, so place ticks exactly on them.
ax.set_xticks([2006, 2013])
# The trailing semicolon keeps the notebook from echoing the call's return value.
ax.set_ylabel("Service count");
Let's now compute the relative change between the two years...
# Percent change from 2006 to 2013 for each column: 100 * (new - old) / old.
rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
# Name the resulting Series so the label appears in its printed output.
rel_change.name = "Percent Change"
rel_change
Cancer     -53.39312
Abortion    12.85591
Name: Percent Change, dtype: float64
# One bar per procedure type, with bar height equal to its percent change.
ax = sns.barplot(x=rel_change.index, y=rel_change)
# Black baseline at zero so increases and decreases are easy to tell apart.
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");
# Load the earnings table from CSV; the rendered output shows columns educ, gender and income.
cps = pd.read_csv("edInc2.csv")
cps
educ | gender | income | |
---|---|---|---|
0 | 1 | Men | 517 |
1 | 1 | Women | 409 |
2 | 2 | Men | 751 |
3 | 2 | Women | 578 |
4 | 3 | Men | 872 |
5 | 3 | Women | 661 |
6 | 4 | Men | 1249 |
7 | 4 | Women | 965 |
8 | 5 | Men | 1385 |
9 | 5 | Women | 1049 |
# Swap the numeric education codes 1-5 for readable labels (only the 'educ' column is touched).
cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
# Rename the three columns, in positional order, to capitalized names.
cps.columns = ['Education', 'Gender', 'Income']
cps
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
5 | <BA | Women | 661 |
6 | BA | Men | 1249 |
7 | BA | Women | 965 |
8 | >BA | Men | 1385 |
9 | >BA | Women | 1049 |
# Choose the two plot colors explicitly (a blue and a red) rather than
# relying on the active theme's default cycle.
blue_red = ["#397eb7", "#bf1518"]

# color_palette() doubles as a context manager: the palette is active only
# while the plot is being drawn.
with sns.color_palette(blue_red):
    ax = sns.pointplot(
        data=cps,
        x="Education",
        y="Income",
        hue="Gender",
    )

ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");
Now, let's compute the income gap as a relative quantity between men and women. Recall that the structure of the dataframe is as follows:
# Peek at the first rows to recall the layout: one row per (Education, Gender) pair.
cps.head()
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
This calls for using `groupby` by `Gender`, so that we can separate the data for the two genders and then compute the ratio:
# Index by education level, then group the rows by gender.
cg = cps.set_index("Education").groupby("Gender")
# Pull out each gender's rows and drop the Gender column, leaving only
# Income indexed by Education.
# The axis is given by keyword (columns=...): since pandas 2.0 every drop()
# argument after `labels` is keyword-only, so the old positional form
# drop("Gender", "columns") raises TypeError there.
men = cg.get_group("Men").drop(columns="Gender")
women = cg.get_group("Women").drop(columns="Gender")
# display() is provided by IPython/Jupyter; it renders both frames.
display(men, women)
Income | |
---|---|
Education | |
<HS | 517 |
HS | 751 |
<BA | 872 |
BA | 1249 |
>BA | 1385 |
Income | |
---|---|
Education | |
<HS | 409 |
HS | 578 |
<BA | 661 |
BA | 965 |
>BA | 1049 |
# Element-wise ratio of men's to women's income. Both frames are indexed by
# Education, so the division lines rows up by education level.
mfratio = men/women
# The single column now holds a ratio rather than an income, so relabel it.
mfratio.columns = ["Income Ratio (M/F)"]
mfratio
Income Ratio (M/F) | |
---|---|
Education | |
<HS | 1.264059 |
HS | 1.299308 |
<BA | 1.319213 |
BA | 1.294301 |
>BA | 1.320305 |
# Plot the M/F ratio across education levels; the legend is turned off.
ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");
Let's now compute the alternate ratio, F/M instead:
# Same comparison with the ratio inverted: women's income divided by men's,
# aligned on the shared Education index.
fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio
Income Ratio (F/M) | |
---|---|
Education | |
<HS | 0.791103 |
HS | 0.769640 |
<BA | 0.758028 |
BA | 0.772618 |
>BA | 0.757401 |
# Plot the F/M ratio across education levels; the legend is turned off.
ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");
# Load the baby dataset; the columns 'Maternal Height' and 'Birth Weight' are plotted below.
df = pd.read_csv('baby.csv')
# Version 1: plain scatter plot with fully opaque markers.
plt.scatter(df['Maternal Height'], df['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# Version 2: same data with alpha = 0.4, so overlapping points look darker.
plt.scatter(df['Maternal Height'], df['Birth Weight'], alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# Version 3: same plot, but passing the frame via data= and naming the columns as strings.
plt.scatter(data=df, x='Maternal Height', y='Birth Weight', alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# Version 4: jitter. Draw two rows of standard-normal noise (one value per
# data row), scale by 1/3, and add one row to each coordinate.
r1, r2 = rng.normal(size=(2, len(df)))/3
plt.scatter(df['Maternal Height'] + r1, df['Birth Weight'] + r2, alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# Five sample points, reused by the rug plot and kernel plots further down.
points = [2.2, 2.8, 3.7, 5.3, 5.7]
# Density-normalized histogram with bin edges 0, 2, 4, 6, 8 and white bar outlines.
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
    """Evaluate a Gaussian kernel centered at ``z`` with bandwidth ``a`` at ``x``.

    This is the normal density with mean ``z`` and standard deviation ``a``:
    ``exp(-(x - z)**2 / (2 a**2)) / sqrt(2 pi a**2)``.

    ``x`` may be a scalar or a NumPy array; the result has the same shape.
    """
    variance = a**2
    normalizer = 1 / np.sqrt(2 * np.pi * variance)
    exponent = -(x - z)**2 / (2 * variance)
    return normalizer * np.exp(exponent)
def boxcar_basic(x, z, a):
    """Scalar boxcar kernel of width ``a`` centered at ``z``.

    Returns ``1/a`` when ``x`` lies within ``a/2`` of ``z`` (boundaries
    included) and ``0`` otherwise. The comparison is used as a plain truth
    value, so ``x`` must be a scalar.
    """
    half_width = a / 2
    inside = np.abs(x - z) <= half_width
    return 1 / a if inside else 0
def boxcar(x, z, a):
    """Vectorized boxcar kernel of width ``a`` centered at ``z``.

    Returns ``1/a`` wherever ``x`` lies within ``a/2`` of ``z`` (boundaries
    included) and ``0.0`` elsewhere.

    ``x`` may be a scalar, a sequence or a NumPy array. The result is a
    floating-point NumPy array of the same shape (0-d for a scalar).
    """
    distance = np.abs(np.asarray(x) - z)
    # np.where is used instead of np.piecewise because piecewise creates its
    # output with the dtype of ``x``: for integer ``x`` the height 1/a was
    # silently truncated to 0. np.where takes the dtype from the two float
    # branch values, so the result is float whatever the dtype of ``x``.
    return np.where(distance <= a / 2, 1 / a, 0.0)
def create_kde(kernel, pts, a):
    """Build a kernel density estimate as a callable.

    Takes a kernel function ``kernel(x, z, a)``, a set of points ``pts`` and
    the bandwidth ``a``. Returns a function ``f(x)`` that evaluates the
    kernel at ``x`` once per point in ``pts`` and averages the results.
    """
    def f(x):
        # Sum one kernel per data point, then normalize by the number of
        # points.
        total = sum(kernel(x, center, a) for center in pts)
        return total / len(pts)

    return f
def plot_kde(kernel, pts, a):
    """Plot the KDE built from ``kernel``, ``pts`` and bandwidth ``a``.

    The estimate comes from ``create_kde`` and is drawn on the current
    matplotlib axes over 1000 evenly spaced values, from 5 below the
    smallest point to 5 above the largest.
    """
    density = create_kde(kernel, pts, a)
    grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    # Evaluate one grid value at a time so that scalar-only kernels also work.
    heights = [density(value) for value in grid]
    plt.plot(grid, heights)
def plot_separate_kernels(kernel, pts, a, norm=False):
    """Plot the individual kernels that are summed to create the KDE.

    One curve is drawn per point in ``pts``, each evaluated on 1000 evenly
    spaced values from 5 below the smallest point to 5 above the largest.
    With ``norm=True`` every curve is divided by the number of points. The
    figure is shown once all curves have been drawn.
    """
    grid = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    n_points = len(pts)
    for center in pts:
        curve = kernel(grid, center, a)
        if norm:
            curve /= n_points
        plt.plot(grid, curve)
    plt.show()
Here are our five points.
# Fix the axis limits explicitly; the kernel plot that follows uses the same limits.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
# Rug plot of the sample points.
sns.rugplot(points, height = 0.5);
We'll start with the Gaussian kernel.
# Same axis limits as the rug plot of the points.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
# One Gaussian curve per sample point with bandwidth a = 1; norm is left at
# its default (False), so the curves are not divided by the number of points.
plot_separate_kernels(gaussian, points, a = 1);