Data 100, Fall 2020
Suraj Rampure, with updates by Fernando Pérez.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style='darkgrid', font_scale = 1.5,
rc={'figure.figsize':(7,5)})
#plt.rc('figure', dpi=100, figsize=(7, 5))
#plt.rc('font', size=12)
rng = np.random.default_rng()
ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
index=pd.Series([2006, 2013],
name="Year"))
ppdf
ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");
Let's now compute the relative change between the two years...
rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change
ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");
cps = pd.read_csv("edInc2.csv")
cps
cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps
# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
ax = sns.pointplot(data=cps, x = "Education", y = "Income", hue = "Gender")
ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");
Now, let's compute the income gap as a relative quantity between men and women. Recall that the structure of the dataframe is as follows:
cps.head()
This calls for using groupby
by Gender, so that we can separate the data for both genders, and then compute the ratio:
cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", "columns")
women = cg.get_group("Women").drop("Gender", "columns")
display(men, women)
mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio
ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");
Let's now compute the alternate ratio, F/M instead:
fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio
ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");
df = pd.read_csv('baby.csv')
plt.scatter(df['Maternal Height'], df['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.scatter(df['Maternal Height'], df['Birth Weight'], alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.scatter(data=df, x='Maternal Height', y='Birth Weight', alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
r1, r2 = rng.normal(size=(2, len(df)))/3
plt.scatter(df['Maternal Height'] + r1, df['Birth Weight'] + r2, alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
points = [2.2, 2.8, 3.7, 5.3, 5.7]
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
# Gaussian kernel
return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))
def boxcar_basic(x, z, a):
# Boxcar kernel
if np.abs(x - z) <= a/2:
return 1/a
return 0
def boxcar(x, z, a):
# Boxcar kernel
cond = np.abs(x - z)
return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )
def create_kde(kernel, pts, a):
# Takes in a kernel, set aof points, and alpha
# Returns the KDE as a function
def f(x):
output = 0
for pt in pts:
output += kernel(x, pt, a)
return output / len(pts) # Normalization factor
return f
def plot_kde(kernel, pts, a):
# Calls create_kde and plots the corresponding KDE
f = create_kde(kernel, pts, a)
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
y = [f(xi) for xi in x]
plt.plot(x, y);
def plot_separate_kernels(kernel, pts, a, norm=False):
# Plots individual kernels, which are then summed to create the KDE
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
for pt in pts:
y = kernel(x, pt, a)
if norm:
y /= len(pts)
plt.plot(x, y)
plt.show();
Here are our five points.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);
We'll start with the Gaussian kernel.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)
This looks identical to the smooth curve that sns.distplot
gives us (when we set the appropriate parameter):
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.distplot(points, kde_kws={'bw': 1});
sns.kdeplot(points)
sns.histplot(points, stat='density');
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');
Gaussian
$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$Boxcar
$$K_{\alpha}(x, x_i) = \begin {cases} \frac{1}{\alpha}, \: \: \: |x - x_i| \leq \frac{\alpha}{2}\\ 0, \: \: \: \text{else} \end{cases}$$plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)
Let's bring in some (different) toy data.
tips = sns.load_dataset('tips')
tips.head()
vals = tips['total_bill']
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)
co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2,
names = ["Country", "Year", "CO2"])
co2.tail()
last_year = co2.Year.iloc[-1]
last_year
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty
top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()
from cycler import cycler
linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)
x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
ax.semilogy(df[x], df[y], label=name)
ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);
x = np.linspace(-3, 3)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))
# x
ax1.plot(x, x)
ax1.set_title('$y=x$')
# powers
ax2.plot(x, x**2, label='$x^2$')
ax2.plot(x, x**3, label='$x^3$')
ax2.legend()
ax2.set_title('$y=x^2$, $y=x^3$')
# log
xpos = x[x>0] # Log is only defined for positive x
ax3.plot(xpos, np.log(xpos))
ax3.set_title(r'$y=\log(x)$')
# exp
ax4.plot(x, np.exp(x))
ax4.set_title('$y=e^x$');
plt.tight_layout();
Let's generate data that follows $y = 2x^3 + \epsilon$, where $\epsilon$ is zero-mean noise. Note that given the functional form of $y$, if we simply draw $\epsilon \sim \mathcal{N}(0,1)$, it will be insignificant for higher values of $x$ (in the range we'll look, $[1..10]$). So we will make $\epsilon \sim x^2\mathcal{N}(0,1)$ so that the noise is present for all values of $x$ and $y$.
x = np.linspace(1, 10, 20)
eps = rng.normal(size=len(x))
y = (2+eps)*x**3
y = 2 * x**3 + x**2*eps
plt.scatter(x, y);
The bulge diagram says to raise $x$ to a power, or to take the log of $y$.
First, let's raise $x$ to a power:
plt.scatter(x**2, y);
We used $x^2$ as the transformation. It's better, but still not linear. Let's try $x^3$.
plt.scatter(x**3, y);
That worked well, which makes sense: the original data was cubic in $x$. We can overdo it, too: let's try $x^5$.
plt.scatter(x**5, y);
Now, the data follows some sort of square root relationship. It's certainly not linear; this goes to show that not all power transformations work the same way, and you'll need some experimentation.
Let's instead try taking the log of y from the original data.
plt.scatter(x, np.log(y));
On it's own, this didn't quite work! Since $y = 2x^3$, $\log(y) = \log(2) + 3\log(x)$.
That means we are essentially plotting plt.scatter(x, np.log(x))
, which is not linear.
In order for this to be linear, we need to take the log of $x$ as well:
plt.scatter(np.log(x), np.log(y));
The relationship being visualized now is
$$\log(y) = \log(2) + 3 \log(x)$$Details and data can be found on Wikipedia.
planets = pd.read_csv("planets.data", delim_whitespace=True, comment="#")
planets
ax = sns.regplot(x='mean_dist', y='period', data=planets, fit_reg=False);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');
ax = sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), fit_reg=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');
In fact, Kepler's law actually states that:
$$ T^2\propto R^3 $$For Kepler this was a data-driven phenomenological law, formulated in 1619. It could only be explained dynamically once Newton introduced his law of universal gravitation in 1687.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sns.regplot(x='mean_dist', y='period', data=planets, ax=ax1, ci=False);
sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ax=ax2);
ax2.set_xlabel('Log(mean_dist)')
ax2.set_ylabel('Log(period)')
ax2.relim()
ax2.autoscale_view()
fig.suptitle("Kepler's third law of planetary motion");