import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
df = pd.read_csv('baby.csv')
plt.figure(figsize=(8, 8))
plt.scatter(df['Maternal Height'], df['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.figure(figsize=(8, 8))
# Transparency (alpha) reveals overplotting: darker regions hold more points
plt.scatter(df['Maternal Height'], df['Birth Weight'], alpha=0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.figure(figsize=(8, 8))
# Jitter both coordinates with small Gaussian noise so identical
# (height, weight) pairs no longer plot on top of one another
r1 = np.random.randn(len(df)) / 3
r2 = np.random.randn(len(df)) / 3
plt.scatter(df['Maternal Height'] + r1, df['Birth Weight'] + r2, alpha=0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
points = [2.2, 2.8, 3.7, 5.3, 5.7]
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
    # Gaussian kernel
    return (1/np.sqrt(2*np.pi*a**2)) * np.e ** (-(x - z)**2 / (2 * a**2))

def boxcar(x, z, a):
    # Boxcar kernel
    if np.abs(x - z) <= a/2:
        return 1/a
    return 0
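As a quick spot check of these definitions (our addition, not part of the original demo): at its center, the Gaussian kernel should evaluate to $\frac{1}{\sqrt{2\pi\alpha^2}}$ and the boxcar to $\frac{1}{\alpha}$.
# Spot check (ours): kernel heights at the center point, with alpha = 1
print(gaussian(0, 0, 1))  # 1/sqrt(2*pi), roughly 0.3989
print(boxcar(0, 0, 1))    # 1/1 = 1, since |0 - 0| <= 1/2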
def create_kde(kernel, pts, a):
    # Takes in a kernel, set of points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f
def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);
def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        if norm:
            y = [(1/len(pts)) * kernel(xi, pt, a) for xi in x]
        else:
            y = [kernel(xi, pt, a) for xi in x]
        plt.plot(x, y)
    plt.show();
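Since a KDE is supposed to be a probability density, it should integrate to (approximately) 1. Here is a quick numerical sanity check of create_kde (our addition), using the trapezoidal rule:
# Sanity check (ours): the KDE should integrate to roughly 1
f = create_kde(gaussian, points, 1)
xs = np.linspace(min(points) - 10, max(points) + 10, 1000)
print(np.trapz([f(xi) for xi in xs], xs))  # ~1.0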
Here are our five points.
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);
We'll start with the Gaussian kernel.
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)
This looks identical to the smooth curve that sns.distplot gives us (when we set the appropriate bandwidth parameter):
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.distplot(points, kde_kws={'bw': 1});
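Note that in seaborn 0.11 and later, sns.distplot is deprecated. A rough modern equivalent is sketched below; bw_method is passed to scipy's gaussian_kde, so the smoothing is similar but not numerically identical to the old kde_kws={'bw': 1} setting.
# Sketch for seaborn >= 0.11, where distplot is deprecated (ours)
# bw_method goes to scipy.stats.gaussian_kde, so this approximates,
# but does not exactly reproduce, the old kde_kws={'bw': 1} behavior
sns.kdeplot(points, bw_method=1);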
Gaussian

$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$

Boxcar

$$K_{\alpha}(x, x_i) = \begin{cases} \frac{1}{\alpha}, & |x - x_i| \leq \frac{\alpha}{2} \\ 0, & \text{else} \end{cases}$$

plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)
plt.figure(figsize=(8, 5))
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)
Let's bring in some (different) toy data.
tips = sns.load_dataset('tips')
tips.head()
vals = tips['total_bill']
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 10)
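The four cells above differ only in the bandwidth $\alpha$. As a compact alternative (our sketch, not part of the original demo), the same sweep can be written as a loop:
# Sketch (ours): sweep several bandwidths over the tips data in one figure
plt.figure(figsize=(16, 10))
for i, a in enumerate([0.1, 1, 2, 10]):
    plt.subplot(2, 2, i + 1)
    plt.title(rf'KDE of tips with Gaussian kernel and $\alpha$ = {a}')
    plot_kde(gaussian, vals, a)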
Let's generate data that follows $y = 2x^3$.
# 20 x-values spread over [1, 10], each with a little uniform noise added
x = np.array([t + np.random.random() for t in np.linspace(1, 10, 20)])
y = 2 * x**3
plt.scatter(x, y);
The bulge diagram says to raise $x$ to a power, or to take the log of $y$.
First, let's raise $x$ to a power:
plt.scatter(x**2, y);
We used $x^2$ as the transformation. It's better, but still not linear. Let's try $x^3$.
plt.scatter(x**3, y);
That worked well, which makes sense: the original data was cubic in $x$. We can overdo it, too: let's try $x^5$.
plt.scatter(x**5, y);
Now the data follow a root-like curve, which makes sense: since $y = 2x^3$, plotting $y$ against $x^5$ gives $y = 2(x^5)^{3/5}$, a power law with exponent $3/5 < 1$. It's certainly not linear; this goes to show that not all power transformations work the same way, and you'll need some experimentation.
Let's instead return to the original data and take the log of $y$.
plt.scatter(x, np.log(y));
On its own, this didn't quite work! Since $y = 2x^3$, we have $\log(y) = \log(2) + 3\log(x)$. That means we are essentially plotting plt.scatter(x, np.log(x)), which is not linear.
In order for this to be linear, we need to take the log of $x$ as well:
plt.scatter(np.log(x), np.log(y));
The relationship being visualized now is
$$\log(y) = \log(2) + 3 \log(x)$$
which is linear in $\log(x)$, with slope $3$ and intercept $\log(2)$.
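As a numerical check (our addition), fitting a line to the log-log data should recover a slope of about $3$ and an intercept of about $\log(2) \approx 0.693$:
# Check (ours): np.polyfit returns [slope, intercept] for degree 1
slope, intercept = np.polyfit(np.log(x), np.log(y), 1)
print(slope, intercept)  # roughly 3.0 and 0.693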