import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Fix the RNG state so the synthetic sample is reproducible.
np.random.seed(42)
# Bimodal sample of 30 points: a small cluster near 87 and a larger one near 97.
# Draw order (10 then 20) matters for reproducibility under the fixed seed.
low_mode = np.random.randn(10) * 1.4 - 3
high_mode = np.random.randn(20) * 2 + 7
x = np.concatenate([low_mode, high_mode]) + 90
Plots of descriptive statistics communicate key summaries of the data. They do not attempt to infer values that were not observed.
The following rug plot and box plot show key summaries of the data. While the box plot does communicate the spread, it only reveals the location of the median and quartiles.
# Rug + box plot on shared axes: purely descriptive summaries
# (individual observations, median, quartiles, spread).
ax = plt.gca()
sns.rugplot(x, ax=ax)
sns.boxplot(x, ax=ax)
ax.set_xlabel(r"$x$")
ax.set_ylabel("No Meaning")  # the vertical axis carries no information here
plt.savefig("descriptive_plot.pdf", bbox_inches='tight')
Histograms and kernel density estimators attempt to convey information about the shape of the data, and in the process convey information about regions where data is not observed. As a consequence they convey an inference about the shape of the data-generating process:
# Histogram + KDE + rug for the sample.
# NOTE(review): sns.distplot was deprecated in seaborn 0.11 and removed in
# 0.14; histplot(..., kde=True, stat="density") plus an explicit rugplot is
# the documented replacement for distplot(x, rug=True).
sns.histplot(x, kde=True, stat="density")
sns.rugplot(x)
plt.xlabel(r"$x$")
plt.ylabel(r"$\mathbf{p}(x)$")
plt.savefig("everything.pdf", bbox_inches='tight')
The Kernel density estimate is a non-parametric estimator for the distribution of the population based on samples. It is constructed by averaging densities centered at each observed data point.
# Manual kernel density estimate: place a Gaussian "bump" of bandwidth h on
# every observation and average the bumps.
import scipy.stats  # `import scipy` alone does not guarantee scipy.stats is loaded
import scipy as sp

u = np.linspace(np.min(x) - 5, np.max(x) + 5, 100)  # evaluation grid
z = np.zeros(u.shape)   # running sum of the individual kernels
h = 2                   # bandwidth: std. dev. of each Gaussian bump
for v in x:
    # Kernel centered at the observation v, evaluated on the grid.
    y = sp.stats.norm.pdf((u - v), scale=h)
    plt.plot(u, y, 'y')  # draw each individual bump in yellow
    z += y
# The KDE is the average of the bumps (sum divided by the sample size).
plt.plot(u, z / len(x), 'k', linewidth=5)
sns.rugplot(x)
plt.xlabel(r"$x$")
plt.savefig("sum_of_bumps.pdf", bbox_inches='tight')
There are a range of possible kernel functions that can be used in a kernel density estimator. The choice of function depends on your assumptions about the smoothness of the data. The Gaussian kernel is widely used because it is simple to understand and results in very smooth curves. Conversely, the boxcar kernel (square function) results in very jagged curves that more closely approximate a histogram. The Epanechnikov kernel enjoys some additional statistical guarantees and is a reasonable compromise between Gaussian and boxcar kernels.
import scipy as sp

# Compare common smoothing-kernel shapes as functions of the radius r.
u = np.linspace(-10, +10, 200)

# Gaussian kernels at three bandwidths (scale=1 is the norm.pdf default).
for scale in (1, 2, 3):
    plt.plot(u, sp.stats.norm.pdf(u, scale=scale),
             label=r"Gaussian $\alpha = " + str(scale) + "$")

# Indicator of the compact support [-1, 1] shared by the next two kernels.
support = (u > -1) & (u < 1)
plt.plot(u, support / 2., label=r"Boxcar $\alpha = 2$")
plt.plot(u, support * (3. / 4. * (1 - u**2)),
         label="\nEpanechnikov \n $\\frac{3}{4}(1-r^2)$", linewidth=3)

plt.legend(fontsize=12)
plt.xlabel(r"$r$")
plt.savefig("kernels.pdf", bbox_inches='tight')
# NOTE(review): sklearn.neighbors.kde was a private module removed in
# scikit-learn 0.22+; KernelDensity is importable from sklearn.neighbors.
from sklearn.neighbors import KernelDensity

# Epanechnikov KDE at several bandwidths: wider bandwidth -> smoother curve.
u = np.linspace(np.min(x) - 5, np.max(x) + 5, 200)  # hoisted: loop-invariant grid
for bw in [1, 2, 3, 5]:
    kde = KernelDensity(kernel="epanechnikov", bandwidth=bw)
    kde.fit(x[:, np.newaxis])  # sklearn expects a 2-D (n_samples, 1) array
    # score_samples returns log-density; exponentiate to plot the density.
    plt.plot(u, np.exp(kde.score_samples(u[:, np.newaxis])),
             label=r"$\alpha=" + str(bw) + "$")
plt.legend()
plt.title("Epanechnikov KDE at Different Bandwidths")
plt.xlabel(r"$x$")
plt.savefig("epanechnikov_kde.pdf", bbox_inches='tight')
# A large noisy sample: y = 1.5*sin(x2) + Gaussian noise over 5000 points.
# Plotting every point produces severe overplotting.
np.random.seed(43)
x2 = 2 * np.random.randn(5000)
noise = 2 * np.random.randn(len(x2))
y = 1.5 * np.sin(x2) + noise
plt.plot(x2, y, "o")
plt.savefig("overplotted.png", bbox_inches='tight', dpi=300)
from sklearn.kernel_ridge import KernelRidge

# RBF kernel-ridge regression used as a kernel smoother for the scatter.
kreg = KernelRidge(alpha=10, kernel="rbf")
kreg.fit(x2[:, np.newaxis], y)

# Plot: faded raw data underneath, smoothed prediction curve on top.
u = np.linspace(x2.min(), x2.max(), 300)
plt.plot(x2, y, "o", alpha=0.1)
plt.plot(u, kreg.predict(u[:, np.newaxis]))
plt.savefig("kernel_smoothed_alpha.png", bbox_inches='tight', dpi=300)
# Illustrate the smoother's local weighting: a Gaussian window centered at
# the query point x = 2, overlaid on the data and the fitted curve.
u = np.linspace(x2.min() - 1, x2.max() + 1, 300)
z = sp.stats.norm.pdf(u - 2, scale=0.4)  # kernel weights around the query point
plt.plot(x2, y, "o", alpha=0.1)
plt.plot(u, kreg.predict(u[:, np.newaxis]))
# Reuse z (the original recomputed the identical pdf a second time here);
# *3 only rescales the curve for visibility.
plt.plot(u, z * 3, label=r"Gaussian", color=[0.5, 1.0, .5])
plt.plot([2], [0], 'xk')  # mark the query location on the x-axis
plt.savefig("kernel_smoothed_alpha2.png", bbox_inches='tight', dpi=300)
# Same figure, but encode the kernel weight as per-point alpha so that
# points near the query x = 2 stand out.
u = np.linspace(x2.min() - 1, x2.max() + 1, 300)
colors = np.zeros((len(x2), 4))  # one RGBA row per data point
# Alpha proportional to the kernel weight, floored at 0.01 and capped at 0.8.
# np.minimum with a broadcast scalar replaces the original
# np.min([..., np.ones(y.shape) - 0.2], axis=0) — identical elementwise result.
colors[:, 3] = np.minimum(sp.stats.norm.pdf(x2 - 2, scale=0.4) / 4 + 0.01, 0.8)
colors[:, 2] = 0.8  # constant blue channel
plt.scatter(x2, y, color=colors)
plt.plot(u, kreg.predict(u[:, np.newaxis]))
plt.plot(u, sp.stats.norm.pdf(u - 2, scale=0.4) * 3, label=r"Gaussian", color=[0.5, 1.0, .5])
plt.plot([2], [0], 'xk')
# NOTE(review): this overwrites kernel_smoothed_alpha2.png saved by the
# previous figure — likely meant to be a distinct filename; confirm intent.
plt.savefig("kernel_smoothed_alpha2.png", bbox_inches='tight', dpi=300)