import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#plt.style.use('fivethirtyeight')
sns.set_theme(style='darkgrid', font_scale = 1.5,
rc={'figure.figsize':(7,5)})
displot
¶Seaborn documentation for sns.displot
lets you specify the kind
of plot.
When plotting a histogram, you can pass in histplot
(link) parameters to displot
to specify histogram-specific features.
For example, stat=density
normalizes the histogram such that the area under the histogram is 1.
titanic = sns.load_dataset('titanic')
# sns.displot(titanic["age"], kind="hist"); # default: stat=count
sns.displot(titanic["age"], kind="hist", stat='density'); # density integrates to 1
What does it mean to specify kind=kde
? We will explore this!
titanic = sns.load_dataset('titanic')
sns.displot(titanic["age"], kind = "kde");
# to save the above figure, copy this line to the above cell
# plt.savefig("titanic_displot.png", bbox_inches = "tight", dpi = 300)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
points = [2.2, 2.8, 3.7, 5.3, 5.7]
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
# Gaussian kernel
return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))
def boxcar_basic(x, z, a):
# Boxcar kernel
if np.abs(x - z) <= a/2:
return 1/a
return 0
def boxcar(x, z, a):
# Boxcar kernel
cond = np.abs(x - z)
return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )
def create_kde(kernel, pts, a):
# Takes in a kernel, set aof points, and alpha
# Returns the KDE as a function
def f(x):
output = 0
for pt in pts:
output += kernel(x, pt, a)
return output / len(pts) # Normalization factor
return f
def plot_kde(kernel, pts, a):
# Calls create_kde and plots the corresponding KDE
f = create_kde(kernel, pts, a)
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
y = [f(xi) for xi in x]
plt.plot(x, y);
def plot_separate_kernels(kernel, pts, a, norm=False):
# Plots individual kernels, which are then summed to create the KDE
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
for pt in pts:
y = kernel(x, pt, a)
if norm:
y /= len(pts)
plt.plot(x, y)
plt.show();
Here are our five points.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);
We'll start with the Gaussian kernel.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)
This looks identical to the smooth curve that sns.distplot
gives us (when we set the appropriate parameter):
sns.kdeplot(points, bw_method=0.65) # magic value!
sns.histplot(points, stat='density', bins=2);
You can also get a very similar result in a single call by requesting the KDE be added to the histogram, with kde=True
and some extra keywords:
sns.histplot(points, bins=2, kde=True, stat='density',
kde_kws=dict(cut=3, bw_method=0.65));
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');
Gaussian
$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$Boxcar
$$K_{\alpha}(x, x_i) = \begin {cases} \frac{1}{\alpha}, \: \: \: |x - x_i| \leq \frac{\alpha}{2}\\ 0, \: \: \: \text{else} \end{cases}$$plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)
Let's bring in some (different) toy data.
tips = sns.load_dataset('tips')
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
vals = tips['total_bill']
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)
births = pd.read_csv('data/baby.csv')
births.head()
Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker | |
---|---|---|---|---|---|---|
0 | 120 | 284 | 27 | 62 | 100 | False |
1 | 113 | 282 | 33 | 64 | 135 | False |
2 | 128 | 279 | 28 | 64 | 115 | True |
3 | 108 | 282 | 23 | 67 | 125 | True |
4 | 136 | 286 | 25 | 62 | 93 | False |
plt.scatter(births['Maternal Height'], births['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
Most matplotlib
functions also accept a data=
keyword, and when using this mode, you can then refer to x and y as names of columns in the data
DataFrame, instead of passing the series explicitly:
plt.scatter(data = births, x = 'Maternal Height', y = 'Birth Weight');
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
This is the seaborn version:
sns.scatterplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue = 'Maternal Smoker');
births['Maternal Height (jittered)'] = births['Maternal Height'] + np.random.uniform(-0.2, 0.2, len(births))
fig = sns.scatterplot(data = births, x = 'Maternal Height (jittered)', y = 'Birth Weight', hue = 'Maternal Smoker');
sns.lmplot(data = births, x = 'Maternal Height', y = 'Birth Weight',
ci = False, hue = 'Maternal Smoker');
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight');
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue = 'Maternal Smoker');
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'hex');
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'kde', fill = True);
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'kde', hue = 'Maternal Smoker');
Details and data can be found on Wikipedia.
planets = pd.read_csv("data/planets.data", delim_whitespace=True, comment="#")
planets
planet | mean_dist | period | kepler_ratio | |
---|---|---|---|---|
0 | Mercury | 0.389 | 87.77 | 7.64 |
1 | Venus | 0.724 | 224.70 | 7.52 |
2 | Earth | 1.000 | 365.25 | 7.50 |
3 | Mars | 1.524 | 686.95 | 7.50 |
4 | Jupiter | 5.200 | 4332.62 | 7.49 |
5 | Saturn | 9.510 | 10759.20 | 7.43 |
Returning the ax
Matplotlib axis object lets us specify the titles and labels manually.
ax = sns.scatterplot(x='mean_dist', y='period', data=planets);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');
The data appears to curve up, i.e. is not linear. We can verify this by trying the regplot
function, which fits a line to the data (more on this function in a later lecture).
sns.regplot(x='mean_dist', y='period', data=planets, ci=False);
However, if we plot the log-log relationship, we see a very linear looking relationship.
ax = sns.scatterplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']))
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');
This time if we use regplot
, the plot looks almost perfect.
ax = sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ci=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');
The slope of this line is approximately (9 - 6) / 2 = 1.5.
This suggests that the relationship between $T$ and $R$ is: $$T \propto R^{3/2}.$$
Kepler's third law actually states this in a slightly different format by squaring both sides:
$$ T^2\propto R^3 $$For Kepler this was a data-driven phenomenological law, formulated in 1619. It could only be explained dynamically once Newton introduced his law of universal gravitation in 1687.
ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
index=pd.Series([2006, 2013],
name="Year"))
ppdf
Cancer | Abortion | |
---|---|---|
Year | ||
2006 | 2007371 | 289750 |
2013 | 935573 | 327000 |
ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");
Let's now compute the relative change between the two years...
rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change
Cancer -53.39312 Abortion 12.85591 Name: Percent Change, dtype: float64
ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");
cps = pd.read_csv("data/edInc2.csv")
cps
educ | gender | income | |
---|---|---|---|
0 | 1 | Men | 517 |
1 | 1 | Women | 409 |
2 | 2 | Men | 751 |
3 | 2 | Women | 578 |
4 | 3 | Men | 872 |
5 | 3 | Women | 661 |
6 | 4 | Men | 1249 |
7 | 4 | Women | 965 |
8 | 5 | Men | 1385 |
9 | 5 | Women | 1049 |
cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
5 | <BA | Women | 661 |
6 | BA | Men | 1249 |
7 | BA | Women | 965 |
8 | >BA | Men | 1385 |
9 | >BA | Women | 1049 |
# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
ax = sns.pointplot(data=cps, x = "Education", y = "Income", hue = "Gender")
ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");
Now, let's compute the income gap as a relative quantity between men and women. Recall that the structure of the dataframe is as follows:
cps.head()
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
This calls for using groupby
by Gender, so that we can separate the data for both genders, and then compute the ratio:
cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", axis="columns")
women = cg.get_group("Women").drop("Gender", axis="columns")
display(men, women)
Income | |
---|---|
Education | |
<HS | 517 |
HS | 751 |
<BA | 872 |
BA | 1249 |
>BA | 1385 |
Income | |
---|---|
Education | |
<HS | 409 |
HS | 578 |
<BA | 661 |
BA | 965 |
>BA | 1049 |
mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio
Income Ratio (M/F) | |
---|---|
Education | |
<HS | 1.264059 |
HS | 1.299308 |
<BA | 1.319213 |
BA | 1.294301 |
>BA | 1.320305 |
ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");
Let's now compute the alternate ratio, F/M instead:
fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio
Income Ratio (F/M) | |
---|---|
Education | |
<HS | 0.791103 |
HS | 0.769640 |
<BA | 0.758028 |
BA | 0.772618 |
>BA | 0.757401 |
ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");
co2 = pd.read_csv("data/CAITcountryCO2.csv", skiprows = 2,
names = ["Country", "Year", "CO2"], encoding = "ISO-8859-1")
co2.tail()
Country | Year | CO2 | |
---|---|---|---|
30639 | Vietnam | 2012 | 173.0497 |
30640 | World | 2012 | 33843.0497 |
30641 | Yemen | 2012 | 20.5386 |
30642 | Zambia | 2012 | 2.7600 |
30643 | Zimbabwe | 2012 | 9.9800 |
last_year = co2.Year.iloc[-1]
last_year
2012
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty
Country | Year | CO2 | |
---|---|---|---|
30490 | China | 2012 | 9312.5329 |
30634 | United States | 2012 | 5122.9094 |
30514 | European Union (28) | 2012 | 3610.5137 |
30533 | India | 2012 | 2075.1808 |
30596 | Russian Federation | 2012 | 1721.5376 |
30541 | Japan | 2012 | 1249.2135 |
30521 | Germany | 2012 | 773.9585 |
30547 | Korea, Rep. (South) | 2012 | 617.2418 |
30535 | Iran | 2012 | 593.8195 |
30485 | Canada | 2012 | 543.0242 |
30603 | Saudi Arabia | 2012 | 480.2278 |
30478 | Brazil | 2012 | 477.7701 |
30633 | United Kingdom | 2012 | 463.4556 |
30567 | Mexico | 2012 | 460.4782 |
top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()
14
Country | Year | CO2 | |
---|---|---|---|
18822 | Brazil | 1950 | 19.6574 |
18829 | Canada | 1950 | 154.1408 |
18834 | China | 1950 | 78.6478 |
18858 | European Union (28) | 1950 | 1773.6864 |
18865 | Germany | 1950 | 510.7323 |
from cycler import cycler
linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)
x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
ax.semilogy(df[x], df[y], label=name)
ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);