Data 100, Summer 2021
Suraj Rampure, with updates by Fernando Pérez.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_theme(style='darkgrid', font_scale = 1.5,
rc={'figure.figsize':(7,5)})
#plt.rc('figure', dpi=100, figsize=(7, 5))
#plt.rc('font', size=12)
rng = np.random.default_rng()
ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]),
index=pd.Series([2006, 2013],
name="Year"))
ppdf
Cancer | Abortion | |
---|---|---|
Year | ||
2006 | 2007371 | 289750 |
2013 | 935573 | 327000 |
ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");
Let's now compute the relative change between the two years...
rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change
Cancer -53.39312 Abortion 12.85591 Name: Percent Change, dtype: float64
ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");
cps = pd.read_csv("edInc2.csv")
cps
educ | gender | income | |
---|---|---|---|
0 | 1 | Men | 517 |
1 | 1 | Women | 409 |
2 | 2 | Men | 751 |
3 | 2 | Women | 578 |
4 | 3 | Men | 872 |
5 | 3 | Women | 661 |
6 | 4 | Men | 1249 |
7 | 4 | Women | 965 |
8 | 5 | Men | 1385 |
9 | 5 | Women | 1049 |
cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
5 | <BA | Women | 661 |
6 | BA | Men | 1249 |
7 | BA | Women | 965 |
8 | >BA | Men | 1385 |
9 | >BA | Women | 1049 |
# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
ax = sns.pointplot(data=cps, x = "Education", y = "Income", hue = "Gender")
ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");
Now, let's compute the income gap as a relative quantity between men and women. Recall that the structure of the dataframe is as follows:
cps.head()
Education | Gender | Income | |
---|---|---|---|
0 | <HS | Men | 517 |
1 | <HS | Women | 409 |
2 | HS | Men | 751 |
3 | HS | Women | 578 |
4 | <BA | Men | 872 |
This calls for using groupby
by Gender, so that we can separate the data for both genders, and then compute the ratio:
cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", "columns")
women = cg.get_group("Women").drop("Gender", "columns")
display(men, women)
Income | |
---|---|
Education | |
<HS | 517 |
HS | 751 |
<BA | 872 |
BA | 1249 |
>BA | 1385 |
Income | |
---|---|
Education | |
<HS | 409 |
HS | 578 |
<BA | 661 |
BA | 965 |
>BA | 1049 |
mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio
Income Ratio (M/F) | |
---|---|
Education | |
<HS | 1.264059 |
HS | 1.299308 |
<BA | 1.319213 |
BA | 1.294301 |
>BA | 1.320305 |
ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");
Let's now compute the alternate ratio, F/M instead:
fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio
Income Ratio (F/M) | |
---|---|
Education | |
<HS | 0.791103 |
HS | 0.769640 |
<BA | 0.758028 |
BA | 0.772618 |
>BA | 0.757401 |
ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");
df = pd.read_csv('baby.csv')
plt.scatter(df['Maternal Height'], df['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.scatter(df['Maternal Height'], df['Birth Weight'], alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
plt.scatter(data=df, x='Maternal Height', y='Birth Weight', alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
r1, r2 = rng.normal(size=(2, len(df)))/3
plt.scatter(df['Maternal Height'] + r1, df['Birth Weight'] + r2, alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
points = [2.2, 2.8, 3.7, 5.3, 5.7]
plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);
Let's define some kernels. We will explain these formulas momentarily. We'll also define some helper functions for visualization purposes.
def gaussian(x, z, a):
# Gaussian kernel
return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))
def boxcar_basic(x, z, a):
# Boxcar kernel
if np.abs(x - z) <= a/2:
return 1/a
return 0
def boxcar(x, z, a):
# Boxcar kernel
cond = np.abs(x - z)
return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )
def create_kde(kernel, pts, a):
# Takes in a kernel, set aof points, and alpha
# Returns the KDE as a function
def f(x):
output = 0
for pt in pts:
output += kernel(x, pt, a)
return output / len(pts) # Normalization factor
return f
def plot_kde(kernel, pts, a):
# Calls create_kde and plots the corresponding KDE
f = create_kde(kernel, pts, a)
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
y = [f(xi) for xi in x]
plt.plot(x, y);
def plot_separate_kernels(kernel, pts, a, norm=False):
# Plots individual kernels, which are then summed to create the KDE
x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
for pt in pts:
y = kernel(x, pt, a)
if norm:
y /= len(pts)
plt.plot(x, y)
plt.show();
Here are our five points.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);
We'll start with the Gaussian kernel.
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)
This looks identical to the smooth curve that sns.distplot
gives us (when we set the appropriate parameter):
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.distplot(points, kde_kws={'bw': 1});
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning) /opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:1657: FutureWarning: The `bw` parameter is deprecated in favor of `bw_method` and `bw_adjust`. Using 1 for `bw_method`, but please see the docs for the new parameters and update your code. warnings.warn(msg, FutureWarning)
sns.kdeplot(points)
sns.histplot(points, stat='density');
sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');
Gaussian
$$K_{\alpha}(x, x_i) = \frac{1}{\sqrt{2 \pi \alpha^2}} e^{-\frac{(x - x_i)^2}{2\alpha^2}}$$Boxcar
$$K_{\alpha}(x, x_i) = \begin {cases} \frac{1}{\alpha}, \: \: \: |x - x_i| \leq \frac{\alpha}{2}\\ 0, \: \: \: \text{else} \end{cases}$$plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)
plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)
Let's bring in some (different) toy data.
tips = sns.load_dataset('tips')
tips.head()
total_bill | tip | sex | smoker | day | time | size | |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
vals = tips['total_bill']
ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);
plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)
plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)
co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2,
names = ["Country", "Year", "CO2"])
co2.tail()
--------------------------------------------------------------------------- UnicodeDecodeError Traceback (most recent call last) <ipython-input-41-0c57752da17d> in <module> ----> 1 co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2, 2 names = ["Country", "Year", "CO2"]) 3 co2.tail() /opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 603 kwds.update(kwds_defaults) 604 --> 605 return _read(filepath_or_buffer, kwds) 606 607 /opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds) 455 456 # Create the parser. --> 457 parser = TextFileReader(filepath_or_buffer, **kwds) 458 459 if chunksize or iterator: /opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds) 812 self.options["has_index_names"] = kwds["has_index_names"] 813 --> 814 self._engine = self._make_engine(self.engine) 815 816 def close(self): /opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine) 1043 ) 1044 # error: Too many arguments for "ParserBase" -> 1045 return mapping[engine](self.f, **self.options) # type: ignore[call-arg] 1046 1047 def _failover_to_python(self): /opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds) 1891 1892 try: -> 1893 self._reader = parsers.TextReader(self.handles.handle, **kwds) 1894 except Exception: 1895 self.handles.close() pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__() pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._get_header() pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows() pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error() UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 107: invalid start byte
last_year = co2.Year.iloc[-1]
last_year
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty
top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()
from cycler import cycler
linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)
x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
ax.semilogy(df[x], df[y], label=name)
ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);
x = np.linspace(-3, 3)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))
# x
ax1.plot(x, x)
ax1.set_title('$y=x$')
# powers
ax2.plot(x, x**2, label='$x^2$')
ax2.plot(x, x**3, label='$x^3$')
ax2.legend()
ax2.set_title('$y=x^2$, $y=x^3$')
# log
xpos = x[x>0] # Log is only defined for positive x
ax3.plot(xpos, np.log(xpos))
ax3.set_title(r'$y=\log(x)$')
# exp
ax4.plot(x, np.exp(x))
ax4.set_title('$y=e^x$');
plt.tight_layout();
Let's generate data that follows $y = 2x^3 + \epsilon$, where $\epsilon$ is zero-mean noise. Note that given the functional form of $y$, if we simply draw $\epsilon \sim \mathcal{N}(0,1)$, it will be insignificant for higher values of $x$ (in the range we'll look, $[1..10]$). So we will make $\epsilon \sim x^2\mathcal{N}(0,1)$ so that the noise is present for all values of $x$ and $y$.
x = np.linspace(1, 10, 20)
eps = rng.normal(size=len(x))
y = (2+eps)*x**3
y = 2 * x**3 + x**2*eps
plt.scatter(x, y);
The bulge diagram says to raise $x$ to a power, or to take the log of $y$.
First, let's raise $x$ to a power:
plt.scatter(x**2, y);
We used $x^2$ as the transformation. It's better, but still not linear. Let's try $x^3$.
plt.scatter(x**3, y);
That worked well, which makes sense: the original data was cubic in $x$. We can overdo it, too: let's try $x^5$.
plt.scatter(x**5, y);
Now, the data follows some sort of square root relationship. It's certainly not linear; this goes to show that not all power transformations work the same way, and you'll need some experimentation.
Let's instead try taking the log of y from the original data.
plt.scatter(x, np.log(y));
On it's own, this didn't quite work! Since $y = 2x^3$, $\log(y) = \log(2) + 3\log(x)$.
That means we are essentially plotting plt.scatter(x, np.log(x))
, which is not linear.
In order for this to be linear, we need to take the log of $x$ as well:
plt.scatter(np.log(x), np.log(y));
The relationship being visualized now is
$$\log(y) = \log(2) + 3 \log(x)$$Details and data can be found on Wikipedia.
planets = pd.read_csv("planets.data", delim_whitespace=True, comment="#")
planets
ax = sns.regplot(x='mean_dist', y='period', data=planets, fit_reg=False);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');
ax = sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), fit_reg=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');
In fact, Kepler's law actually states that:
$$ T^2\propto R^3 $$For Kepler this was a data-driven phenomenological law, formulated in 1619. It could only be explained dynamically once Newton introduced his law of universal gravitation in 1687.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sns.regplot(x='mean_dist', y='period', data=planets, ax=ax1, ci=False);
sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ax=ax2);
ax2.set_xlabel('Log(mean_dist)')
ax2.set_ylabel('Log(period)')
ax2.relim()
ax2.autoscale_view()
fig.suptitle("Kepler's third law of planetary motion");