%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
Change some of matplotlib's plotting defaults for better class presentation and set a nice Seaborn style.
# Pick the white Seaborn style sheet under whichever name this matplotlib
# provides: "seaborn-white" was renamed "seaborn-v0_8-white" in matplotlib 3.6
# and the old name was dropped in later releases.
white_style = next(
    (name for name in ("seaborn-v0_8-white", "seaborn-white")
     if name in plt.style.available),
    "seaborn-white",
)
plt.style.use(white_style)
# Larger figures and fonts so plots stay readable when projected in class.
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)
# Whitespace-delimited table.  sep=r"\s+" is the documented equivalent of the
# deprecated delim_whitespace=True flag and works on old and new pandas alike.
kdf = pd.read_csv("data/babies.data", sep=r"\s+")
kdf.head()
# Distribution of the `bwt` column, drawn without rug marks.
ax = sns.distplot(kdf["bwt"], rug=False)
ax.set(
    xlabel="Birth weight in ounces",
    title="Normalized birth weight distribution of babies",
);
# Whitespace-delimited table.  sep=r"\s+" is the documented equivalent of the
# deprecated delim_whitespace=True flag and works on old and new pandas alike.
kdf2 = pd.read_csv("data/babies23.data", sep=r"\s+")
kdf2.head()
kdf2.columns
pp = pd.read_csv("data/plannedparenthood.csv")
pp
# Share of each service within a year: screenings as a percentage of
# screenings plus abortions, and the complement for abortions.
pp["perScreen"] = pp["screening"].mul(100).div(pp["screening"].add(pp["abortion"]))
pp["perAbort"] = pp["perScreen"].rsub(100)
pp
# Raw counts per year, one marked line per service column.
for column, service in (("screening", "Cancer"), ("abortion", "Abortion")):
    plt.plot(pp["year"], pp[column], linestyle="solid", marker="o", label=service)
plt.xlabel("Year")
plt.ylabel("Service")
# Only the two observed years are meaningful tick positions.
plt.xticks([2006, 2013])
plt.legend();
# Each service as a percentage of the yearly total (perScreen / perAbort).
plt.plot(pp['year'], pp['perScreen'], linestyle="solid", marker="o", label='Cancer')
plt.plot(pp['year'], pp['perAbort'], linestyle="solid", marker="o", label='Abortion')
plt.ylabel("Percentage of Annual Service")
plt.xlabel("Year")
plt.xticks([2006, 2013])
# Both lines carry labels; without a legend the two series cannot be told apart.
plt.legend();
pp
# Long-format table with one row per (procedure, year) pair, which is the
# layout the Seaborn categorical plots below consume.
year = [2006, 2013, 2006, 2013]
proc = ['abortion', 'abortion', 'cancer', 'cancer']
perc = [
    12.613615,  # abortion, 2006
    25.899493,  # abortion, 2013
    87.386385,  # cancer, 2006
    74.100507,  # cancer, 2013
]
df = pd.DataFrame({"year": year, "proc": proc, "perc": perc})
df
# Grouped bars: one group per procedure, one bar per year.
ax = sns.barplot(data=df, x="proc", y="perc", hue="year")
ax.set(
    xlabel='Procedure',
    ylabel='Percentage of annual services',
    title='Planned Parenthood services',
);
# sns.set_style("white")
# One point-plot panel per procedure, percentage against year.
# `factorplot` was renamed `catplot` in seaborn 0.9 and later removed, and
# catplot's default kind is "strip", so request the point plot explicitly.
if hasattr(sns, "catplot"):
    sns.catplot(x="year", y="perc", col="proc", data=df, aspect=1, kind="point")
else:
    sns.factorplot(x="year", y="perc", col="proc", data=df, aspect=1)
sns.despine(bottom=True, left=True)
# Strip plot of percentage against year, one facet row per procedure.
# FacetGrid.map returns the grid itself, so `g` is still the FacetGrid.
g = sns.FacetGrid(data=df, row="proc").map(sns.stripplot, "perc", "year")
# The same values on a single axes: procedures on the y axis, colored by year.
sns.stripplot(data=df, x="perc", y="proc", hue="year")
cps = pd.read_csv("data/edInc2.csv")
cps
# Point plot of income by education level, one line per gender.
sns.set(style="whitegrid")
blue_red = ["#397eb7", "#bf1518"]
# The palette only needs to be active while the plot is drawn.
with sns.color_palette(blue_red):
    ax = sns.pointplot(data=cps, x="educ", y="income", hue="gender")
# Replace the default tick labels with short education-level names.
ticks = ["<HS", "HS", "<BA", "BA", ">BA"]
ax.set_xticklabels(ticks)
ax.set(
    xlabel="Education",
    ylabel="Income",
    title="2014 Median Weekly Earnings\nFull-Time Workers over 25 years old",
);
# Load the Cherry Blossom men's data and peek at both ends of the table.
cb = pd.read_csv("data/cherryBlossomMen.csv")
cb.head()
cb.tail()
# TODO: smooth run time as a function of age for 1999 and for 2012,
# then plot the smoothed curves.
# See page 52 of the old VIZ pptx.
# Column headers are stripped of surrounding whitespace so the columns can
# be addressed by their bare names.
vote = pd.read_csv("data/voteCA2016.csv").rename(columns=str.strip)
vote.head()
# Time series plot of percent by year.
# The two party columns are overlaid on one axes rather than reshaping the data.
# See page 64 of the old Viz PPTX.
for column, party in (("dem", "Democratic"), ("rep", "Republican")):
    plt.plot(vote["year"], vote[column], label=party)
plt.legend()
plt.savefig("Final.pdf")
# The same overlay through the DataFrame plotting accessor.
vote.plot.line(x='year', y=['dem', 'rep'])
# vote.iplot(kind='line', x='year', y=['dem', 'rep'])
# Skip the first two lines of the file and supply the column names explicitly.
co2 = pd.read_csv(
    "data/CAITcountryCO2.csv",
    skiprows=2,
    names=["Country", "Year", "CO2"],
)
co2.tail()
# Year recorded in the final row of the table.
last_year = co2["Year"].iat[-1]
last_year
# Rank the rows for that year by CO2, leaving out the 'World' and
# 'European Union (15)' entries, and keep the 14 largest.
q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values("CO2", ascending=False).head(14)
top14_lasty
# All rows from 1950 onward for the countries selected above.
top14 = co2[co2["Country"].isin(top14_lasty["Country"]) & (co2["Year"] >= 1950)]
print(top14["Country"].nunique(dropna=False))
top14.head()
from cycler import cycler

# 2 line styles x 7 colors = 14 distinct (linestyle, color) pairs, one per
# country.  Adding two 7-long cyclers pairs them element-wise and yields only
# 7 styles, so every style would be reused by a second country; multiplying
# the cyclers takes their product instead.
linestyles = ['-', '--']
colors = plt.cm.Dark2.colors[:7]
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(lines_c * color_c)

x, y = 'Year', 'CO2'
# The loop variable is deliberately not named `df`, so the `df` frame built
# earlier in the notebook is not overwritten.
for name, country_df in top14.groupby('Country'):
    # Log-scaled y axis, one line per country.
    ax.semilogy(country_df[x], country_df[y], label=name)
ax.set_xlabel(x)
# Space between the column name and the rest of the label ("CO2 Emissions ...").
ax.set_ylabel(f"{y} Emissions [Million Tons]")
ax.legend(ncol=2, frameon=True);
Details and data can be found on Wikipedia.
# Whitespace-delimited table with '#' comment lines.  sep=r"\s+" is the
# documented equivalent of the deprecated delim_whitespace=True flag.
planets = pd.read_csv("data/planets.data", sep=r"\s+", comment="#")
planets
# Scatter of period against mean distance, with no fitted regression line.
# x, y and data are passed by keyword: seaborn >= 0.12 rejects them
# positionally, and the keyword form also works on older releases.
ax = sns.regplot(x='mean_dist', y='period', data=planets, fit_reg=False);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');
# The same scatter after taking natural logs of both variables.
# x and y are passed by keyword: seaborn >= 0.12 rejects them positionally,
# and the keyword form also works on older releases.
ax = sns.regplot(
    x=np.log(planets['mean_dist']),
    y=np.log(planets['period']),
    fit_reg=False,
)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');
In fact, Kepler's third law states that:
$$ T^2\propto R^3 $$
For Kepler this was a data-driven phenomenological law, formulated in 1619. It could only be explained dynamically once Newton introduced his law of universal gravitation in 1687.
# Side by side: linear fit on the raw values (left) and on the logged values
# (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# x, y and data are passed by keyword: seaborn >= 0.12 rejects them
# positionally, and the keyword form also works on older releases.
sns.regplot(x='mean_dist', y='period', data=planets, ax=ax1);
sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ax=ax2);
ax2.set_xlabel('Log(mean_dist)')
ax2.set_ylabel('Log(period)')
# Recompute the data limits and rescale the right-hand panel to them.
ax2.relim()
ax2.autoscale_view()
fig.suptitle("Kepler's third law of planetary motion");
# 2x2 gallery of basic function shapes over the interval [-3, 3].
x = np.linspace(-3, 3)
fig, axes = plt.subplots(2, 2, figsize=(8, 8))
(ax1, ax2), (ax3, ax4) = axes

# identity
ax1.set_title('$y=x$')
ax1.plot(x, x)

# powers
ax2.set_title('$y=x^2$, $y=x^3$')
ax2.plot(x, x**2, label='$x^2$')
ax2.plot(x, x**3, label='$x^3$')
ax2.legend()

# logarithm: only defined for positive x, so restrict the domain first
xpos = x[x > 0]
ax3.set_title(r'$y=\log(x)$')
ax3.plot(xpos, np.log(xpos))

# exponential
ax4.set_title('$y=e^x$');
ax4.plot(x, np.exp(x))

plt.tight_layout();