Some data visualization examples¶

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Change some of matplotlib's plotting defaults for better class presentation and set a nice Seaborn style.

plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)

Kaiser births¶

Data from Deb Nolan's StatLab.

kdf = pd.read_csv("data/babies.data", delim_whitespace=True)
kdf.head()

ax = sns.distplot(kdf.bwt, rug=False)
ax.set_xlabel("Birth weight in ounces")
ax.set_title("Normalized birth weight distribution of babies");

kdf2 = pd.read_csv("data/babies23.data", delim_whitespace=True)
kdf2.head()

kdf2.columns

Index(['id', 'pluralty', 'outcome', 'date', 'gestation', 'sex', 'wt', 'parity',
       'race', 'age', 'ed', 'ht', 'wt.1', 'drace', 'dage', 'ded', 'dht', 'dwt',
       'marital', 'inc', 'smoke', 'time', 'number'],
      dtype='object')

Planned Parenthood¶

pp = pd.read_csv("data/plannedparenthood.csv")
pp

# Compute percentage within year
pp["perScreen"] = 100 * pp["screening"] / (pp["screening"] + pp["abortion"])
pp["perAbort"] = 100 - pp["perScreen"]
pp

plt.plot(pp['year'], pp['screening'], linestyle="solid", marker="o", label='Cancer')
plt.plot(pp['year'], pp['abortion'], linestyle="solid", marker="o", label='Abortion')
plt.ylabel("Service")
plt.xlabel("Year")
plt.xticks([2006, 2013])
plt.legend();

plt.plot(pp['year'], pp['perScreen'], linestyle="solid", marker="o", label='Cancer')
plt.plot(pp['year'], pp['perAbort'], linestyle="solid", marker="o", label='Abortion')
plt.ylabel("Percentage of Annual Service")
plt.xlabel("Year")
plt.xticks([2006, 2013]);

pp

year = [2006, 2013, 2006, 2013]
proc = ['abortion', 'abortion', 'cancer', 'cancer']
perc = [12.613615, 25.899493, 87.386385, 74.100507]
df = pd.DataFrame(dict(year=year, proc=proc, perc=perc))
df

ax = sns.barplot( y = "perc", x = "proc", hue = "year", data = df)
ax.set_xlabel('Procedure')
ax.set_ylabel('Percentage of annual services')
ax.set_title('Planned Parenthood services');

# sns.set_style("white")
sns.factorplot(x="year", y="perc", col="proc", data=df, aspect=1 )
sns.despine(bottom=True, left=True)

g = sns.FacetGrid(df, row="proc")
g.map(sns.stripplot, "perc","year")

/Users/fperez/usr/conda/envs/data100/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the stripplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)

<seaborn.axisgrid.FacetGrid at 0x1030e5320>

sns.stripplot( x = "perc", y = "proc", hue = "year", data = df)

<matplotlib.axes._subplots.AxesSubplot at 0x1026ad278>

Current Population Survey¶

cps = pd.read_csv("data/edInc2.csv")
cps

# make a factor plot
sns.set(style="whitegrid")

blue_red = ["#397eb7", "#bf1518"]

with sns.color_palette(sns.color_palette(blue_red)):
    ax = sns.pointplot(x = "educ", y = "income", hue = "gender", data = cps)
    
# need to fix tick mark labels
ticks = ["<HS", "HS", "<BA", "BA", ">BA"]
ax.set_xticklabels(ticks)
ax.set_xlabel("Education")
ax.set_ylabel("Income")
ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");

Cherry Blossom Run¶

cb = pd.read_csv("data/cherryBlossomMen.csv")
cb.head()

cb.tail()

#smooth run time as a function of age for 1999 and for 2012
# plot smoothed curves

# SEE PAGE 52 of the old VIZ pptx

Voter Registration¶

vote = pd.read_csv("data/voteCA2016.csv")
vote = vote.rename(columns={c:c.strip() for c in vote.columns})
vote.head()

# Make a time series plot of percent by year
# Need to overlay plots or reshape the data
# SEE PAGE 64 of the old Viz PPTX
plt.plot(vote['year'], vote['dem'], label="Democratic")
plt.plot(vote['year'], vote['rep'], label="Republican")
plt.legend()
plt.savefig("Final.pdf")

vote.plot(kind='line', x='year', y=['dem', 'rep'])

/Users/fperez/usr/conda/envs/data100/lib/python3.6/site-packages/pandas/plotting/_core.py:1716: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access
  series.name = label

<matplotlib.axes._subplots.AxesSubplot at 0x1a1216f828>

# vote.iplot(kind='line', x='year', y=['dem', 'rep'])

CO2 Emissions¶

co2 = pd.read_csv("data/CAITcountryCO2.csv", skiprows = 2,
                  names = ["Country", "Year", "CO2"])
co2.tail()

last_year = co2.Year.iloc[-1]
last_year

2012

q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty

top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()

14

from cycler import cycler

linestyles = (['-', '--', ':', '-.']*3)[:7]
colors = plt.cm.Dark2.colors[:7]
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(lines_c + color_c)

x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
    ax.semilogy(df[x], df[y], label=name)

ax.set_xlabel(x)
ax.set_ylabel(y + "Emissions [Million Tons]")
ax.legend(ncol=2, frameon=True);

Kepler's third law¶

Details and data can be found on Wikipedia.

planets = pd.read_csv("data/planets.data", delim_whitespace=True, comment="#")
planets

ax = sns.regplot('mean_dist', 'period', planets, fit_reg=False);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');

ax = sns.regplot(np.log(planets['mean_dist']), np.log(planets['period']), fit_reg=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');

In fact, Kepler's law actually states that:

$$ T^2\propto R^3 $$

For Kepler this was a data-driven phenomenological law, formulated in 1619. It could only be explained dynamically once Newton introduced his law of universal gravitation in 1687.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sns.regplot('mean_dist', 'period', planets, ax=ax1);

sns.regplot(np.log(planets['mean_dist']), np.log(planets['period']), ax=ax2);
ax2.set_xlabel('Log(mean_dist)')
ax2.set_ylabel('Log(period)')
ax2.relim()
ax2.autoscale_view()
fig.suptitle("Kepler's third law of planetary motion");

Functional relations¶

x = np.linspace(-3, 3)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))

# x
ax1.plot(x, x)
ax1.set_title('$y=x$')

# powers
ax2.plot(x, x**2, label='$x^2$')
ax2.plot(x, x**3, label='$x^3$')
ax2.legend()
ax2.set_title('$y=x^2$, $y=x^3$')

# log
xpos = x[x>0]  # Log is only defined for positive x
ax3.plot(xpos, np.log(xpos))
ax3.set_title(r'$y=\log(x)$')

# exp
ax4.plot(x, np.exp(x))
ax4.set_title('$y=e^x$');
plt.tight_layout();

	bwt	gestation	age	height	weight	smoke
0	120	284	27	62	100	0
1	113	282	33	64	135	0
2	128	279	28	64	115	1
3	123	999	36	69	190	0
4	108	282	23	67	125	1

	educ	gender	income
0	1	Men	517
1	1	Women	409
2	2	Men	751
3	2	Women	578
4	3	Men	872
5	3	Women	661
6	4	Men	1249
7	4	Women	965
8	5	Men	1385
9	5	Women	1049

	year	place	age	time
0	1999	1	28.0	2819.0
1	1999	2	24.0	2821.0
2	1999	3	27.0	2823.0
3	1999	4	28.0	2827.0
4	1999	5	26.0	2851.0

	year	place	age	time
70065	2012	7189	39.0	8831.0
70066	2012	7190	56.0	8840.0
70067	2012	7191	35.0	8850.0
70068	2012	7192	NaN	8938.0
70069	2012	7193	48.0	9059.0

	year	eligible	registered	dem	rep	other	decline
0	1992	20612814	13217022	0.485	0.389	0.031	0.095
1	1996	19298379	14314658	0.474	0.368	0.052	0.106
2	2000	21190865	14676174	0.462	0.349	0.052	0.137
3	2004	21843202	14945031	0.432	0.357	0.049	0.162
4	2008	22987562	15468551	0.427	0.336	0.044	0.193

	bwt	gestation	age	height	weight	smoke
0	120	284	27	62	100	0
1	113	282	33	64	135	0
2	128	279	28	64	115	1
3	123	999	36	69	190	0
4	108	282	23	67	125	1

	id	pluralty	outcome	date	gestation	sex	wt	parity	race	age	...	drace	dage	ded	dht	dwt	marital	inc	smoke	time	number
0	15	5	1	1411	284	1	120	1	8	27	...	8	31	5	65	110	1	1	0	0	0
1	20	5	1	1499	282	1	113	2	0	33	...	0	38	5	70	148	1	4	0	0	0
2	58	5	1	1576	279	1	128	1	0	28	...	5	32	1	99	999	1	2	1	1	1
3	61	5	1	1504	999	1	123	2	0	36	...	3	43	4	68	197	1	8	3	5	5
4	72	5	1	1425	282	1	108	1	0	23	...	0	24	5	99	999	1	1	1	1	5

	year	screening	abortion	perScreen	perAbort
0	2006	2007371	289750	87.386385	12.613615
1	2013	935573	327000	74.100507	25.899493

	perc	proc	year
0	12.613615	abortion	2006
1	25.899493	abortion	2013
2	87.386385	cancer	2006
3	74.100507	cancer	2013

	Country	Year	CO2
30639	Vietnam	2012	173.0497
30640	World	2012	33843.0497
30641	Yemen	2012	20.5386
30642	Zambia	2012	2.7600
30643	Zimbabwe	2012	9.9800

	Country	Year	CO2
30490	China	2012	9312.5329
30634	United States	2012	5122.9094
30514	European Union (28)	2012	3610.5137
30533	India	2012	2075.1808
30596	Russian Federation	2012	1721.5376
30541	Japan	2012	1249.2135
30521	Germany	2012	773.9585
30547	Korea, Rep. (South)	2012	617.2418
30535	Iran	2012	593.8195
30485	Canada	2012	543.0242
30603	Saudi Arabia	2012	480.2278
30478	Brazil	2012	477.7701
30633	United Kingdom	2012	463.4556
30567	Mexico	2012	460.4782

	Country	Year	CO2
18822	Brazil	1950	19.6574
18829	Canada	1950	154.1408
18834	China	1950	78.6478
18858	European Union (28)	1950	1773.6864
18865	Germany	1950	510.7323

	planet	mean_dist	period	kepler_ratio
0	Mercury	0.389	87.77	7.64
1	Venus	0.724	224.70	7.52
2	Earth	1.000	365.25	7.50
3	Mars	1.524	686.95	7.50
4	Jupiter	5.200	4332.62	7.49
5	Saturn	9.510	10759.20	7.43

	bwt	gestation	age	height	weight	smoke
0	120	284	27	62	100	0
1	113	282	33	64	135	0
2	128	279	28	64	115	1
3	123	999	36	69	190	0
4	108	282	23	67	125	1

	bwt	gestation	age	height	weight	smoke
0	120	284	27	62	100	0
1	113	282	33	64	135	0
2	128	279	28	64	115	1
3	123	999	36	69	190	0
4	108	282	23	67	125	1