by Josh Hug and Narges Norouzi
Adapted from material by Ani Adhikari, Suraj Rampure, and Fernando Pérez.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the baby dataset from a CSV file and preview its first rows.
births = pd.read_csv('data/baby.csv')
births.head()
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
---|---|---|---|---|---|---|
0 | 120 | 284 | 27 | 62 | 100 | False |
1 | 113 | 282 | 33 | 64 | 135 | False |
2 | 128 | 279 | 28 | 64 | 115 | True |
3 | 108 | 282 | 23 | 67 | 125 | True |
4 | 136 | 286 | 25 | 62 | 93 | False |
# Dimensions of the table as (number of rows, number of columns).
births.shape
(1174, 6)
We often use bar plots to display distributions of a categorical variable:
# Tally how many rows take each value of the categorical 'Maternal Smoker' column.
births['Maternal Smoker'].value_counts()
False    715
True     459
Name: Maternal Smoker, dtype: int64
This would have been the Data 8 code to do something similar:
# Data 8 (datascience.Table) version of the same bar chart.
from datascience import Table
# Name both columns explicitly. A bare `value_counts().reset_index()` yields
# the columns ('index', 'Maternal Smoker') on pandas < 2.0 but
# ('Maternal Smoker', 'count') on pandas >= 2.0, which would break the
# `barh` call below. rename_axis + reset_index(name=...) gives the same
# ('index', 'Maternal Smoker') columns on either version.
smoker_counts = (
    births['Maternal Smoker']
    .value_counts()
    .rename_axis('index')
    .reset_index(name = 'Maternal Smoker')
)
t = Table.from_df(smoker_counts)
# Horizontal bars: one per smoker status, with length equal to its count.
t.barh("index", "Maternal Smoker")
# pandas: draw the bar plot directly from the value counts.
births['Maternal Smoker'].value_counts().plot(kind = 'bar');
# matplotlib: pass the same counts to plt.bar. The index is cast to strings,
# presumably so the True/False values are used as category labels.
ms = births['Maternal Smoker'].value_counts();
plt.bar(ms.index.astype('string'), ms);
Note: ending a plotting call with a semicolon suppresses the text representation of the returned object (e.g. `<matplotlib.axes_....>`) that the notebook would otherwise print beneath the plot.
# seaborn: countplot tallies the rows for each value of the x column.
sns.countplot(data = births, x = 'Maternal Smoker');
# plotly: the same tallies, colored by the 'Maternal Smoker' value.
import plotly.express as px
px.histogram(births, x = 'Maternal Smoker', color = 'Maternal Smoker')
# countplot applied to a numeric column: one bar per distinct weight value.
sns.countplot(data = births, x = 'Maternal Pregnancy Weight');
# A histogram groups the weights into bins instead (20 bins requested here).
sns.histplot(data = births, x = 'Maternal Pregnancy Weight', bins = 20);
# plotly histogram of the same column with its default binning.
px.histogram(births, x = 'Maternal Pregnancy Weight')
# Histogram with default binning, followed by a red rug plot of the same column.
sns.histplot(data = births, x = 'Maternal Pregnancy Weight');
sns.rugplot(data = births, x = 'Maternal Pregnancy Weight', color = 'red');
# Compare the median and the mean of maternal pregnancy weight.
median = births['Maternal Pregnancy Weight'].median()
mean = births['Maternal Pregnancy Weight'].mean()
print("Median", median)
print("Mean", mean)
Median 125.0
Mean 128.4787052810903
# Same histogram with kde = True requested, plus the red rug plot.
sns.histplot(data = births, x = 'Maternal Pregnancy Weight', kde = True);
sns.rugplot(data = births, x = 'Maternal Pregnancy Weight', color = 'red');
# Color the birth-weight histogram by whether each baby falls inside or
# outside the middle 50% of the distribution (between the 25th and 75th
# percentiles).
q1, median, q3 = np.percentile(births['Birth Weight'], [25, 50, 75])
iqr = q3 - q1
# Use inclusive bounds so that weights exactly equal to q1 or q3 are labeled.
# With strict < / > comparisons those rows were left with category None and
# silently dropped from the hue-colored histogram.
in_middle = births['Birth Weight'].between(q1, q3)
births['category'] = np.where(in_middle, 'In the middle 50%', 'Outside of the middle 50%')
sns.histplot(births, x = 'Birth Weight', hue = 'category', bins = 30);
# 'category' was only a plotting helper; remove it so `births` is unchanged.
births.drop(columns = ['category'], inplace = True)
# Birth-weight distributions split by 'Maternal Smoker' (one hue per value).
# 1) Histogram with stat = 'density'.
sns.displot(data = births, x = 'Birth Weight', stat = 'density', hue = 'Maternal Smoker');
# 2) The same histogram with kde = True added.
sns.displot(data = births, x = 'Birth Weight', kde = True, stat = 'density', hue = 'Maternal Smoker');
# 3) kind = 'kde' only, without the histogram bars.
sns.displot(data = births, x = 'Birth Weight', kind = 'kde', hue = 'Maternal Smoker');
# Box plot of birth weight on the y axis, drawn in a narrow 3x6 figure.
plt.figure(figsize = (3, 6))
sns.boxplot(data = births, y = 'Birth Weight');
# Compute the quartiles of birth weight and the fences that lie 1.5 IQRs
# beyond the first and third quartiles.
bweights = births['Birth Weight']
q1, q2, q3 = np.percentile(bweights, [25, 50, 75])
iqr = q3 - q1
whisk1, whisk2 = q1 - 1.5*iqr, q3 + 1.5*iqr
# Display all five landmark values, from lowest to highest.
whisk1, q1, q2, q3, whisk2
(73.5, 108.0, 120.0, 131.0, 165.5)
# Violin plot of birth weight; the Series is passed directly as y.
plt.figure(figsize = (3, 6))
sns.violinplot(y = births['Birth Weight']);
# Side-by-side box plots: one box per 'Maternal Smoker' value.
plt.figure(figsize=(5, 8))
sns.boxplot(data = births, x = 'Maternal Smoker', y = 'Birth Weight');
# Side-by-side violin plots with the same grouping.
plt.figure(figsize=(5, 8))
sns.violinplot(data = births, x = 'Maternal Smoker', y = 'Birth Weight');
# Preview the table again before the scatter plots.
births.head()
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
---|---|---|---|---|---|---|
0 | 120 | 284 | 27 | 62 | 100 | False |
1 | 113 | 282 | 33 | 64 | 135 | False |
2 | 128 | 279 | 28 | 64 | 115 | True |
3 | 108 | 282 | 23 | 67 | 125 | True |
4 | 136 | 286 | 25 | 62 | 93 | False |
# matplotlib scatter plot, passing the two Series explicitly.
plt.scatter(births['Maternal Height'], births['Birth Weight']);
# Axis labels are set explicitly in this matplotlib version.
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
Most `matplotlib` functions also accept a `data=` keyword. When it is supplied, `x` and `y` can be given as the names of columns in the `data` DataFrame instead of passing each Series explicitly:
# Same scatter plot, referring to the columns by name via the data= keyword.
plt.scatter(data = births, x = 'Maternal Height', y = 'Birth Weight');
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# seaborn scatter plot, with points colored by 'Maternal Smoker'.
sns.scatterplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue = 'Maternal Smoker');
# Add uniform noise in [-0.2, 0.2) to each height and store it as a new column,
# presumably to spread out points that share the same height value.
# NOTE(review): no random seed is set, so the jitter differs on every run, and
# the new column stays on `births` afterwards.
births['Maternal Height (jittered)'] = births['Maternal Height'] + np.random.uniform(-0.2, 0.2, len(births))
# NOTE(review): `fig` holds whatever sns.scatterplot returns; confirm whether
# any later cell relies on this name.
fig = sns.scatterplot(data = births, x = 'Maternal Height (jittered)', y = 'Birth Weight', hue = 'Maternal Smoker');
# lmplot of height vs. birth weight, split by 'Maternal Smoker', with ci = False.
sns.lmplot(data = births, x = 'Maternal Height', y = 'Birth Weight',
ci = False, hue = 'Maternal Smoker');
# Joint plots of the same two columns, in several variants:
# default settings
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight');
# colored by 'Maternal Smoker'
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue = 'Maternal Smoker');
# kind = 'hex'
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'hex');
# kind = 'kde' with fill = True
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'kde', fill = True);
# kind = 'kde', colored by 'Maternal Smoker'
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind = 'kde', hue = 'Maternal Smoker');