by Josh Hug
adapted from material by Ani Adhikari, Suraj Rampure, and Fernando Pérez.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load baby.csv into the DataFrame that every later cell plots from.
births = pd.read_csv('baby.csv')
# Look up Matplotlib's default number of histogram bins (rcParams key "hist.bins").
plt.rcParams["hist.bins"]
10
# Preview the first five rows to see which columns are available.
births.head()
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
|---|---|---|---|---|---|---|
| 0 | 120 | 284 | 27 | 62 | 100 | False |
| 1 | 113 | 282 | 33 | 64 | 135 | False |
| 2 | 128 | 279 | 28 | 64 | 115 | True |
| 3 | 108 | 282 | 23 | 67 | 125 | True |
| 4 | 136 | 286 | 25 | 62 | 93 | False |
# (number of rows, number of columns)
births.shape
(1174, 6)
We often use bar plots to display distributions of a categorical variable:
# Tally how many rows fall into each Maternal Smoker category.
births['Maternal Smoker'].value_counts()
False 715 True 459 Name: Maternal Smoker, dtype: int64
This would have been the Data 8 code to do something similar:
from datascience import Table

# Name both columns explicitly so the labels passed to barh() exist on every
# pandas version. pandas >= 2.0 names the value_counts() result "count" and
# puts the original column name on the index, so a bare reset_index() no
# longer produces an "index" column.
t = Table.from_df(
    births['Maternal Smoker']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Maternal Smoker')
)
# Horizontal bars: category labels from "index", bar lengths from the counts.
t.barh("index", "Maternal Smoker")
# pandas: the Series plotting method draws the category counts as bars.
births['Maternal Smoker'].value_counts().plot(kind='bar');

# matplotlib: pass the categories (index) and the bar heights (counts) explicitly.
ms = births['Maternal Smoker'].value_counts()
plt.bar(ms.index, ms);
Note: putting a semicolon after a plot call suppresses the text representation of its return value (for example, `<matplotlib.axes_....>`) that the notebook would otherwise print below the cell.
# seaborn: countplot tallies the rows in each category of x and draws one bar per category.
sns.countplot(x='Maternal Smoker', data=births);

# plotly: the same counts, with each category also mapped to its own color.
import plotly.express as px
px.histogram(births, x='Maternal Smoker', color='Maternal Smoker')
# countplot on a quantitative column: every distinct weight gets its own bar.
sns.countplot(x='Maternal Pregnancy Weight', data=births);

# A histogram groups the values into bins instead.
sns.histplot(x='Maternal Pregnancy Weight', data=births);
px.histogram(births, x='Maternal Pregnancy Weight')

# Explicit bin count, with a red rug plot drawn on the same axes.
sns.histplot(x='Maternal Pregnancy Weight', data=births, bins=20);
sns.rugplot(x='Maternal Pregnancy Weight', data=births, color="red");

# Same overlay, this time with a kernel density estimate added to the histogram.
sns.histplot(x='Maternal Pregnancy Weight', data=births, kde=True);
sns.rugplot(x='Maternal Pregnancy Weight', data=births, color="red");
# Tall, narrow canvas for a single vertical box plot of Birth Weight.
plt.figure(figsize=(3, 6))
sns.boxplot(data=births, y="Birth Weight");
# Five summary values for Birth Weight, computed by hand.
bweights = births["Birth Weight"]

# All three quartiles from a single percentile call; q2 is the median.
q1, q2, q3 = np.percentile(bweights, [25, 50, 75])
iqr = q3 - q1

# Bounds that sit 1.5 interquartile ranges below q1 and above q3.
whisk1 = q1 - 1.5 * iqr
whisk2 = q3 + 1.5 * iqr

whisk1, q1, q2, q3, whisk2
(73.5, 108.0, 120.0, 131.0, 165.5)
# Violin plot of Birth Weight on a tall, narrow canvas.
plt.figure(figsize=(3, 6))
sns.violinplot(y=births["Birth Weight"]);

# Side-by-side box plots: one box per Maternal Smoker value.
plt.figure(figsize=(5, 8))
sns.boxplot(x='Maternal Smoker', y='Birth Weight', data=births);

# The same grouped comparison drawn as violins.
plt.figure(figsize=(5, 8))
sns.violinplot(x='Maternal Smoker', y='Birth Weight', data=births);
# Show the first five rows again before plotting pairs of columns.
births.head()
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
|---|---|---|---|---|---|---|
| 0 | 120 | 284 | 27 | 62 | 100 | False |
| 1 | 113 | 282 | 33 | 64 | 135 | False |
| 2 | 128 | 279 | 28 | 64 | 115 | True |
| 3 | 108 | 282 | 23 | 67 | 125 | True |
| 4 | 136 | 286 | 25 | 62 | 93 | False |
# matplotlib, array form: hand the two columns over directly, then label the axes.
plt.scatter(births['Maternal Height'], births['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');

# matplotlib, data= form: refer to the columns by name and let scatter look them up.
plt.scatter(x='Maternal Height', y='Birth Weight', data=births);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');
# seaborn scatter plot, with points colored by the Maternal Smoker column.
sns.scatterplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births);

# Store a jittered copy of the heights as a new column on births: each value
# is shifted by uniform noise drawn from [-0.2, 0.2), so points that share a
# height no longer plot on top of each other.
births["Maternal Height (jittered)"] = (
    births["Maternal Height"] + np.random.uniform(-0.2, 0.2, size=len(births))
)
fig = sns.scatterplot(
    x='Maternal Height (jittered)',
    y='Birth Weight',
    hue='Maternal Smoker',
    data=births,
);
# Scatter plot with one fitted regression line per Maternal Smoker group.
# ci=None is seaborn's documented way to turn off the confidence band; with
# ci=False (which is not None) seaborn would still run the bootstrap and draw
# a zero-width band.
sns.lmplot(data=births, x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', ci=None);
# Default joint plot of Maternal Height against Birth Weight.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births);

# Same plot, split by Maternal Smoker through hue.
sns.jointplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births);

# kind='hex' variant.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='hex');

# kind='kde' variants: filled, then split by Maternal Smoker.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='kde', fill=True);
sns.jointplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births, kind='kde');
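Calling the DataFrame `.plot()` method with no arguments gives a confusing result: by default it draws every numeric column as a separate line against the row index.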
# Default DataFrame.plot(): a line chart with one line per numeric column, plotted against the row index.
births.plot();