by Josh Hug
adapted from material by Ani Adhikari, Suraj Rampure, and Fernando Pérez.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the baby dataset from the CSV file in the working directory into a DataFrame.
births = pd.read_csv('baby.csv')
# Preview the first few rows to check the columns and their values.
births.head()
| | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker |
---|---|---|---|---|---|---|
0 | 120 | 284 | 27 | 62 | 100 | False |
1 | 113 | 282 | 33 | 64 | 135 | False |
2 | 128 | 279 | 28 | 64 | 115 | True |
3 | 108 | 282 | 23 | 67 | 125 | True |
4 | 136 | 286 | 25 | 62 | 93 | False |
# Dimensions of the DataFrame as a (rows, columns) tuple.
births.shape
(1174, 6)
We often use bar plots to display distributions of a categorical variable:
# Count how many rows fall into each category of the 'Maternal Smoker' column.
births['Maternal Smoker'].value_counts()
False    715
True     459
Name: Maternal Smoker, dtype: int64
from datascience import Table

# Convert the category counts into a datascience Table and draw a horizontal
# bar chart with one bar per category.
#
# The two column names are set explicitly instead of relying on the defaults
# of value_counts().reset_index(): older pandas names the columns
# 'index' / 'Maternal Smoker', while pandas >= 2.0 names them
# 'Maternal Smoker' / 'count', which would make the barh call below fail.
# rename_axis + reset_index(name=...) yields 'index' / 'Maternal Smoker'
# on both.
t = Table.from_df(
    births['Maternal Smoker']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Maternal Smoker')
)
# 'index' holds the category labels; 'Maternal Smoker' holds the counts.
t.barh("index", "Maternal Smoker")
# pandas: draw the category counts as a bar chart straight from the Series.
births['Maternal Smoker'].value_counts().plot.bar();
# matplotlib: compute the counts once, then pass the categories (the index)
# as bar positions and the counts as bar heights.
ms = births['Maternal Smoker'].value_counts()
plt.bar(x=ms.index, height=ms);
Note: putting a semicolon at the end of a plotting call suppresses the text representation of the returned object (e.g. `<matplotlib.axes...>`) that the notebook would otherwise print along with the plot.
# seaborn: countplot tallies the rows per category of the x column itself,
# so no separate value_counts() call is needed here.
sns.countplot(data = births, x = 'Maternal Smoker');
import plotly.express as px
# plotly: histogram over the 'Maternal Smoker' column; the same column is
# also passed as `color`, so the bars are colored by category.
px.histogram(births, x = 'Maternal Smoker', color = 'Maternal Smoker')