import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Read the births table from the local CSV file.
births = pd.read_csv(filepath_or_buffer='data/baby.csv')


# Preview the first five rows.
births.head(n=5)


# (number of rows, number of columns)
births.shape

(1174, 6)


# Tally the rows for each distinct value of the 'Maternal Smoker' column.
births.loc[:, 'Maternal Smoker'].value_counts()

False    715
True     459
Name: Maternal Smoker, dtype: int64

from datascience import Table

# Build a two-column frame of (category, count) for the datascience Table.
# The column names are set explicitly because the result of a bare
# `value_counts().reset_index()` depends on the pandas version: before 2.0 the
# columns come out as 'index' / 'Maternal Smoker', from 2.0 on they are
# 'Maternal Smoker' / 'count', which would break the `barh` call below.
smoker_counts = (
    births['Maternal Smoker']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='Maternal Smoker')
)
t = Table.from_df(smoker_counts)
t.barh("index", "Maternal Smoker")


# Bar chart of the smoker counts through the pandas plotting accessor.
births['Maternal Smoker'].value_counts().plot.bar();


# Same chart drawn with matplotlib directly: the index of the counts is
# converted to strings and used as the bar positions.
ms = births['Maternal Smoker'].value_counts()
plt.bar(x=ms.index.astype('string'), height=ms);


# seaborn does the counting itself from the raw column.
sns.countplot(x='Maternal Smoker', data=births);


# Interactive version with plotly, coloured by the same column.
import plotly.express as px
px.histogram(data_frame=births, x='Maternal Smoker', color='Maternal Smoker')


# Count plot of a numeric column: one bar per distinct weight value.
sns.countplot(x='Maternal Pregnancy Weight', data=births);


# Histogram of the same column using 20 bins.
sns.histplot(x='Maternal Pregnancy Weight', data=births, bins=20);


# plotly histogram with its default binning.
px.histogram(data_frame=births, x='Maternal Pregnancy Weight')


# Histogram with a red rug plot of the individual observations on the same axes.
sns.histplot(x='Maternal Pregnancy Weight', data=births)
sns.rugplot(x='Maternal Pregnancy Weight', data=births, color='red');


# Median and mean of the 'Maternal Pregnancy Weight' column.
preg_weight = births['Maternal Pregnancy Weight']
median, mean = preg_weight.median(), preg_weight.mean()

print("Median", median)
print("Mean", mean)

Median 125.0
Mean 128.4787052810903


# Histogram with a KDE curve overlaid, plus a red rug plot on the same axes.
sns.histplot(x='Maternal Pregnancy Weight', data=births, kde=True)
sns.rugplot(x='Maternal Pregnancy Weight', data=births, color='red');


# Colour the birth-weight histogram by whether each baby falls between the
# first and third quartiles (the middle 50%) or outside of them.
q1, median, q3 = np.percentile(births['Birth Weight'], [25, 50, 75])
iqr = q3 - q1

# `between` is inclusive on both ends, so weights exactly equal to q1 or q3
# count as part of the middle 50%. With strict comparisons on both sides those
# rows would receive no category at all and vanish from the hue histogram.
in_middle = births['Birth Weight'].between(q1, q3)
category = np.where(in_middle, 'In the middle 50%', 'Outside of the middle 50%')

# `assign` returns a labelled copy for plotting, so `births` itself never
# gains (and never has to drop) the helper column.
sns.histplot(births.assign(category=category), x='Birth Weight', hue='category', bins=30);


# Density-scaled histograms of birth weight, split by smoking status.
sns.displot(x='Birth Weight', hue='Maternal Smoker', data=births, stat='density');


# Same histograms with a KDE curve drawn over each group.
sns.displot(x='Birth Weight', hue='Maternal Smoker', data=births, stat='density', kde=True);


# KDE curves only.
sns.displot(x='Birth Weight', hue='Maternal Smoker', data=births, kind='kde');


# Single box plot of birth weight on a 3x6 figure.
plt.figure(figsize=(3, 6))
sns.boxplot(y='Birth Weight', data=births);


# Recompute by hand the quartiles and the 1.5 * IQR bounds of birth weight.
bweights = births['Birth Weight']
q1, q2, q3 = np.percentile(bweights, [25, 50, 75])
iqr = q3 - q1

# Lower and upper bounds: 1.5 interquartile ranges beyond the quartiles.
whisk1, whisk2 = q1 - 1.5 * iqr, q3 + 1.5 * iqr

whisk1, q1, q2, q3, whisk2

(73.5, 108.0, 120.0, 131.0, 165.5)


# Violin plot of birth weight on a 3x6 figure.
plt.figure(figsize=(3, 6))
sns.violinplot(y=births['Birth Weight']);


# Side-by-side box plots: one box per 'Maternal Smoker' value.
plt.figure(figsize=(5, 8))
sns.boxplot(x='Maternal Smoker', y='Birth Weight', data=births);


# Side-by-side violin plots for the same grouping.
plt.figure(figsize=(5, 8))
sns.violinplot(x='Maternal Smoker', y='Birth Weight', data=births);


# Preview the first five rows again.
births.head(n=5)


# matplotlib scatter plot, passing the two columns directly.
plt.scatter(x=births['Maternal Height'], y=births['Birth Weight'])
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


# Same plot, letting matplotlib look the columns up by name in `births`.
plt.scatter(x='Maternal Height', y='Birth Weight', data=births)
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


# seaborn scatter plot coloured by smoking status.
sns.scatterplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births);


# Add uniform noise in [-0.2, 0.2) to each height and store it as a new column,
# then plot birth weight against the jittered heights.
births['Maternal Height (jittered)'] = births['Maternal Height'] + np.random.uniform(
    low=-0.2, high=0.2, size=len(births)
)
fig = sns.scatterplot(
    x='Maternal Height (jittered)', y='Birth Weight', hue='Maternal Smoker', data=births
);


# Scatter plot with one fitted regression line per smoking status.
# `ci=None` is the documented way to turn the confidence band off. `False` is
# not `None`, so seaborn would still bootstrap the fit and draw a degenerate
# band around each line.
sns.lmplot(
    data=births,
    x='Maternal Height',
    y='Birth Weight',
    hue='Maternal Smoker',
    ci=None,
);


# Joint plot with default settings.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births);


# Joint plot coloured by smoking status.
sns.jointplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births);


# Hexagonal-bin version of the joint plot.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='hex');


# Filled KDE version of the joint plot.
sns.jointplot(x='Maternal Height', y='Birth Weight', data=births, kind='kde', fill=True);


# KDE version split by smoking status.
sns.jointplot(x='Maternal Height', y='Birth Weight', hue='Maternal Smoker', data=births, kind='kde');

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
3	108	282	23	67	125	True
4	136	286	25	62	93	False

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
3	108	282	23	67	125	True
4	136	286	25	62	93	False

Lecture 7 – Data 100, Spring 2023

Bar Plots

Box Plots

Violin Plots

Side by side box plots and violin plots

Scatter plots

Hex plots and contour plots