import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the dataset from 'baby.csv' (path is relative to the working directory).
births = pd.read_csv('baby.csv')


# Preview the first few rows of the table.
births.head()


# (number of rows, number of columns).
births.shape

(1174, 6)


# Count how many rows carry each value of the 'Maternal Smoker' column.
births['Maternal Smoker'].value_counts()

False    715
True     459
Name: Maternal Smoker, dtype: int64


# Bar chart of the per-value counts through the pandas plotting interface.
# The trailing semicolon presumably suppresses the returned object's repr in
# a notebook cell.
births['Maternal Smoker'].value_counts().plot(kind = 'bar');


# The same chart drawn by seaborn directly from the raw column.
sns.countplot(x=births['Maternal Smoker']);


# These are made up! Each record pairs a major with its GPA value so the two
# parallel lists below cannot drift out of alignment.
_major_gpa_records = [
    ('Data Science', 3.35),
    ('History', 3.20),
    ('Biology', 2.98),
    ('Business', 3.51),
]
majors = [major for major, _ in _major_gpa_records]
gpas = [gpa for _, gpa in _major_gpa_records]


# What if we change bar to barh?
# matplotlib version: categories along x, values as bar heights.
plt.bar(majors, gpas);


# seaborn version of the same bar chart.
sns.barplot(x=majors, y=gpas);


# Pull out the 'Birth Weight' column as its own series for the
# single-variable plots.
bweights = births["Birth Weight"]


# Display the series.
bweights

0       120
1       113
2       128
3       108
4       136
       ... 
1169    113
1170    128
1171    130
1172    125
1173    117
Name: Birth Weight, Length: 1174, dtype: int64


# Rug plot of the birth weights.
sns.rugplot(bweights);


# By default, you get some arbitrary bins. We often like to pick our own.
plt.hist(bweights);


# Smallest and largest birth weight, used to choose histogram bin edges.
# Use the Series reductions instead of the Python builtins, which iterate the
# series one element at a time.
bweights.min(), bweights.max()

(55, 176)


# Histogram bin edges: start at 50 and step by 5, stopping before 200
# (so the last edge is 195).
_BIN_START, _BIN_STOP, _BIN_WIDTH = 50, 200, 5
bw_bins = range(_BIN_START, _BIN_STOP, _BIN_WIDTH)


# Count histogram on the hand-picked bins; ec='w' draws white bar edges so
# adjacent bars are visually separated.
plt.hist(bweights, bins=bw_bins, ec='w');


# Same bins, but with density=True the y-axis shows density rather than counts.
plt.hist(bweights, density=True, bins=bw_bins, ec='w');


# alternative way of getting this plot
bweights.plot(kind = 'hist', density=True, bins=bw_bins, ec='w');


# Coarser bins: edges every 20 units instead of every 5.
plt.hist(bweights, bins = np.arange(50, 200, 20), density=True, ec='w');


# Unequal-width bins given as an explicit list of edges.
plt.hist(bweights, bins = [50, 100, 120, 140, 200], density=True, ec='w');


# Kernel density estimate of the birth weights on its own.
sns.kdeplot(bweights);


# Histogram with the KDE curve overlaid.
sns.histplot(bweights, kde=True);


# Figure-level version: histogram + KDE curve + rug marks.
sns.displot(bweights, kde=True, rug=True);


# Figure-level version showing only the KDE curve, plus rug marks.
sns.displot(bweights, kind='kde', rug=True);


# Open a tall, narrow figure first so the vertical box plot is drawn into it.
plt.figure(figsize = (3, 6))
sns.boxplot(y=bweights);


# Quartiles of the birth weights in a single call, then the fences that sit
# 1.5 * IQR below the first quartile and above the third.
q1, q2, q3 = np.percentile(bweights, [25, 50, 75])
iqr = q3 - q1
whisk1, whisk2 = q1 - 1.5*iqr, q3 + 1.5*iqr

whisk1, q1, q2, q3, whisk2

(73.5, 108.0, 120.0, 131.0, 165.5)


# Same tall, narrow figure size as the box plot, this time for a violin plot.
plt.figure(figsize = (3, 6))
sns.violinplot(y=bweights);


births.head()


# Split the birth weights by smoking status. Each group is selected with one
# label-based lookup (row mask + column name) on the original frame.
smoker_flag = births['Maternal Smoker']
sm_bweights = births.loc[smoker_flag == True, 'Birth Weight']
nsm_bweights = births.loc[smoker_flag == False, 'Birth Weight']


# Draw both groups with the same bins and stat='density', so the two
# histograms share a density scale rather than raw counts.
# The label= values are what plt.legend() displays.
sns.histplot(nsm_bweights, bins=bw_bins, kde=True, stat='density', label='non smoker', ec='w');
sns.histplot(sm_bweights, bins=bw_bins, kde=True, stat='density', label='smoker', color='orange', ec='w');
plt.legend();


# One box per 'Maternal Smoker' value, taken straight from the full table.
plt.figure(figsize=(5, 8))
sns.boxplot(data=births, x = 'Maternal Smoker', y = 'Birth Weight');


# Same grouping, drawn as violins.
plt.figure(figsize=(5, 8))
sns.violinplot(data=births, x = 'Maternal Smoker', y = 'Birth Weight');


# Raw value arrays for each group, listed in the same order as `groups`.
two_distributions = [nsm_bweights.values, sm_bweights.values]
groups = ['non-smokers', 'smokers']


# NOTE(review): recent matplotlib releases appear to prefer `tick_labels=`
# over `labels=` here; confirm against the installed version before switching.
plt.boxplot(two_distributions, labels=groups);


# violinplot takes no labels argument, so name its default positions 1..n
# explicitly; otherwise the ticks are bare numbers and do not say which
# group each violin shows.
plt.violinplot(two_distributions)
plt.xticks(list(range(1, len(groups) + 1)), groups);


births.head()


# Scatter plot from two columns passed positionally; pyplot does not label
# the axes here, so set them by hand.
plt.scatter(births['Maternal Height'], births['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


# Same plot, but x and y are column names looked up in data=births.
plt.scatter(data=births, x='Maternal Height', y='Birth Weight');
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


# seaborn version; hue colors each point by its 'Maternal Smoker' value.
sns.scatterplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue = 'Maternal Smoker');


# Scatter plot with a fitted regression line per 'Maternal Smoker' value.
# ci=None is the documented way to omit the confidence band; ci=False is not a
# documented value and appears to still run the bootstrap before drawing a
# zero-width band (verify against the seaborn regression docs).
sns.lmplot(data = births, x = 'Maternal Height', y = 'Birth Weight', ci=None, hue='Maternal Smoker');


# Joint plot of height vs. birth weight with seaborn's default settings.
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight');


# Same, colored by 'Maternal Smoker'.
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', hue='Maternal Smoker');


# Hexagonal binning in place of individual points.
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind='hex');


# Two-dimensional KDE with filled contours.
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind='kde', fill=True);


# Two-dimensional KDE contours, one set per 'Maternal Smoker' value.
sns.jointplot(data = births, x = 'Maternal Height', y = 'Birth Weight', kind='kde', hue='Maternal Smoker');


# Bonus: call DataFrame.plot() with no arguments to see what pandas draws by
# default (presumably one line per column against the row index; verify).
births.plot();

Lecture 9 – Data 100, Summer 2021

Bar Plots

Rug Plots

Histograms

Density Curves

Box Plots

Violin Plots

Overlaid Histograms and Density Curves

Side-by-Side Box Plots and Violin Plots

Scatter Plots

Hex Plots and Contour Plots

Bonus

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight	Maternal Smoker
0	120	284	27	62	100	False
1	113	282	33	64	135	False
2	128	279	28	64	115	True
3	108	282	23	67	125	True
4	136	286	25	62	93	False