import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme(style='darkgrid', font_scale = 1.5,
              rc={'figure.figsize':(7,5)})

rng = np.random.default_rng()


titanic = sns.load_dataset('titanic')
sns.rugplot(titanic["age"], height = 0.2)
plt.gca().set_ylim([0, 1]);


titanic = sns.load_dataset('titanic')
sns.histplot(titanic["age"]);


titanic = sns.load_dataset('titanic')
sns.displot(titanic["age"], kind = "kde");
plt.savefig("titanic_displot.png", bbox_inches = "tight", dpi = 300)


points = [2.2, 2.8, 3.7, 5.3, 5.7]


plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);


def gaussian(x, z, a):
    # Gaussian kernel
    return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))

def boxcar_basic(x, z, a):
    # Boxcar kernel
    if np.abs(x - z) <= a/2:
        return 1/a
    return 0

def boxcar(x, z, a):
    # Boxcar kernel
    cond = np.abs(x - z)
    return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )


def create_kde(kernel, pts, a):
    # Takes in a kernel, set aof points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f

def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);
    
def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        y = kernel(x, pt, a)
        if norm:
            y /= len(pts)
        plt.plot(x, y)
    
    plt.show();


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)


sns.kdeplot(points, bw_method=0.65)  # magic value!
sns.histplot(points, stat='density', bins=2);


sns.histplot(points, bins=2, kde=True, stat='density', 
             kde_kws=dict(cut=3, bw_method=0.65));


sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)


tips = sns.load_dataset('tips')


tips.head()


vals = tips['total_bill']


ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)


ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]), 
                    index=pd.Series([2006, 2013], 
                    name="Year"))
ppdf


ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");


rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change

Cancer     -53.39312
Abortion    12.85591
Name: Percent Change, dtype: float64


ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");


cps = pd.read_csv("edInc2.csv")
cps


cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps


# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
    ax = sns.pointplot(data=cps, x = "Education", y = "Income", hue = "Gender")

ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");


cps.head()


cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", axis="columns")
women = cg.get_group("Women").drop("Gender", axis="columns")
display(men, women)


mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio


ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");


fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio


ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");


co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2,
                  names = ["Country", "Year", "CO2"], encoding = "ISO-8859-1")
co2.tail()


last_year = co2.Year.iloc[-1]
last_year

2012


q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty


top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()

14


from cycler import cycler

linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)

x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
    ax.semilogy(df[x], df[y], label=name)

ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);


planets = pd.read_csv("planets.data", delim_whitespace=True, comment="#")
planets


ax = sns.scatterplot(x='mean_dist', y='period', data=planets);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');


sns.regplot(x='mean_dist', y='period', data=planets, ci=False);


ax = sns.scatterplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']))
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');


ax = sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ci=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

	educ	gender	income
0	1	Men	517
1	1	Women	409
2	2	Men	751
3	2	Women	578
4	3	Men	872
5	3	Women	661
6	4	Men	1249
7	4	Women	965
8	5	Men	1385
9	5	Women	1049

	Country	Year	CO2
30639	Vietnam	2012	173.0497
30640	World	2012	33843.0497
30641	Yemen	2012	20.5386
30642	Zambia	2012	2.7600
30643	Zimbabwe	2012	9.9800

	Country	Year	CO2
30490	China	2012	9312.5329
30634	United States	2012	5122.9094
30514	European Union (28)	2012	3610.5137
30533	India	2012	2075.1808
30596	Russian Federation	2012	1721.5376
30541	Japan	2012	1249.2135
30521	Germany	2012	773.9585
30547	Korea, Rep. (South)	2012	617.2418
30535	Iran	2012	593.8195
30485	Canada	2012	543.0242
30603	Saudi Arabia	2012	480.2278
30478	Brazil	2012	477.7701
30633	United Kingdom	2012	463.4556
30567	Mexico	2012	460.4782

	Country	Year	CO2
18822	Brazil	1950	19.6574
18829	Canada	1950	154.1408
18834	China	1950	78.6478
18858	European Union (28)	1950	1773.6864
18865	Germany	1950	510.7323

Lecture 7 Supplemental Notebook¶

KDEs¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Effect of bandwidth hyperparameter $\alpha$¶

KDE Formula¶

Scale¶

Current Population Survey¶

CO2 Emissions¶

Transformations Demo¶

Kepler's third law¶

	Income Ratio (M/F)
Education
<HS	1.264059
HS	1.299308
<BA	1.319213
BA	1.294301
>BA	1.320305

	Income Ratio (F/M)
Education
<HS	0.791103
HS	0.769640
<BA	0.758028
BA	0.772618
>BA	0.757401

	planet	mean_dist	period	kepler_ratio
0	Mercury	0.389	87.77	7.64
1	Venus	0.724	224.70	7.52
2	Earth	1.000	365.25	7.50
3	Mars	1.524	686.95	7.50
4	Jupiter	5.200	4332.62	7.49
5	Saturn	9.510	10759.20	7.43

	Cancer	Abortion
Year
2006	2007371	289750
2013	935573	327000