import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme(style='darkgrid', font_scale = 1.5,
              rc={'figure.figsize':(7,5)})

#plt.rc('figure', dpi=100, figsize=(7, 5))
#plt.rc('font', size=12)
rng = np.random.default_rng()


ppdf = pd.DataFrame(dict(Cancer=[2007371, 935573], Abortion=[289750, 327000]), 
                    index=pd.Series([2006, 2013], 
                    name="Year"))
ppdf


ax = sns.lineplot(data=ppdf, markers=True)
ax.set_title("Planned Parenthood Procedures")
ax.set_xticks([2006, 2013])
ax.set_ylabel("Service count");


rel_change = 100*(ppdf.loc[2013] - ppdf.loc[2006])/ppdf.loc[2006]
rel_change.name = "Percent Change"
rel_change

Cancer     -53.39312
Abortion    12.85591
Name: Percent Change, dtype: float64


ax = sns.barplot(x=rel_change.index, y=rel_change)
ax.axhline(0, color='black')
ax.set_title("Percent Change in Number of Procedures");


cps = pd.read_csv("edInc2.csv")
cps


cps = cps.replace({'educ':{1:"<HS", 2:"HS", 3:"<BA", 4:"BA", 5:">BA"}})
cps.columns = ['Education', 'Gender', 'Income']
cps


# Let's pick our colors specifically using color_palette()
blue_red = ["#397eb7", "#bf1518"]
with sns.color_palette(sns.color_palette(blue_red)):
    ax = sns.pointplot(data=cps, x = "Education", y = "Income", hue = "Gender")

ax.set_title("2014 Median Weekly Earnings\nFull-Time Workers over 25 years old");


cps.head()


cg = cps.set_index("Education").groupby("Gender")
men = cg.get_group("Men").drop("Gender", "columns")
women = cg.get_group("Women").drop("Gender", "columns")
display(men, women)


mfratio = men/women
mfratio.columns = ["Income Ratio (M/F)"]
mfratio


ax = sns.lineplot(data=mfratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("M/F Income Ratio as a function of education level");


fmratio = women/men
fmratio.columns = ["Income Ratio (F/M)"]
fmratio


ax = sns.lineplot(data=fmratio, markers=True, legend=False);
ax.set_ylabel("Ratio")
ax.set_title("F/M Income Ratio as a function of education level");


df = pd.read_csv('baby.csv')


plt.scatter(df['Maternal Height'], df['Birth Weight']);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


plt.scatter(df['Maternal Height'], df['Birth Weight'], alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


plt.scatter(data=df, x='Maternal Height', y='Birth Weight', alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


r1, r2 = rng.normal(size=(2, len(df)))/3

plt.scatter(df['Maternal Height'] + r1, df['Birth Weight'] + r2, alpha = 0.4);
plt.xlabel('Maternal Height')
plt.ylabel('Birth Weight');


points = [2.2, 2.8, 3.7, 5.3, 5.7]


plt.hist(points, bins=range(0, 10, 2), ec='w', density=True);


def gaussian(x, z, a):
    # Gaussian kernel
    return (1/np.sqrt(2*np.pi*a**2)) * np.exp((-(x - z)**2 / (2 * a**2)))

def boxcar_basic(x, z, a):
    # Boxcar kernel
    if np.abs(x - z) <= a/2:
        return 1/a
    return 0

def boxcar(x, z, a):
    # Boxcar kernel
    cond = np.abs(x - z)
    return np.piecewise(x, [cond <= a/2, cond > a/2], [1/a, 0] )


def create_kde(kernel, pts, a):
    # Takes in a kernel, set aof points, and alpha
    # Returns the KDE as a function
    def f(x):
        output = 0
        for pt in pts:
            output += kernel(x, pt, a)
        return output / len(pts) # Normalization factor
    return f

def plot_kde(kernel, pts, a):
    # Calls create_kde and plots the corresponding KDE
    f = create_kde(kernel, pts, a)
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    y = [f(xi) for xi in x]
    plt.plot(x, y);
    
def plot_separate_kernels(kernel, pts, a, norm=False):
    # Plots individual kernels, which are then summed to create the KDE
    x = np.linspace(min(pts) - 5, max(pts) + 5, 1000)
    for pt in pts:
        y = kernel(x, pt, a)
        if norm:
            y /= len(pts)
        plt.plot(x, y)
    
    plt.show();


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.rugplot(points, height = 0.5);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_separate_kernels(gaussian, points, a = 1, norm = True);


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plot_kde(gaussian, points, a = 1)


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
sns.distplot(points, kde_kws={'bw': 1});

/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
/opt/conda/lib/python3.8/site-packages/seaborn/distributions.py:1657: FutureWarning: The `bw` parameter is deprecated in favor of `bw_method` and `bw_adjust`. Using 1 for `bw_method`, but please see the docs for the new parameters and update your code.
  warnings.warn(msg, FutureWarning)


sns.kdeplot(points)
sns.histplot(points, stat='density');


sns.kdeplot(points, bw_adjust=2)
sns.histplot(points, stat='density');


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, points, a = 1)


plt.xlim(-3, 10)
plt.ylim(0, 0.5)
plt.title(r'KDE of toy data with Boxcar kernel and $\alpha$ = 1')
plot_kde(boxcar, points, a = 1)


tips = sns.load_dataset('tips')


tips.head()


vals = tips['total_bill']


ax = sns.histplot(vals)
sns.rugplot(vals, color='orange', ax=ax);


plt.figure(figsize=(8, 5))
plt.ylim(0, 0.15)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 0.1')
plot_kde(gaussian, vals, a = 0.1)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 1')
plot_kde(gaussian, vals, a = 1)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 2')
plot_kde(gaussian, vals, a = 2)


plt.ylim(0, 0.1)
plt.title(r'KDE of tips with Gaussian kernel and $\alpha$ = 10')
plot_kde(gaussian, vals, a = 5)


co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2,
                  names = ["Country", "Year", "CO2"])
co2.tail()

---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-41-0c57752da17d> in <module>
----> 1 co2 = pd.read_csv("CAITcountryCO2.csv", skiprows = 2,
      2                   names = ["Country", "Year", "CO2"])
      3 co2.tail()

/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    603     kwds.update(kwds_defaults)
    604 
--> 605     return _read(filepath_or_buffer, kwds)
    606 
    607 

/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455 
    456     # Create the parser.
--> 457     parser = TextFileReader(filepath_or_buffer, **kwds)
    458 
    459     if chunksize or iterator:

/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    812             self.options["has_index_names"] = kwds["has_index_names"]
    813 
--> 814         self._engine = self._make_engine(self.engine)
    815 
    816     def close(self):

/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1043             )
   1044         # error: Too many arguments for "ParserBase"
-> 1045         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1046 
   1047     def _failover_to_python(self):

/opt/conda/lib/python3.8/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1891 
   1892         try:
-> 1893             self._reader = parsers.TextReader(self.handles.handle, **kwds)
   1894         except Exception:
   1895             self.handles.close()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._get_header()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._tokenize_rows()

pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 107: invalid start byte


last_year = co2.Year.iloc[-1]
last_year


q = f"Country != 'World' and Country != 'European Union (15)' and Year == {last_year}"
top14_lasty = co2.query(q).sort_values('CO2', ascending=False).iloc[:14]
top14_lasty


top14 = co2[co2.Country.isin(top14_lasty.Country) & (co2.Year >= 1950)]
print(len(top14.Country.unique()))
top14.head()


from cycler import cycler

linestyles = ['-', '--', ':', '-.' ]
colors = plt.cm.Dark2.colors
lines_c = cycler('linestyle', linestyles)
color_c = cycler('color', colors)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_prop_cycle(color_c * lines_c)

x, y ='Year', 'CO2'
for name, df in top14.groupby('Country'):
    ax.semilogy(df[x], df[y], label=name)

ax.set_xlabel(x)
ax.set_ylabel(f"{y} Emissions (million tons)")
ax.legend(ncol=2, frameon=True, fontsize=11);


x = np.linspace(-3, 3)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(8,8))

# x
ax1.plot(x, x)
ax1.set_title('$y=x$')

# powers
ax2.plot(x, x**2, label='$x^2$')
ax2.plot(x, x**3, label='$x^3$')
ax2.legend()
ax2.set_title('$y=x^2$, $y=x^3$')

# log
xpos = x[x>0]  # Log is only defined for positive x
ax3.plot(xpos, np.log(xpos))
ax3.set_title(r'$y=\log(x)$')

# exp
ax4.plot(x, np.exp(x))
ax4.set_title('$y=e^x$');
plt.tight_layout();


x = np.linspace(1, 10, 20)
eps = rng.normal(size=len(x))
y = (2+eps)*x**3
y = 2 * x**3 + x**2*eps


plt.scatter(x, y);


plt.scatter(x**2, y);


plt.scatter(x**3, y);


plt.scatter(x**5, y);


plt.scatter(x, np.log(y));


plt.scatter(np.log(x), np.log(y));


planets = pd.read_csv("planets.data", delim_whitespace=True, comment="#")
planets


ax = sns.regplot(x='mean_dist', y='period', data=planets, fit_reg=False);
ax.set_title('Relation between period and mean distance to the Sun')
ax.set_xlabel('mean distance [AU]')
ax.set_ylabel('period [days]');


ax = sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), fit_reg=False)
ax.set_title('Log-Log relation between period and mean distance to the Sun')
ax.set_xlabel('Log(mean distance [AU])')
ax.set_ylabel('Log(period [days])');


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
sns.regplot(x='mean_dist', y='period', data=planets, ax=ax1, ci=False);

sns.regplot(x=np.log(planets['mean_dist']), y=np.log(planets['period']), ax=ax2);
ax2.set_xlabel('Log(mean_dist)')
ax2.set_ylabel('Log(period)')
ax2.relim()
ax2.autoscale_view()
fig.suptitle("Kepler's third law of planetary motion");

	educ	gender	income
0	1	Men	517
1	1	Women	409
2	2	Men	751
3	2	Women	578
4	3	Men	872
5	3	Women	661
6	4	Men	1249
7	4	Women	965
8	5	Men	1385
9	5	Women	1049

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

Lecture 10 Supplemental Notebook¶

Scale¶

Current Population Survey¶

Overplotting¶

Kernel Density Estimates¶

Step 1: Place a kernel at each point¶

Step 2: Normalize kernels so that total area is 1¶

Step 3: Sum all kernels together¶

Kernels¶

Effect of bandwidth hyperparameter $\alpha$¶

KDE Formula¶

CO2 Emissions¶

Functional relations¶

Transformations¶

A synthetic example¶

Kepler's third law¶

	Income Ratio (M/F)
Education
<HS	1.264059
HS	1.299308
<BA	1.319213
BA	1.294301
>BA	1.320305

	Income Ratio (F/M)
Education
<HS	0.791103
HS	0.769640
<BA	0.758028
BA	0.772618
>BA	0.757401

	Cancer	Abortion
Year
2006	2007371	289750
2013	935573	327000