Lecture 21 – Data 100, Spring 2022¶

by Lisa Yan

Adapted from Josh Hug, Joey Gonzalez, Ani Adhikari, Suraj Rampure

This notebook accompanies the lecture on Logistic Regression and was updated to incorporate the new video-notebook format. If you have not already watched the accompanying lecture, you should do that first.

In this notebook we walk through the (mis)application of least-squares regression to a binary classification task. In the process, we will show why a different model and loss function are needed. We will then demonstrate how to use the scikit-learn logistic regression model.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import cufflinks as cf
cf.set_config_file(offline=True, sharing=False, theme='ggplot');

from scipy.optimize import minimize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [2]:
# formatting options

# big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size
    plt.rcParams['font.size'] = SMALL_SIZE
    plt.rcParams['axes.titlesize'] = SMALL_SIZE
    plt.rcParams['axes.labelsize'] = MEDIUM_SIZE
    plt.rcParams['xtick.labelsize'] = SMALL_SIZE
    plt.rcParams['ytick.labelsize'] = SMALL_SIZE
    plt.rcParams['legend.fontsize'] = SMALL_SIZE
    plt.rcParams['figure.titlesize'] = BIGGER_SIZE
    # plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    # plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    # plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    # plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    # plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    # plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    # plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
def savefig(fname):
    if not os.path.exists("images"):
        os.mkdir("images")
    fig = plt.gcf()
    fig.patch.set_alpha(0.0)
    plt.savefig(f"images/{fname}.png", bbox_inches = 'tight');
    
    
# plt.rcParams['figure.figsize'] = (4, 4)
plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3

plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
#sns.set()
adjust_fontsize(20)

(Notebook setup) Obtaining the Data¶

For this lecture, we will use the Wisconsin Breast Cancer Dataset, which we can obtain from scikit-learn.

This dataset consists of measurements from tumor biopsies for 569 patients as well as whether the tumor was malignant or benign.

In [3]:
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
data
Out[3]:
[DataFrame preview: 569 rows × 30 columns of tumor measurements, from 'mean radius', 'mean texture', 'mean perimeter', ... through 'worst symmetry' and 'worst fractal dimension'.]

The Prediction Task¶

The prediction task for this data is to predict whether a tumor is benign or malignant (a binary decision) given characteristics of that tumor. As a classic machine learning dataset, the label is provided separately as the target array (data_dict['target']), where 0 means malignant and 1 means benign. To put the data back in its original context we will create a new column called "malignant" which will be 1 if the tumor is malignant and 0 if it is benign (reversing the encoding of target).

In [4]:
# In data_dict['target'], 0 means malignant and 1 means benign
data['malignant'] = (data_dict['target'] == 0).astype(int)

Selecting Features¶

What features might be a good indication of whether a tumor is benign or malignant?

In [5]:
data.columns
Out[5]:
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'malignant'],
      dtype='object')
In [6]:
data[['mean radius', 'malignant']]
Out[6]:
mean radius malignant
0 17.99 1
1 20.57 1
2 19.69 1
3 11.42 1
4 20.29 1
... ... ...
564 21.56 1
565 20.13 1
566 16.60 1
567 20.60 1
568 7.76 0

569 rows × 2 columns

Perhaps a good starting point is the size of the tumor. Larger tumors are probably more likely to be malignant. In the following, we plot whether the tumor was malignant (1 or 0) against the "mean radius".

In [7]:
sns.jointplot(data = data, x = "mean radius", y = "malignant");
#savefig("overplot")

This is a clear example of over-plotting. We can improve the above plot by jittering the data:

In [8]:
# manual to allow for jitter
g = sns.JointGrid(data = data, x = "mean radius", y = "malignant")
g.plot_marginals(sns.histplot)
g.plot_joint(sns.stripplot,
             orient='h', order=[1, 0],
             color=sns.color_palette()[0])
(g.ax_joint).set_xticks([10, 15, 20, 25])
savefig("jitter")
plt.show()

Perhaps a better way to visualize the data is using stacked histograms.

In [9]:
sns.histplot(data = data, x = "mean radius", hue = "malignant",
             binwidth=0.5,
             kde=True);
plt.legend(labels=["Malignant", "Benign"])
Out[9]:
<matplotlib.legend.Legend at 0x7ff425271fa0>

Question: Looking at the above histograms, could you describe a rule to predict whether or not a tumor is malignant?
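For instance, one very crude rule suggested by the histograms (purely illustrative; the cutoff of 14 below is chosen by eye, not derived in lecture) is to predict malignant whenever the mean radius exceeds a fixed cutoff:

# A crude eyeballed rule: predict malignant when mean radius exceeds a cutoff.
cutoff = 14   # hypothetical cutoff, chosen by looking at the histograms above
rule_pred = (data['mean radius'] > cutoff).astype(int)
rule_accuracy = (rule_pred == data['malignant']).mean()
print(f"Accuracy of the 'mean radius > {cutoff}' rule: {rule_accuracy:.3f}")

Logistic regression will give us a principled way of choosing and softening this kind of threshold.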

Preparing the Data: Train-Test Split¶

Always split your data into training and test groups.

In [10]:
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.10, random_state=42)
data_tr.reset_index(inplace=True, drop=True)
data_te.reset_index(inplace=True, drop=True)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Training Data Size:  512
Test Data Size:  57

Creating the X matrix and Y vector for the training data:

In [11]:
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].to_numpy()

Why Not Use Least-Squares Linear Regression?¶

"I suppose it is tempting, if the only tool you have is a hammer, to treat everything as if it were a nail." - Abraham Maslow The Psychology of Science

We would like to predict whether the tumor is malignant from the size of the tumor. We have encoded whether a tumor is malignant or benign as 1 or 0. Those are numbers that we could pretend are continuous and directly apply least squares regression. Why not start there?

In the following, we use Seaborn's simple linear regression fit. Note, we will not use any regularization since this is a really simple one-dimensional model with a comparatively large training dataset.

How well does our model fit the data?

In [12]:
plt.figure(figsize=(6, 6))
sns.regplot(data=data_tr, x='mean radius', y='malignant', 
              y_jitter = 0.1, 
              ci=False,
              line_kws={'color':'green'});
plt.ylim((-0.5, 1.5))
savefig("least_squares")



Among various issues (such as the fit being hard to interpret), the OLS model is very sensitive to outliers.

Suppose we had a training datapoint with an extra-large $x$. See how the least-squares fit gets pulled toward that single outlier point: if we classify by thresholding the fitted line at 0.5, the predicted class changes for a large set of our datapoints (a quick check after the next plot quantifies this).

In [13]:
data_out = data_tr[['mean radius', 'malignant']].copy()

# replace a datapoint with a big outlier
# (mean radius = 120, malignant = 1) 
data_out.iloc[0, :2] = [120, 1]
data_out.head()

lin_reg = LinearRegression()
lin_reg.fit(data_tr[['mean radius']], data_tr['malignant'])
m, b = lin_reg.coef_[0], lin_reg.intercept_
vert_x = (0.5 - b)/m
In [14]:
plt.figure(figsize=(6, 6))
sns.regplot(data = data_out, x = "mean radius", y = "malignant", 
              y_jitter = 0.1, 
              ci=False,
              line_kws={'color':'orange', 'label': 'w/outlier'},
           );
x_datapoints = np.array(plt.xlim())
plt.plot(x_datapoints, (x_datapoints)*m + b, color='green', label='original')
plt.ylim((-0.5, 1.5))
plt.xlim((6.67005, 29.13095))
plt.legend()
savefig("outlier")
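To quantify how much the outlier moves things, here is a quick check (a sketch that reuses data_out, data_tr, LinearRegression, and vert_x from the cells above): refit least squares on the data containing the outlier and compare where each fitted line crosses 0.5, i.e. where the implied classification flips.

# Refit on the data that contains the outlier and compare the 0.5-crossings.
lin_reg_out = LinearRegression()
lin_reg_out.fit(data_out[['mean radius']], data_out['malignant'])
m_out, b_out = lin_reg_out.coef_[0], lin_reg_out.intercept_
vert_x_out = (0.5 - b_out) / m_out

print(f"0.5-crossing without the outlier: {vert_x:.2f}")
print(f"0.5-crossing with the outlier:    {vert_x_out:.2f}")

# How many training points change predicted class when the boundary moves?
x_tr = data_tr['mean radius']
flipped = ((x_tr > vert_x) != (x_tr > vert_x_out)).sum()
print(f"{flipped} of {len(x_tr)} training points change predicted class")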

Graph of Averages¶

We need a better model. Let's try to replicate the graph of averages for Simple Linear Regression from Data 8 (textbook), but now for classification instead. Recall, we

  • binned the $x$ axis.
  • took the average $y$ value for each bin on the $x$ axis.

We will do the same thing here, albeit with slightly different code. Here, we will formally partition the $x$-axis into 20 bins.

In [15]:
# partition the training x data
bins = pd.cut(data_tr['mean radius'], 20) # Series of bin labels
data_tr['bin midpoint'] = [(b.left + b.right) / 2 for b in bins]

# then, get the average y per bin
avg_y_by_bin = data_tr.groupby('bin midpoint')['malignant'].mean()
avg_y_by_bin
Out[15]:
bin midpoint
8.1915     0.000000
9.2225     0.000000
10.2435    0.000000
11.2645    0.057143
12.2855    0.094595
13.3065    0.200000
14.3275    0.333333
15.3485    0.674419
16.3695    0.750000
17.3900    0.944444
18.4105    1.000000
19.4315    1.000000
20.4525    1.000000
21.4735    1.000000
22.4945    1.000000
23.5155    1.000000
24.5365    1.000000
25.5575    1.000000
27.5995    1.000000
Name: malignant, dtype: float64
In [16]:
plt.figure(figsize=(6,6))
sns.regplot(data = data, x = "mean radius", y = "malignant", 
              y_jitter = 0.1, 
              ci=False,
              line_kws={'color':'green', 'label': 'OLS'},
           );

plt.plot(avg_y_by_bin, 'r', linewidth = 5, label='average y');
plt.ylim((-0.5, 1.5))
plt.legend()
savefig("graph_of_averages")

It seems like our red graph of averages does a much better job at matching the data than our simple linear regression line.

Q: What is this graph of averages plotting?

For a given $x$, suppose the $y$ values in its bin are as follows, producing this average $y$:

$$ [0, 0, 1, 1, 1, 1, 0, 0, 0, 0] \rightarrow 0.4 $$

For this bin, the average y value is:

  • A number between 0 and 1.
  • The fraction (relative frequency) of 1’s in the bin.

The average y for a bin is therefore a probability!!

$$P(Y = 1 | x) = \frac{\#\,(y = 1) \text{ in bin}}{\#\,\text{datapoints in bin}}$$
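A tiny arithmetic check of the example bin above, verifying that the bin average equals this fraction:

# The average of the y values in a bin is the fraction of 1's in that bin,
# i.e. an empirical estimate of P(Y = 1 | x).
bin_ys = [0, 0, 1, 1, 1, 1, 0, 0, 0, 0]
print(np.mean(bin_ys))               # 0.4
print(sum(bin_ys) / len(bin_ys))     # same number: #(y == 1) / #datapoints in bin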

Logistic regression aims to model the probability of an observation belonging to class 1, given some set of features.



Logistic Regression is what we call a generalized linear model.

  • It is a non-linear transformation of a linear model.
  • So the quantity being transformed is still a linear combination of the features $x$ (see the short sketch below).
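Here is a minimal sketch of that idea (illustrative only; the function names below are made up for this note): the model first forms a linear combination of the features, then pushes it through a non-linear link that squashes it into $(0, 1)$.

# Sketch of the generalized-linear-model structure of logistic regression.
def linear_part(x, theta_0, theta_1):
    return theta_0 + theta_1 * x          # still linear in x

def logistic_model(x, theta_0, theta_1):
    t = linear_part(x, theta_0, theta_1)  # linear combination of the features
    return 1 / (1 + np.exp(-t))           # non-linear squashing into (0, 1)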

In this notebook, we'll do Step 1 of deriving the logistic model.

1. Transform the y-axis non-linearly until the “S curve” looks linear.¶

First let's rename our variables from earlier so that we simplify some notation.

In [17]:
transform_df = pd.DataFrame({"x": avg_y_by_bin.index,
                        "p": avg_y_by_bin.values})

sns.lineplot(data=transform_df, x='x', y='p', color='r', linewidth = 5)
savefig('transform_1')


The odds of an event are defined as the probability that it happens divided by the probability that it doesn't happen.

If some event happens with probability $p$, then

$$\text{odds}(p) = \frac{p}{1-p}$$

Let's compute the odds of our probabilities.

If we plot these odds, they look exponential:

In [18]:
transform_df['odds'] = transform_df['p'] / (1 - transform_df['p'])

sns.lineplot(data=transform_df,
             x='x', y='odds', linewidth = 5)
savefig("transform_2")

But if we take the log of these odds:

In [19]:
transform_df['log-odds'] = np.log(transform_df['odds'])

sns.lineplot(data=transform_df,
             x='x', y='log-odds',
             color='g', linewidth = 5)
savefig("transform_3")
/opt/conda/lib/python3.9/site-packages/pandas/core/arraylike.py:364: RuntimeWarning:

divide by zero encountered in log

(This warning is expected: bins whose empirical probability is exactly 0 have odds of 0, and taking the log of 0 triggers it; bins with probability exactly 1 have infinite odds. Both end up with infinite log-odds, which do not appear on the plot.)

2. Then, use math to invert all transformations.¶

(This portion is left to the lecture slides.)

We noticed that the log-odds grow roughly linearly with $x$.

In the lecture slides, we formalize what this means, and how this allows us to arrive at the sigmoid (logistic) function defined below.
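As a quick preview of that algebra (the full derivation is in the slides): if the log-odds are linear in $x$, solving for the probability $p$ recovers the sigmoid.

$$\log \frac{p}{1-p} = \theta_0 + \theta_1 x \implies \frac{p}{1-p} = e^{\theta_0 + \theta_1 x} \implies p = \frac{1}{1 + e^{-(\theta_0 + \theta_1 x)}} = \sigma(\theta_0 + \theta_1 x)$$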

The Logistic Function¶

In the slides, we show that our model is

$$P(Y = 1 | x) = \sigma(x^T \theta)$$

where $$\sigma(t) = \frac{1}{1 + e^{-t}}$$

Let's explore the shape of the logistic function, $\sigma$.

In [20]:
# the logistic function
# known as the sigmoid function in ML
def sigma(t):
    return 1 / (1 + np.exp(-t))
In [22]:
theta_0 = -13.8
theta_1 = 0.937

sns.lineplot(data=transform_df, x='x', y='p', color='r',
             linewidth = 5, label='empirical (true)')
x = transform_df['x']
pred_p = sigma(theta_0 + theta_1 * x)
plt.plot(x, pred_p, 'k--', linewidth = 5, label='predicted');
plt.xlabel('x');
plt.ylabel('P(Y = 1 | x)')
plt.legend()
savefig("overlay")

We'll explain how we found these optimal parameter values by the end of lecture, but for now hopefully you're convinced that the S-shaped curve really is a logistic function.
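One way to get numbers in this ballpark (a sketch only, not necessarily the method used later in lecture, and sensitive to the binning): since the log-odds looked roughly linear in $x$, fit an ordinary least-squares line to the finite log-odds in transform_df and read off the intercept and slope as $\theta_0$ and $\theta_1$.

# Sketch: estimate (theta_0, theta_1) from the graph-of-averages log-odds.
finite = np.isfinite(transform_df['log-odds'])   # drop the p = 0 and p = 1 bins
slope, intercept = np.polyfit(transform_df.loc[finite, 'x'],
                              transform_df.loc[finite, 'log-odds'], deg=1)
print(f"theta_0 (intercept) ≈ {intercept:.1f}, theta_1 (slope) ≈ {slope:.3f}")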

In [23]:
def flatten(li): 
    return [item for sub in li for item in sub]

bs = [-2, -1, -0.5, 2, 1, 0.5]
xs = np.linspace(-10, 10, 100)

fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(10, 6))
for ax, b in zip(flatten(axes), bs):
    ys = sigma(xs * b)
    ax.plot(xs, ys)
    ax.set_title(r'$ \theta_1 = $' + str(b))

# add a big axes, hide frame
fig.add_subplot(111, frameon=False)
# hide tick and tick label of the big axes
plt.tick_params(labelcolor='none', top=False, bottom=False,
                left=False, right=False)
plt.grid(False)
plt.xlabel('$x$')
plt.ylabel(r'$ \frac{1}{1+\exp(-\theta_1 \cdot x)} $')
plt.tight_layout()
savefig('sigmoids')  # our savefig helper appends ".png" itself

Let's explore the shape of $\sigma(\theta_0 + \theta_1x)$ for different values of $\theta_0$ and $\theta_1$. There's quite a bit going on here, so let's use plotly.
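Here is one way such an exploration might look (a sketch using plotly.express over an arbitrary grid of parameter values; the interactive figure shown in lecture may differ).

# Sketch: small multiples of sigma(theta_0 + theta_1 * x) over a grid of parameters.
xs_grid = np.linspace(-10, 10, 200)
frames = []
for t0 in [-3, 0, 3]:
    for t1 in [-1, 0.5, 2]:
        frames.append(pd.DataFrame({'x': xs_grid,
                                    'p_hat': sigma(t0 + t1 * xs_grid),
                                    'theta_0': t0,
                                    'theta_1': t1}))
grid_df = pd.concat(frames)

fig = px.line(grid_df, x='x', y='p_hat', facet_col='theta_1', facet_row='theta_0',
              labels={'p_hat': 'sigma(theta_0 + theta_1 x)'})
fig.show()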

Logistic Regression with Squared Loss¶

We've chosen a model. It's now time to choose a loss function. Why not squared loss?

In [24]:
def mse_loss_train_nobias(theta):
    x = data_tr['mean radius']
    y_obs = data_tr['malignant']
    y_hat = sigma(x * theta)
    return np.mean((y_hat - y_obs) ** 2)  
In [25]:
thetas = np.linspace(-10, 10, 100)
plt.plot(thetas, [mse_loss_train_nobias(theta) for theta in thetas])
plt.ylabel('MSE')
plt.xlabel(r'$\theta$');



What????

Toy Dataset: L2 Loss¶

Consider this manufactured dataset.

In [26]:
toy_df = pd.DataFrame({
        "x": [-4, -2, -0.5, 1, 3, 5],
        "y": [0, 0, 1, 0, 1, 1]
        #"x": [-9.5, -5.8, -4.0, 8.7, 12.0],
        #"y": [1, 0, 0, 1, 0]
})
toy_df.sort_values("x")
Out[26]:
x y
0 -4.0 0
1 -2.0 0
2 -0.5 1
3 1.0 0
4 3.0 1
5 5.0 1
In [27]:
sns.scatterplot(data=toy_df, x='x', y='y', s=100)
plt.title("Toy classification data")
savefig("toy")

Let's plot the loss surface for this toy data using squared loss with the model $\hat{p} = \sigma(\theta x)$. We don't include an intercept term, so $\theta$ and $x$ are both scalars.

In [28]:
def mse_loss_toy_nobias(theta):
    p_hat = sigma(toy_df['x'] * theta)
    return np.mean((toy_df['y'] - p_hat)**2)

thetas = np.linspace(-10, 10, 100)
plt.plot(thetas, [mse_loss_toy_nobias(theta) for theta in thetas])
plt.title("MSE on toy classification data");
plt.xlabel(r'$\theta$')
plt.ylabel('MSE')
savefig("toy_mse")

This loss surface is not convex! Depending on where we start our optimization search, we'll end up with different results. Let's explore with scipy.optimize.minimize.

In [29]:
best_theta = minimize(mse_loss_toy_nobias, x0 = 0)["x"][0]
best_theta
Out[29]:
0.5446601825581691

Remember, our 1-D model with no intercept is $\hat{p} = \sigma(\theta x)$.

In [30]:
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, sigma(xs * best_theta), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title(r'$\hat{\theta} = $' + f"{best_theta:.4}")
savefig("toy_best_mse_1")


Let's try another starting point for minimizing theta.

In [31]:
best_theta_2 = minimize(mse_loss_toy_nobias, x0 = -5)["x"][0]
best_theta_2
Out[31]:
-10.343653061026611

Uhhh, looks like the optimizer got stuck.

In [32]:
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, sigma(xs * best_theta_2), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title(r'$\hat{\theta} = $' + f"{best_theta_2:.4}")
savefig("toy_best_mse_2")

Not only is the loss surface non-convex, leading to the optimization issues above, but squared loss just isn't well-suited to a probability task. Since $\hat{p}_i$ is between 0 and 1, and $y_i$ is either 0 or 1, the squared loss for a single point, $(y_i - \hat{p}_i)^2$, is bounded between 0 and 1.

What this means in practice: even if our prediction is terrible, the squared loss is never that large.

In [33]:
p_hat = np.arange(0.001, 0.999, 0.01)
loss = (1 - p_hat)**2
plt.plot(p_hat, loss, color='k')
plt.xlabel(r'$\sigma({x^T \theta})$')
plt.ylabel(r'$(1 - \hat{y})^2$')
plt.title('Squared Loss for One Individual');
savefig("mse_individual")

Motivating Cross-Entropy Loss¶

Let's look at a new loss, called the log loss, for when our true observation is 1.

In [34]:
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 1$');
savefig("entropy_y1")

We can see that this penalizes wrong predictions far more than squared loss does.

How to read this plot: Suppose the observation we're trying to predict is actually in class 1. If our model gives an 80% chance of being in class 1, the loss is relatively small (around 0.22).

If we give only a 40% chance of being in class 1, the loss is larger (around 0.92).

If we give only a 5% chance of being in class 1, the loss is about 3.

And if we give a 0% chance of being in class 1, the loss is infinite.
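A quick check of those numbers, using the same $-\log(p)$ loss plotted above:

# Cross-entropy loss when y = 1, for predicted probabilities of 0.8, 0.4, and 0.05.
print(-np.log([0.8, 0.4, 0.05]))   # roughly [0.22, 0.92, 3.0]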

What about when the true observation is 0?

In [35]:
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(1 - p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(1 - p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 0$');
savefig("entropy_y0")

Much of the formal derivation is in the slides. But the equation for cross-entropy loss for a single observation is

$$\textrm{loss} = -y \log(\hat{y}) - (1-y)\log(1-\hat{y})$$

For us, since $\hat{y} = \sigma(x^T \theta)$, the expression for average cross-entropy loss is

$$R(\theta) = -\frac{1}{n} \sum_{i = 1}^n \big(y_i \log (\sigma(\mathbb{X}_i^T \theta)) + (1 - y_i) \log (1 - \sigma(\mathbb{X}_i^T \theta))\big)$$

Let's look at the loss surface for average cross-entropy loss, on our toy data from before.

In [36]:
def cross_entropy(y, phat):
    return - y * np.log(phat) - (1 - y) * np.log(1 - phat)
In [37]:
def mce_loss_toy_nobias(theta):
    p_hat = sigma(toy_df['x'] * theta)
    return np.mean(cross_entropy(toy_df['y'], p_hat))
In [38]:
thetas = np.linspace(-4, 4, 100)
plt.plot(thetas, [mce_loss_toy_nobias(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy($\theta$)')
plt.xlabel(r'$\theta$');
savefig("toy_mce")
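As a quick sanity check (a sketch): this surface looks convex, so minimizing it from different starting points should land on the same $\hat{\theta}$, unlike the squared-loss surface earlier.

# Minimize mean cross-entropy on the toy data from two different starting points.
for start in [0, -5]:
    theta_hat = minimize(mce_loss_toy_nobias, x0=start)["x"][0]
    print(f"start = {start:>2}: theta_hat = {theta_hat:.4f}")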