import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# import cufflinks as cf
# cf.set_config_file(offline=True, sharing=False, theme='ggplot');
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# Formatting options
# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size
    plt.rcParams['font.size'] = SMALL_SIZE
    plt.rcParams['axes.titlesize'] = SMALL_SIZE
    plt.rcParams['axes.labelsize'] = MEDIUM_SIZE
    plt.rcParams['xtick.labelsize'] = SMALL_SIZE
    plt.rcParams['ytick.labelsize'] = SMALL_SIZE
    plt.rcParams['legend.fontsize'] = SMALL_SIZE
    plt.rcParams['figure.titlesize'] = BIGGER_SIZE
def savefig(fname):
    if not SAVE_FIGURES_FLAG:
        # Avoid memory overload
        return
    if not os.path.exists("images"):
        os.mkdir("images")
    fig = plt.gcf()
    fig.patch.set_alpha(0.0)
    plt.savefig(f"images/{fname}.png", bbox_inches='tight');
plt.rcParams['lines.linewidth'] = 3
plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
adjust_fontsize(20)
SAVE_FIGURES_FLAG = False
toy_df = pd.DataFrame({"x": [-1, -.75, -.5, -.25, .3, .4, 1, 1.2, 3],
"y": [ 0, 0, 0, 0, 1, 1, 1, 1, 1]})
toy_df.sort_values("x")
|  | x | y |
|---|---|---|
0 | -1.00 | 0 |
1 | -0.75 | 0 |
2 | -0.50 | 0 |
3 | -0.25 | 0 |
4 | 0.30 | 1 |
5 | 0.40 | 1 |
6 | 1.00 | 1 |
7 | 1.20 | 1 |
8 | 3.00 | 1 |
plt.scatter(data=toy_df, x='x', y='y', s=100);
plt.xlabel('Data')
plt.ylabel('Class');
The classification task includes two steps: first compute the classification activation $z = f_\theta(x)$ from the features, then apply a decision rule that maps $z$ to a class.
In the above example, if we assume that the only feature $x$ is used for classification and the associated weights for $f_\theta$ are $\theta_0 = 0$ and $\theta_1 = 1$, then $f_\theta(x) = z = x$.
If we choose a step function with a decision threshold of 0 (the sign function), we obtain the following classification rule:
$$ \text{classify}(x) = \begin{cases} 1, &\quad\text{if}\ \ f_\theta(x) \geq 0 \\ 0, &\quad\text{if}\ \ f_\theta(x) < 0 \end{cases}$$
color_condition = (((toy_df['x'] < 0) & (toy_df['y'] == 0)) |
                   ((toy_df['x'] >= 0) & (toy_df['y'] == 1)))
color = np.where(color_condition, 'g', 'r')  # green: classified correctly, red: misclassified
plt.scatter(data=toy_df, x='x', y='y', s=100, c=color)
plt.xlabel('Data')
plt.ylabel('Class')
plt.step([-3,0,4], [0, 1, 1], where='post', color='b')
plt.title("Binary Classification with Sign Function Decision Rule");
What if, instead of a 0-1 output, we predict the probability of each data point belonging to class 0 or class 1, and use a 50% probability threshold to classify data points? The classifier in the probabilistic model will be:
$$ \text{classify}(x) = \begin{cases} 1, &\quad\text{if}\ \ P_\theta(Y = 1 | x) \geq 0.5 \\ 0, &\quad\text{if}\ \ P_\theta(Y = 1 | x) < 0.5 \end{cases}$$
This approach also lets us evaluate the quality of the model based on how well its predicted probabilities match the observed labels, rather than only on the final 0/1 predictions.
Our goal in a probabilistic model is to train weights for a model that returns probability values $P_{\hat{\theta}}(Y = 1 | x)$. This is a conditional probability function that outputs a value in the range $[0,1]$ having seen the data point $x$.
We follow similar steps as before:
The function that we are interested in finding has one input, $z = x^T\theta$, and outputs a probability. So the domain of the function is $(-\infty,+\infty)$ and its range is $[0,1]$. What function has these properties? Let's find out.
The odds of an event is the probability of it happening divided by the probability of it not happening. If $p$ is the probability of the event happening, then:
$$\text{odds} = \frac{p}{1-p}$$
The odds function takes a probability as input, so its domain is $[0,1)$, while its range is $[0,+\infty)$. Let's see the shape of the odds function.
def odds(p):
    return p / (1 - p)
p = np.linspace(0, 0.99, 1000)
odds_p = odds(p)
fig = px.line(x=p, y=odds_p, title="Odds Ratio")
fig.update_xaxes(title="Probability")
fig.update_yaxes(title="Odds Ratio")
fig.show()
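As a quick worked example using the `odds` helper above: an even chance ($p = 0.5$) gives odds of 1, and $p = 0.75$ gives odds of 3, meaning the event is three times as likely to happen as not.
odds(0.5), odds(0.75)  # (1.0, 3.0)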
In the previous part, we saw how a probability can be mapped into the range of all non-negative numbers. Since the classification activation $z$ can take any value in $(-\infty, +\infty)$, we now want to derive a function that maps probabilities in $(0,1)$ to $(-\infty, +\infty)$ (the inverse of the function we are eventually looking for).
How about we take the $\log$ of the function we just derived? If you take the $\log$ of values in the range $(0, +\infty)$, here is what you will get:
$$ \log(x) \in \begin{cases} (-\infty, 0], &\quad\text{if}\ \ x \in (0,1] \\ (0, +\infty), &\quad\text{if}\ \ x \in (1, +\infty) \end{cases}$$
and hence:
$$\log\left(\frac{p}{1-p}\right) \in (-\infty, +\infty)$$
This function is called the logit function. Let's see what it looks like.
# Skip p = 0, where the odds are 0 and the log is undefined
logit = np.log(odds_p[1:])
fig = px.line(x=p[1:], y=logit, title="Logit function")
fig.update_xaxes(title="Probability")
fig.update_yaxes(title="Log of the odds ratio")
fig.show()
Since the logit function is a mapping from $(0,1)$ to $(-\infty, +\infty)$, its inverse is a function that maps a real number (in our classification task, the classification activation $z$) to a probability value in $(0,1)$.
Let's derive the inverse of the logit function:
$$
\begin{aligned}
z &= \log\left(\frac{p}{1-p}\right) \\
e^z &= \frac{p}{1-p} \\
(1-p)e^z &= p \\
e^z - pe^z &= p \\
e^z &= p + pe^z \\
e^z &= p(1+e^z) \\
p &= \frac{e^z}{1+e^z} \\
p &= \frac{1}{1+e^{-z}}
\end{aligned}
$$
This function is called the sigmoid or logistic function; it maps a real number in $(-\infty, +\infty)$ to a probability. Let's see what this function looks like.
def logistic(z):
    return 1 / (1 + np.exp(-z))
z = np.linspace(-20, 20, 1000)
sig = logistic(z)
fig = px.line(x=z, y=sig, title="Logistic function")
fig.update_xaxes(title="Classification Activation")
fig.update_yaxes(title="Logistic/Probability")
fig.show()
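As a quick numerical sanity check (a small sketch; `logit_fn` below is a throwaway helper, since the name `logit` above already holds an array), the logistic function undoes the logit:
def logit_fn(p):
    # log-odds, the inverse of the logistic function
    return np.log(p / (1 - p))
p_check = np.linspace(0.01, 0.99, 50)
np.allclose(logistic(logit_fn(p_check)), p_check)  # expect True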
Now let's see a few logistic functions with different parameters applied to our previous dataset.
def sigmoid_plot(theta_0, theta_1, c):
    z = np.linspace(-5, 5, 100)
    f = sns.lineplot(x=z, y=logistic(theta_0 + theta_1 * z),
                     color=c, linewidth=3,
                     label=r'$\hat{P}$' + r'$(Y=1|{0} + {1}*x)$'.format(theta_0, theta_1))
    plt.setp(f.get_legend().get_texts(), fontsize='10')
plt.figure(figsize=(6,6))
sns.scatterplot(data=toy_df, x='x', y='y', s=100)
plt.xlabel('x')
plt.ylabel('y: class')
plt.xlim((-5, 5))
sigmoid_plot(0, 1, 'green')
sigmoid_plot(1, 1, 'blue')
sigmoid_plot(0, 5, 'red')
For this lecture, we will use the same Wisconsin Breast Cancer Dataset from scikit-learn. This dataset consists of measurements from tumor biopsies for 569 patients, as well as whether each tumor was malignant or benign.
import sklearn.datasets
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
data
|  | mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | 0.2419 | 0.07871 | ... | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | 0.1812 | 0.05667 | ... | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | 0.2069 | 0.05999 | ... | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | 0.2597 | 0.09744 | ... | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | 0.1809 | 0.05883 | ... | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | 0.1726 | 0.05623 | ... | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | 0.1752 | 0.05533 | ... | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | 0.1590 | 0.05648 | ... | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | 0.2397 | 0.07016 | ... | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | 0.1587 | 0.05884 | ... | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 30 columns
The prediction task for this data is to predict whether a tumor is benign or malignant (a binary decision) given characteristics of that tumor. As a classic machine learning dataset, the prediction task is captured by `data_dict['target']`, where 0 is malignant and 1 is benign. The line `data['malignant'] = (data_dict['target'] == 0).astype(int)` puts the data back in its original context: the new column is 1 if the tumor is malignant and 0 if it is benign (reversing the definition of `target`).
# Target data_dict['target'] = 0 is malignant 1 is benign
data['malignant'] = (data_dict['target'] == 0).astype(int)
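Before modeling, it is worth checking the class balance (a quick sketch; nothing below depends on it):
# How many tumors fall in each class?
data['malignant'].value_counts()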
What features might be a good indication of whether a tumor is benign or malignant?
data.columns
Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'mean smoothness', 'mean compactness', 'mean concavity', 'mean concave points', 'mean symmetry', 'mean fractal dimension', 'radius error', 'texture error', 'perimeter error', 'area error', 'smoothness error', 'compactness error', 'concavity error', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness', 'worst compactness', 'worst concavity', 'worst concave points', 'worst symmetry', 'worst fractal dimension', 'malignant'], dtype='object')
data[['mean radius', 'malignant']]
|  | mean radius | malignant |
|---|---|---|
0 | 17.99 | 1 |
1 | 20.57 | 1 |
2 | 19.69 | 1 |
3 | 11.42 | 1 |
4 | 20.29 | 1 |
... | ... | ... |
564 | 21.56 | 1 |
565 | 20.13 | 1 |
566 | 16.60 | 1 |
567 | 20.60 | 1 |
568 | 7.76 | 0 |
569 rows × 2 columns
Perhaps a good starting point is the size of the tumor. Larger tumors are probably more likely to be malignant. In the following, we plot whether the tumor was malignant (1 or 0) against the `mean radius`.
sns.jointplot(data = data, x = "mean radius", y = "malignant");
This is a clear example of over-plotting. We can improve the above plot by jittering the data:
# Manual to allow for jitter
g = sns.JointGrid(data = data, x = "mean radius", y = "malignant")
g.plot_marginals(sns.histplot)
g.plot_joint(sns.stripplot,
orient='h', order=[1, 0],
color=sns.color_palette()[0])
(g.ax_joint).set_xticks([10, 15, 20, 25])
plt.show()
Perhaps a better way to visualize the data is using stacked histograms.
sns.histplot(data = data, x = "mean radius",
hue = "malignant",
binwidth=0.5,
kde=True);
plt.legend(labels=["Malignant", "Benign"]);
Question: Looking at the above histograms, could you describe a rule to predict whether or not a tumor is malignant?
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.10, random_state=42)
data_tr.reset_index(inplace=True, drop=True)
data_te.reset_index(inplace=True, drop=True)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))
Training Data Size:  512
Test Data Size:  57
Creating the `X` and `Y` matrices for the training data:
# X, Y are training data
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].to_numpy()
What's the best model that fits our data? We will discuss this in the next section.
The `linear_model.LogisticRegression` model (documentation) is what we want to use here. In order to recreate our specific model, there are a few parameters we need to set:
- `penalty = 'none'`: by default, `LogisticRegression` uses regularization. This is generally a good idea, which we'll discuss later.
- `fit_intercept = True`: our toy model currently has an intercept term.
- `solver = 'lbfgs'`: we need to specify a numerical optimization routine for the model (similar to gradient descent). `lbfgs` is one such routine; it's the new default in `scikit-learn`.
We'll fit a model with the mean radius feature and a bias intercept. So $\theta = (\theta_0, \theta_1)$ and our model is:
$$\hat{P}_{\theta} (Y = 1 | x) = \sigma(x^T\theta) = \sigma(\theta_0 + \theta_1 x_1)$$
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(fit_intercept=True)
model.fit(X, Y); # X, Y are training data
Optimal $\theta_0, \theta_1$ from fitting our model:
model.intercept_, model.coef_
(array([-14.42394402]), array([[0.97889232]]))
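If you want to spell out the settings listed above explicitly (no regularization, explicit solver), a sketch of the call might look like the following; note that recent `scikit-learn` versions accept `penalty=None` in place of the older string `'none'`. The parameter estimates above come from the default (regularized) fit, so this variant may give slightly different values.
# Sketch: explicitly unregularized fit with the settings described above
model_unreg = LogisticRegression(penalty=None, fit_intercept=True, solver='lbfgs')
model_unreg.fit(X, Y)
model_unreg.intercept_, model_unreg.coef_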
plt.figure(figsize=(6,6))
# Getting model parameters
theta_0 = model.intercept_[0]
theta_1 = model.coef_[0]
# Getting model predictions
y_pred = np.round(logistic(theta_0 + theta_1 * data_tr['mean radius']))
# Setting colors based on correct and incorrect y_pred
color_palette = {False:'red', True:'green'}
sns.stripplot(x=data_tr['mean radius'].squeeze(), y=data_tr['malignant'],
jitter = 0.1, orient='h', hue= y_pred==data_tr['malignant'], palette = color_palette);
plt.gca().invert_yaxis()
plt.xlabel('x: mean radius')
plt.ylabel('y: class')
plt.xlim((7, 30))
# Plotting the sigmoid function with the given parameters
z = np.linspace(5, 30, 100)
f = sns.lineplot(x= z, y=logistic(theta_0 + theta_1 * z),
color='blue', linewidth=3,
label= r'$\hat{P}$' + r'$(Y=1|{0} + {1}*x)$'.format(theta_0,theta_1))
plt.setp(f.get_legend().get_texts(), fontsize='10');
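As a quick follow-up sketch, we can compute the fraction of training points this 0.5-threshold rule classifies correctly, using the `y_pred` computed in the cell above:
# Training accuracy of the thresholded predictions
(y_pred == data_tr['malignant']).mean()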
Predict probabilities: `scikit-learn` has a built-in `.predict_proba` method that allows us to get the predicted probabilities under our model. The `.classes_` attribute stores our class labels.
model.predict_proba([[20]])
array([[0.00574364, 0.99425636]])
model.classes_
array([0, 1])
Let's visualize these probabilities on our training data, `X` and `Y`.
Prob_hat_one = model.predict_proba(X)[:, 1]
Prob_hat_one.shape
(512,)
The seaborn function `stripplot` auto-plots jittered data by class.
plt.figure(figsize=(6,6))
sns.stripplot(x=X.squeeze(), y=Y,
jitter = 0.1, orient='h');
sns.lineplot(x= X.squeeze(), y=Prob_hat_one,
color='green', linewidth=5, label=r'$\hat{P}_{\theta}(Y = 1 | x)$')
plt.gca().invert_yaxis()
plt.xlabel('x: mean radius')
plt.ylabel('y: class')
Text(0, 0.5, 'y: class')
Predict class labels: By comparison, what does `.predict()` do?
It predicts 1 if the computed probability for class 1 is at least 0.5, and 0 otherwise.
$$\text{classify}(x) = \begin{cases} 1, & P(Y = 1 | x) \geq 0.5 \\ 0, & \text{otherwise} \end{cases}$$
This is what we call classification.
Edit the above cells to see sklearn's prediction for a mean radius of `10`.
model.predict([[20]])
array([1])
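As a sanity-check sketch, `.predict` should agree with thresholding the class-1 probability from `.predict_proba` at 0.5:
# Manually apply the 0.5 threshold and compare with model.predict
manual_preds = (model.predict_proba(X)[:, 1] >= 0.5).astype(int)
np.array_equal(manual_preds, model.predict(X))  # expect True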
We've chosen a model. It's now time to choose a loss function. Why not squared loss?
def mse_loss_train_nobias(theta):
    x = data_tr['mean radius']
    y_obs = data_tr['malignant']
    y_hat = logistic(x * theta)
    return np.mean((y_hat - y_obs) ** 2)
thetas = np.linspace(-10, 10, 100)
plt.plot(thetas, [mse_loss_train_nobias(theta) for theta in thetas])
plt.ylabel('MSE')
plt.xlabel(r'$\theta$');
toy_df = pd.DataFrame({
"x": [-4, -2, -0.5, 1, 3, 5],
"y": [0, 0, 1, 0, 1, 1]
})
toy_df.sort_values("x")
|  | x | y |
|---|---|---|
0 | -4.0 | 0 |
1 | -2.0 | 0 |
2 | -0.5 | 1 |
3 | 1.0 | 0 |
4 | 3.0 | 1 |
5 | 5.0 | 1 |
sns.scatterplot(data=toy_df, x='x', y='y', s=100)
plt.title("Toy classification data")
Text(0.5, 1.0, 'Toy classification data')
Let's plot the loss surface for this toy data using squared loss with the model $\hat{p} = \sigma(\theta x)$. We don't include an intercept term, so $\theta$ and $x$ are both scalars.
def mse_loss_toy_nobias(theta):
    p_hat = logistic(toy_df['x'] * theta)
    return np.mean((toy_df['y'] - p_hat)**2)
thetas = np.linspace(-10, 10, 100)
plt.plot(thetas, [mse_loss_toy_nobias(theta) for theta in thetas])
plt.title("MSE on toy classification data");
plt.xlabel(r'$\theta$')
plt.ylabel('MSE')
Text(0, 0.5, 'MSE')
This loss surface is not convex! Depending on where we start our optimization search, we'll end up with different results. Let's explore with `scipy.optimize.minimize`.
best_theta = minimize(mse_loss_toy_nobias, x0 = 0)["x"][0]
best_theta
0.5446601825581691
Remember, our 1-D model with no intercept is $\hat{p} = \sigma(\theta x)$.
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, logistic(xs * best_theta), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title(r'$\hat{\theta} = $' + f"{best_theta:.4}");
Let's try another starting point for minimizing theta.
best_theta_2 = minimize(mse_loss_toy_nobias, x0 = -5)["x"][0]
best_theta_2
-10.343653061026611
Uhhh, looks like the optimizer got stuck.
sns.scatterplot(data=toy_df, x='x', y='y', s=100, label='y')
xs = np.linspace(-10, 10, 100)
plt.plot(xs, logistic(xs * best_theta_2), color='orange', label=r'$\sigma(x^T \theta)$')
plt.xlabel('x')
plt.legend()
plt.title(r'$\hat{\theta} = $' + f"{best_theta_2:.4}")
Text(0.5, 1.0, '$\\hat{\\theta} = $-10.34')
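To confirm that the second starting point landed somewhere worse, we can compare the mean squared loss at the two solutions (a quick sketch reusing the helper above); the first value should be noticeably smaller.
# MSE at the two candidate minimizers found above
mse_loss_toy_nobias(best_theta), mse_loss_toy_nobias(best_theta_2)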
Not only is it not convex, leading to the weird issues above, but squared loss just isn't well-suited for a probability task. Since $\hat{p}_i$ is between 0 and 1, and $y_i$ is either 0 or 1, the squared loss for a single point $(y_i - \hat{p}_i)^2$ is bounded between 0 and 1.
What this means in practice: even if our prediction is terrible, the squared loss is never that large.
p_hat = np.arange(0.001, 0.999, 0.01)
loss = (1 - p_hat)**2
plt.plot(p_hat, loss, color='k')
plt.xlabel(r'$\sigma({x^T \theta})$')
plt.ylabel(r'$(1 - \hat{y})^2$')
plt.title('Squared Loss for One Individual');
Let's look at a new loss, called the log loss, for when our true observation is 1.
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 1$');
We can see that this penalizes wrong predictions far more than squared loss does.
How to read this plot: Suppose the observation we're trying to predict is actually in class 1. If our model gives an 80% chance of being in class 1, the loss is relatively small (around 0.25).
If we give only a 40% chance of being in class 1, the loss is larger (around 1).
If we give only a 5% chance of being in class 1, the loss is 3.
And if we give a 0% chance of being in class 1, the loss is infinite.
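These numbers come straight from $-\log(p)$; a quick sketch to check them:
# -log(p) at the probabilities mentioned above
[-np.log(p) for p in [0.8, 0.4, 0.05]]  # roughly [0.22, 0.92, 3.0]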
What about when the true observation is 0?
p_hat = np.arange(0.001, 0.999, 0.01)
loss = -np.log(1 - p_hat)
plt.plot(p_hat, loss, color='k')
plt.xlabel('$p$: Probability that y is 1')
plt.ylabel(r'$-\log(1 - p)$')
plt.title('Cross-Entropy Loss for one observation when $y = 0$');
Much of the formal derivation is in the slides. But the equation for cross-entropy loss for a single observation is
$$\textrm{loss} = -y \log(\hat{y}) - (1-y)\log(1-\hat{y})$$
For us, since $\hat{y} = \sigma(x^T \theta)$, the expression for average cross-entropy loss is
$$R(\theta) = -\frac{1}{n} \sum_{i = 1}^n \big(y_i \log (\sigma(\mathbb{X}_i^T \theta)) + (1 - y_i) \log (1 - \sigma(\mathbb{X}_i^T \theta))\big)$$
Let's look at the loss surface for average cross-entropy loss, on our toy data from before.
def cross_entropy(y, phat):
    return -y * np.log(phat) - (1 - y) * np.log(1 - phat)
def mce_loss_toy_nobias(theta):
    p_hat = logistic(toy_df['x'] * theta)
    return np.mean(cross_entropy(toy_df['y'], p_hat))
thetas = np.linspace(-4, 4, 100)
plt.plot(thetas, [mce_loss_toy_nobias(theta) for theta in thetas], color = 'green')
plt.ylabel(r'Mean Cross-Entropy($\theta$)')
plt.xlabel(r'$\theta$');
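Unlike the squared-loss surface, this one is convex, so the optimizer should reach (approximately) the same minimizer regardless of where it starts. A quick sketch, reusing `minimize` and the helper above:
# Different starting points should all converge to essentially the same theta
[minimize(mce_loss_toy_nobias, x0=start)["x"][0] for start in (-5, 0, 5)]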