Lecture 24 Supplemental Notebook¶

Data 100, Spring 2023

Acknowledgments Page

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from scipy.optimize import minimize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
In [2]:
# Formatting options

# Big font helper
def adjust_fontsize(size=None):
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size
    plt.rcParams['font.size'] = SMALL_SIZE
    plt.rcParams['axes.titlesize'] = SMALL_SIZE
    plt.rcParams['axes.labelsize'] = MEDIUM_SIZE
    plt.rcParams['xtick.labelsize'] = SMALL_SIZE
    plt.rcParams['ytick.labelsize'] = SMALL_SIZE
    plt.rcParams['legend.fontsize'] = SMALL_SIZE
    plt.rcParams['figure.titlesize'] = BIGGER_SIZE
    # plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
    # plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
    # plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
    # plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    # plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
    # plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
    # plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
    
def savefig(fname):
    if not SAVE_FIGURES_FLAG:
        # Avoid memory overload
        return
    
    if not os.path.exists("images"):
        os.mkdir("images")
    fig = plt.gcf()
    fig.patch.set_alpha(0.0)
    plt.savefig(f"images/{fname}.png", bbox_inches = 'tight');
    
plt.rcParams['lines.linewidth'] = 3

plt.style.use('fivethirtyeight')
sns.set_context("talk")
sns.set_theme()
#sns.set()
adjust_fontsize(20)


SAVE_FIGURES_FLAG = False

(Notebook setup) Obtaining the Data¶

For this lecture, we will use the Wisconsin Breast Cancer Dataset from scikit-learn. This is the same dataset we used in Lecture 23, so we'll skip the EDA.

Classification task: Given the mean radius of tumor cells in an image, predict if the tumor is malignant (1) or benign (0).

In [3]:
import sklearn.datasets

# Load the data
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])
# In data_dict['target'], 0 is malignant and 1 is benign
data['malignant'] = (data_dict['target'] == 0).astype(int)

# Split the data
from sklearn.model_selection import train_test_split
data_tr, data_te = train_test_split(data, test_size=0.10, random_state=42)
data_tr.reset_index(inplace=True, drop=True)
data_te.reset_index(inplace=True, drop=True)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))

# X, Y are training data
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].to_numpy()
Training Data Size:  512
Test Data Size:  57
In [4]:
# Build the joint plot manually with stripplot so we can jitter the points
g = sns.JointGrid(data = data, x = "mean radius", y = "malignant", height=4)
g.plot_marginals(sns.histplot)
g.plot_joint(sns.stripplot,
             orient='h', order=[1, 0],
             color=sns.color_palette()[0])
(g.ax_joint).set_xticks([10, 15, 20, 25])
plt.show()



sklearn¶

The linear_model.LogisticRegression model (documentation) is what we want to use here. In order to recreate our specific model, there are a few parameters we need to set:

  • penalty = 'none': by default, LogisticRegression applies L2 regularization; this setting turns it off. Regularization is generally a good idea, which we'll discuss later.
  • fit_intercept = False: our toy model does not currently have an intercept term.
  • solver = 'lbfgs': we need to specify a numerical optimization routine for the model (similar to gradient descent). lbfgs is one such routine, and it is the default solver in recent versions of scikit-learn. A minimal instantiation with these settings is sketched below.
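
For reference, here is a minimal sketch of instantiating the model with these settings (this is a sketch only; the model actually fitted in the next section keeps the default penalty and uses fit_intercept=True, since it includes an intercept):

from sklearn.linear_model import LogisticRegression

# Sketch only: the no-regularization, no-intercept toy model described above.
# In scikit-learn >= 1.2, pass penalty=None instead of penalty='none'.
toy_clf = LogisticRegression(penalty='none', fit_intercept=False, solver='lbfgs')
# toy_clf.fit(X, Y) would then fit it on the training data defined above.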



Fit¶

We'll fit a model with the mean radius feature and an intercept (bias) term. So $\theta = (\theta_0, \theta_1)$ and our model is:

$$\hat{P}_{\theta} (Y = 1 | x) = \sigma(x^T\theta) = \sigma(\theta_0 + \theta_1 x_1)$$
In [5]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(fit_intercept=True)
model.fit(X, Y); # X, Y are training data


Optimal $\theta_0, \theta_1$ from fitting our model:

In [6]:
model.intercept_, model.coef_
Out[6]:
(array([-14.42394402]), array([[0.97889232]]))



Prediction¶


Predict probabilities: scikit-learn has a built-in .predict_proba method that returns the predicted probability of each class under our model; its columns are ordered according to the .classes_ attribute, which stores our class labels.

In [7]:
model.predict_proba([[20]])
Out[7]:
array([[0.00574364, 0.99425636]])
In [8]:
model.classes_
Out[8]:
array([0, 1])
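
As a sanity check, here's a small sketch (not from lecture) that reproduces the class-1 probability for a mean radius of 20 by plugging the fitted intercept and coefficient into the sigmoid directly:

# sigma(theta_0 + theta_1 * x) for x = 20; this should match
# model.predict_proba([[20]])[:, 1] above (about 0.9943).
z = model.intercept_[0] + model.coef_[0, 0] * 20
print(1 / (1 + np.exp(-z)))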



Let's visualize these probabilities on our training data, X and Y.

In [9]:
Prob_hat_one = model.predict_proba(X)[:, 1]
print(Prob_hat_one.shape)
(512,)
In [10]:
predictions = model.predict(X)
print(predictions.shape)
(512,)

The seaborn function stripplot automatically jitters the points within each class.

In [11]:
plt.figure(figsize=(6,6))

# Getting model predictions
y_pred = model.predict(X)

# Setting colors based on whether each prediction in y_pred is correct
color_palette = {False:'red', True:'green'}
sns.stripplot(x=X.squeeze(), y=data_tr['malignant'], 
              jitter = 0.1, orient='h', hue= y_pred==data_tr['malignant'], 
              palette = color_palette);

plt.gca().invert_yaxis()
plt.xlabel('x: mean radius')
plt.ylabel('y: class')
plt.xlim((7, 30))

# Plotting the sigmoid function with the fitted parameters
f = sns.lineplot(x= X.squeeze(), y=model.predict_proba(X)[:,1],
                 color='blue', linewidth=3,
                 label= r'$\hat{P}$' + r'$(Y=1|{0:.2f} + {1:.2f}x)$'.format(model.intercept_[0], model.coef_[0, 0]))
plt.setp(f.get_legend().get_texts(), fontsize='10');


Predict class labels: By comparison, what does .predict() do?

It predicts 1 if the computed probability for class 1 is greater than 0.5, and 0 otherwise.

$$\text{classify}(x) = \begin{cases} 1, & P(Y = 1 | x) \geq 0.5 \\ 0, & \text{otherwise} \end{cases}$$

This is what we call classification.

Edit the above cells to see sklearn's prediction for a mean radius of 10.

In [12]:
model.predict([[20]])
Out[12]:
array([1])
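
As a quick check (a sketch, not from lecture), .predict() should agree with thresholding the class-1 probability from .predict_proba() at 0.5:

# Manually apply the 0.5 threshold to the class-1 probabilities and compare
# against sklearn's predictions; expect True.
manual_preds = (model.predict_proba(X)[:, 1] >= 0.5).astype(int)
print(np.array_equal(manual_preds, model.predict(X)))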



Let's build a DataFrame to store this information. We may need it later.

In [13]:
# In case you want to see all of the probabilities and predictions
def make_prediction_df(X, Y, model):
    # Assume X has one feature and that model is already fit 
    Prob = model.predict_proba(X)
    Y_hat = model.predict(X)
    df = pd.DataFrame({"X": X.squeeze(),
                       "Y": Y,
                       "P(Y = 1 | x)": Prob[:,1],
                       "Y_hat": Y_hat})
    return df
    
predict_train_df = make_prediction_df(X, Y, model)
predict_train_df
Out[13]:
X Y P(Y = 1 | x) Y_hat
0 25.220 1 0.999965 1
1 13.480 1 0.226448 0
2 11.290 0 0.033174 0
3 12.860 0 0.137598 0
4 19.690 1 0.992236 1
... ... ... ... ...
507 8.888 0 0.003257 0
508 11.640 0 0.046105 0
509 14.290 0 0.392796 0
510 13.980 1 0.323216 0
511 12.180 0 0.075786 0

512 rows × 4 columns
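
For example (a small sketch, not from lecture), the training accuracy can be read directly off this DataFrame and should agree with model.score:

# Fraction of training points where the predicted class matches the true class.
train_acc = (predict_train_df['Y'] == predict_train_df['Y_hat']).mean()
print(train_acc, model.score(X, Y))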




Linear Separability and the Need for Regularization¶

Suppose we had the following toy data:

In [14]:
toy_df = pd.DataFrame({"x": [-1, 1], "y": [1, 0]})
#plt.scatter(toy_df['x'], toy_df['y'], s=100);
sns.scatterplot(data=toy_df, x='x', y='y',
              s=100, legend=None);

Let's look at the mean cross-entropy loss surface for this toy dataset under a simplified single-feature logistic regression model of the form:

$$ \Large \hat{P}_{\theta}(Y = 1 | x) = \sigma(\theta_1 x) = \frac{1}{1 + e^{-\theta_1 x}} $$

With mean cross-entropy loss (the second line plugs in the two toy points, $(x_1, y_1) = (-1, 1)$ and $(x_2, y_2) = (1, 0)$):

\begin{align} \hat{\theta}_1 &= \underset{\theta_1}{\operatorname{argmin}} \; - \frac{1}{n} \sum_{i=1}^n \left( y_i \log (\sigma(\theta_1 x_i)) + (1 - y_i) \log (1 - \sigma(\theta_1 x_i)) \right) \\ &= \underset{\theta_1}{\operatorname{argmin}} \; -\frac{1}{2} \left[ \log (\sigma( - \theta_1 )) + \log(1 - \sigma(\theta_1))\right] \end{align}
In [15]:
def toy_model(theta1, x):
    return 1/(1 + np.exp(-theta1 * x))

def mean_cross_entropy_loss_toy(theta1):
    # Here we use 1 - sigma(z) = sigma(-z) to improve numerical stability
    # (np.mean, not np.sum, so this matches the mean cross-entropy above)
    return - np.mean(toy_df['y'] * np.log(toy_model(theta1, toy_df['x'])) + \
                     (1-toy_df['y']) * np.log(toy_model(theta1, -toy_df['x'])))
In [16]:
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [mean_cross_entropy_loss_toy(theta) for theta in thetas], color = 'green')
plt.ylabel(r'MCE($\theta$)')
plt.xlabel(r'$\theta$');
plt.title("No regularization")
Out[16]:
Text(0.5, 1.0, 'No regularization')

But with L2 regularization added, the loss has a finite minimizer:

In [17]:
def mce_regularized_loss_single_arg_toy(theta, reg):
    return mean_cross_entropy_loss_toy(theta) + reg * theta**2 

def regularized_loss_toy(theta1, reg):
    return mean_cross_entropy_loss_toy(theta1) + reg * theta1**2

thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [regularized_loss_toy(theta, 0.1) for theta in thetas], color = 'green')
plt.ylabel(r'MCE($\theta$) + 0.1 $\theta^2$')
plt.xlabel(r'$\theta$');
plt.title(r"Mean Loss + L2 Regularization ($\lambda$ = 0.1)")
Out[17]:
Text(0.5, 1.0, 'Mean Loss + L2 Regularization ($\\lambda$ = 0.1)')
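
To see the same phenomenon numerically, here is a small sketch (not from lecture) using scipy.optimize.minimize, imported at the top of the notebook. On this linearly separable toy data the unregularized loss has no finite minimizer, so the optimizer wanders off to a large-magnitude negative theta, while the regularized loss has a finite optimum:

# Unregularized: the loss approaches but never reaches 0, so the optimizer
# stops only once the gradient is tiny, at a large-magnitude negative theta.
res_unreg = minimize(lambda t: mean_cross_entropy_loss_toy(t[0]), x0=[0.0])

# Regularized (lambda = 0.1): the penalty term makes the minimizer finite.
res_reg = minimize(lambda t: regularized_loss_toy(t[0], 0.1), x0=[0.0])

print("Unregularized theta_1:", res_unreg.x[0])
print("Regularized theta_1:  ", res_reg.x[0])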

Linearly separable plots¶

In case you were curious about how we generated the linearly separable plots for lecture.

In [18]:
# Change y_sep vs y_nosep
y_used = 'y_sep'

data_1d = pd.DataFrame(
    {"x": [-1, -.75, -.5, -.25, .3, .4, 1, 1.2, 3],
     "y_sep": [ 1,    1,   1,   1,   0,  0, 0, 0,   0],
     "y_nosep": [1,   0,   1,   0,   0,  0, 0, 0,   0]
    })

plt.figure(figsize=((6.1, 1.5)))
sns.scatterplot(data=data_1d, x='x', y=y_used,hue=y_used,
              s=100, edgecolor='k', linewidth=1, legend=None);
plt.ylim((-.3, 1.25))
plt.yticks([0, 1])
sns.rugplot(data=data_1d, x='x', hue=y_used, height = 0.1, legend=None, linewidth=4);
plt.ylabel("y");
In [19]:
iris = sns.load_dataset('iris')

plt.figure(figsize=(6, 4))
# Separable
sns.scatterplot(data = iris[iris['species'] != 'virginica'],
               x = 'petal_length',
               y = 'petal_width',
               hue = 'species', s=100);
plt.gca().legend_.set_title(None)

The setosa and versicolor classes above are linearly separable in these two features; versicolor and virginica below are not:

In [20]:
# Not separable
plt.figure(figsize=(6, 4))
sns.scatterplot(data = iris[iris['species'] != 'setosa'],
               x = 'petal_length',
               y = 'petal_width',
               palette=sns.color_palette()[1:3],
               hue = 'species', s=100);
plt.gca().legend_.set_title(None)

Regularization Demo¶

As an end-to-end demo of the model fitting process, let's fit a regularized LogisticRegression model on the iris data, with a train/test split.

Let's try to predict the species of an iris. But there are three possible values of species right now:

In [21]:
iris = sns.load_dataset('iris')
iris['species'].unique()
Out[21]:
array(['setosa', 'versicolor', 'virginica'], dtype=object)

So let's create a new column is_versicolor that is 1 if the iris is a versicolor, and 0 otherwise.

In [22]:
iris['is_versicolor'] = (iris['species'] == 'versicolor').astype(int)
In [23]:
iris
Out[23]:
sepal_length sepal_width petal_length petal_width species is_versicolor
0 5.1 3.5 1.4 0.2 setosa 0
1 4.9 3.0 1.4 0.2 setosa 0
2 4.7 3.2 1.3 0.2 setosa 0
3 4.6 3.1 1.5 0.2 setosa 0
4 5.0 3.6 1.4 0.2 setosa 0
... ... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 virginica 0
146 6.3 2.5 5.0 1.9 virginica 0
147 6.5 3.0 5.2 2.0 virginica 0
148 6.2 3.4 5.4 2.3 virginica 0
149 5.9 3.0 5.1 1.8 virginica 0

150 rows × 6 columns

In [24]:
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
In [25]:
from sklearn.model_selection import train_test_split
In [26]:
# Pass random_state so the split is reproducible
# (seeding Python's random module does not affect scikit-learn's splitter)
iris_train, iris_test = train_test_split(iris, test_size = 0.2, random_state = 25)

First, let's look at the coefficients if we fit without regularization:

In [27]:
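# Note: in scikit-learn >= 1.2, pass penalty=None instead of penalty='none'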
iris_model_no_reg = LogisticRegression(penalty = 'none', solver = 'lbfgs')
iris_model_no_reg.fit(iris_train[cols], iris_train['is_versicolor'])
Out[27]:
LogisticRegression(penalty='none')
In [28]:
iris_model_no_reg.coef_
Out[28]:
array([[-0.07871677, -2.77933453,  1.20743341, -2.84432124]])
In [29]:
iris_model_no_reg.intercept_
Out[29]:
array([6.81212221])

Now let's fit with regularization, using the default value of C = 1.0 (in scikit-learn, C is the inverse of the regularization strength, so smaller C means stronger regularization):

In [30]:
iris_model_reg = LogisticRegression(penalty = 'l2', solver = 'lbfgs')
iris_model_reg.fit(iris_train[cols], iris_train['is_versicolor'])
Out[30]:
LogisticRegression()
In [31]:
iris_model_reg.coef_
Out[31]:
array([[-0.10033813, -1.94462039,  0.58803632, -1.18201488]])
In [32]:
iris_model_reg.intercept_
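
Since C is the inverse of the regularization strength, decreasing it shrinks the coefficients more aggressively. Here is a small sketch of that effect (the values of C are chosen for illustration, not from lecture):

# Compare coefficient magnitudes across a few illustrative values of C.
for C in [100, 1.0, 0.01]:
    m = LogisticRegression(C=C, solver='lbfgs', max_iter=10_000)
    m.fit(iris_train[cols], iris_train['is_versicolor'])
    print(f"C = {C:>6}: sum of |coefficients| = {np.abs(m.coef_).sum():.3f}")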