import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from scipy.optimize import minimize

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV


# formatting options

# big font helper
def adjust_fontsize(size=None):
    """Set matplotlib's global font-size rcParams.

    Parameters
    ----------
    size : number, optional
        When given, every font-size setting below is set to this one value.
        When omitted (``None``), the small/medium/bigger defaults 8/10/12
        are used.
    """
    SMALL_SIZE = 8
    MEDIUM_SIZE = 10
    BIGGER_SIZE = 12
    # Identity test: `!= None` dispatches to __ne__, which array-like
    # arguments may override.
    if size is not None:
        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size
    plt.rcParams['font.size'] = SMALL_SIZE              # default text size
    plt.rcParams['axes.titlesize'] = SMALL_SIZE         # axes title
    plt.rcParams['axes.labelsize'] = MEDIUM_SIZE        # x and y labels
    plt.rcParams['xtick.labelsize'] = SMALL_SIZE        # x tick labels
    plt.rcParams['ytick.labelsize'] = SMALL_SIZE        # y tick labels
    plt.rcParams['legend.fontsize'] = SMALL_SIZE        # legend
    plt.rcParams['figure.titlesize'] = BIGGER_SIZE      # figure title
    
def savefig(fname):
    """Save the current matplotlib figure as ``images/<fname>.png``.

    Does nothing unless the module-level ``SAVE_FIGURES_FLAG`` is truthy.
    The figure's background patch is made fully transparent before saving,
    and the ``images`` directory is created if it is missing.
    """
    if not SAVE_FIGURES_FLAG:
        # avoid memory overload
        return

    # exist_ok=True replaces the check-then-create pair
    # (os.path.exists + os.mkdir), which can race with another writer.
    os.makedirs("images", exist_ok=True)
    fig = plt.gcf()
    fig.patch.set_alpha(0.0)
    plt.savefig(f"images/{fname}.png", bbox_inches='tight')
    
    
# Global plotting defaults for the notebook.  Order matters here: each call
# below may overwrite rcParams set by an earlier one.
#plt.rcParams['figure.figsize'] = (4, 4)
#plt.rcParams['figure.dpi'] = 150
plt.rcParams['lines.linewidth'] = 3

plt.style.use('fivethirtyeight')
sns.set_context("talk")
# NOTE(review): sns.set_theme() applies seaborn's own default context and
# style -- confirm it does not undo the style/context/linewidth set above.
sns.set_theme()
#sns.set()
# Called last so that every font-size rcParam ends up at 20.
adjust_fontsize(20)


# Read by savefig(): while this is False, savefig() returns without writing.
SAVE_FIGURES_FLAG = False


import sklearn.datasets
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset into a DataFrame, one column per feature.
data_dict = sklearn.datasets.load_breast_cancer()
data = pd.DataFrame(data=data_dict['data'],
                    columns=data_dict['feature_names'])
# In data_dict['target'], 0 is malignant and 1 is benign; recode so that a
# malignant tumour is the positive class (1).
data['malignant'] = (data_dict['target'] == 0).astype(int)

# Hold out 10% of the rows as a test set (fixed seed), then renumber the
# rows of both pieces from 0.
data_tr, data_te = train_test_split(data, test_size=0.10, random_state=42)
data_tr = data_tr.reset_index(drop=True)
data_te = data_te.reset_index(drop=True)
print("Training Data Size: ", len(data_tr))
print("Test Data Size: ", len(data_te))

# X, Y are the training data: a single feature column and the 0/1 label.
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].to_numpy()

Training Data Size:  512
Test Data Size:  57


# manual to allow for jitter
# JointGrid creates its own figure, so a preceding plt.figure(figsize=(4,4))
# only left an empty extra figure behind; the size is requested through
# `height` instead.
g = sns.JointGrid(data=data, x="mean radius", y="malignant", height=4)
g.plot_marginals(sns.histplot)
# Strip plot on the joint axes, with class 1 listed before class 0.
g.plot_joint(sns.stripplot,
             orient='h', order=[1, 0],
             color=sns.color_palette()[0])
g.ax_joint.set_xticks([10, 15, 20, 25])
savefig("jitter")
plt.show()

<Figure size 400x400 with 0 Axes>


from sklearn.linear_model import LogisticRegression

# Fit a one-feature logistic regression (mean radius -> malignant).
model = LogisticRegression(fit_intercept=True)
model.fit(X, Y); # X, Y are training data


# Fitted intercept and slope (recorded output follows).
model.intercept_, model.coef_

(array([-14.42394402]), array([[0.97889232]]))


# Class probabilities for a single observation with mean radius 20; the
# columns follow the order of model.classes_ shown below.
model.predict_proba([[20]])

array([[0.00574364, 0.99425636]])


model.classes_

array([0, 1])


# Column 1 is the probability of class 1 (malignant) for each training row.
Prob_hat_one = model.predict_proba(X)[:, 1]
Prob_hat_one.shape

(512,)


# Training labels as a jittered strip plot, with the fitted probability of
# class 1 drawn on top as a green curve.
plt.figure(figsize=(6,6))
sns.stripplot(x=X.squeeze(), y=Y, 
              jitter = 0.1, orient='h');
sns.lineplot(x= X.squeeze(), y=Prob_hat_one,
             color='green', linewidth=5, label=r'$\hat{P}_{\theta}(Y = 1 | x)$')
plt.gca().invert_yaxis()
plt.xlabel('x: mean radius')
plt.ylabel('y: class')
savefig("predict_prob")


# Hard class prediction for mean radius 20 (recorded output follows).
model.predict([[20]])

array([1])


# in case you want to see all of the probabilities and predictions
def make_prediction_df(X, Y, model):
    """Tabulate a fitted classifier's outputs next to the inputs.

    Parameters
    ----------
    X : array-like with a single feature column, one row per observation.
    Y : array-like of labels, one per row of ``X``.
    model : already-fit object providing ``predict_proba`` and ``predict``.

    Returns
    -------
    pandas.DataFrame with columns ``X``, ``Y``, ``P(Y = 1 | x)`` (column 1
    of ``predict_proba``) and ``Y_hat`` (the output of ``predict``).
    """
    # assume X has one feature and that model is already fit
    Prob = model.predict_proba(X)
    Y_hat = model.predict(X)
    # np.ravel always yields a 1-D column, even for a single row or a plain
    # list, where X.squeeze() would give a 0-d value or fail outright.
    df = pd.DataFrame({"X": np.ravel(X),
                       "Y": Y,
                       "P(Y = 1 | x)": Prob[:, 1],
                       "Y_hat": Y_hat})
    return df
    
# Inputs, labels, predicted probabilities and predicted classes for the
# training set, displayed as a table.
predict_train_df = make_prediction_df(X, Y, model)
predict_train_df


# Two-point toy dataset: x = -1 has label 1 and x = 1 has label 0.
toy_df = pd.DataFrame({"x": [-1, 1], "y": [1, 0]})
#plt.scatter(toy_df['x'], toy_df['y'], s=100);
sns.scatterplot(data=toy_df, x='x', y='y',
              s=100, legend=None);
# plt.yticks([0, 1]);


def toy_model(theta1, x):
    """Return the logistic function evaluated at ``theta1 * x``."""
    z = theta1 * x
    return 1 / (1 + np.exp(-z))

def mean_cross_entropy_loss_toy(theta1):
    """Mean cross-entropy loss of ``toy_model`` with slope ``theta1`` on ``toy_df``.

    The per-point losses are averaged (the previous version summed them,
    which contradicted both the function name and the "MCE" axis label).
    """
    y = toy_df['y']
    x = toy_df['x']
    # Here we use 1 - sigma(t) = sigma(-t) to improve numerical stability
    log_likelihood = (y * np.log(toy_model(theta1, x))
                      + (1 - y) * np.log(toy_model(theta1, -x)))
    return -np.mean(log_likelihood)


# Evaluate the unregularized toy loss at 100 evenly spaced slopes in
# [-30, 30] and plot the resulting curve.
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [mean_cross_entropy_loss_toy(theta) for theta in thetas], color = 'green')
plt.ylabel(r'MCE($\theta$)')
plt.xlabel(r'$\theta$');
plt.title("No regularization")
savefig("toy_loss")


def mce_regularized_loss_single_arg_toy(theta, reg):
    """Toy cross-entropy loss at ``theta`` plus the L2 penalty ``reg * theta**2``.

    The previous body called ``mce_loss_single_arg_toy``, a name that is not
    defined in this file, so any call raised NameError.  It now uses the toy
    loss that is defined here, ``mean_cross_entropy_loss_toy``.
    """
    return mean_cross_entropy_loss_toy(theta) + reg * theta**2

def regularized_loss_toy(theta1, reg):
    """Return ``mean_cross_entropy_loss_toy(theta1)`` plus ``reg * theta1**2``."""
    data_loss = mean_cross_entropy_loss_toy(theta1)
    l2_penalty = reg * theta1**2
    return data_loss + l2_penalty

# Same slope grid as the unregularized plot, now with an L2 penalty of
# strength 0.1 (the value is also hard-coded in the axis label and title).
thetas = np.linspace(-30, 30, 100)
plt.plot(thetas, [regularized_loss_toy(theta, 0.1) for theta in thetas], color = 'green')
plt.ylabel(r'MCE($\theta$) + 0.1 $\theta^2$')
plt.xlabel(r'$\theta$');
plt.title(r"Mean Loss + L2 Regularization ($\lambda$ = 0.1)")
savefig("toy_loss_reg")


# change y_sep vs y_nosep
# Selects which label column is plotted (and appears in the saved file name).
y_used = 'y_sep'

# One-dimensional example data.  In "y_sep" every negative x has label 1 and
# every positive x has label 0; in "y_nosep" labels 1 and 0 are interleaved
# among the negative x values.
data_1d = pd.DataFrame(
    {"x": [-1, -.75, -.5, -.25, .3, .4, 1, 1.2, 3],
     "y_sep": [ 1,    1,   1,   1,   0,  0, 0, 0,   0],
     "y_nosep": [1,   0,   1,   0,   0,  0, 0, 0,   0]
    })

# Points coloured by label, plus a rug along the x axis in the same colours.
plt.figure(figsize=((6.1, 1.5)))
sns.scatterplot(data=data_1d, x='x', y=y_used,hue=y_used,
              s=100, edgecolor='k', linewidth=1, legend=None);
plt.ylim((-.3, 1.25))
plt.yticks([0, 1])
sns.rugplot(data=data_1d, x='x', hue=y_used, height = 0.1, legend=None, linewidth=4);
plt.ylabel("y")
savefig(f"1d_labels_{y_used}")


iris = sns.load_dataset('iris')

plt.figure(figsize=(6, 4))
# separable
# Petal length vs. width with the 'virginica' rows dropped.
sns.scatterplot(data = iris[iris['species'] != 'virginica'],
               x = 'petal_length',
               y = 'petal_width',
               hue = 'species', s=100);
plt.gca().legend_.set_title(None)
savefig("2d_sep")


# not separable
# Same axes with the 'setosa' rows dropped instead; the palette slice [1:3]
# uses the second and third colours of the current palette.
plt.figure(figsize=(6, 4))
sns.scatterplot(data = iris[iris['species'] != 'setosa'],
               x = 'petal_length',
               y = 'petal_width',
               palette=sns.color_palette()[1:3],
               hue = 'species', s=100);
plt.gca().legend_.set_title(None)
savefig("2d_nosep")


# Scratch check: length of the current palette rotated left by one colour
# (recorded output follows).
len(sns.color_palette()[1:] + [sns.color_palette()[0]])

10


# Reload the iris data and list the distinct species labels.
iris = sns.load_dataset('iris')
iris['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)


# Binary target: 1 for 'versicolor' rows, 0 for the other two species.
iris['is_versicolor'] = (iris['species'] == 'versicolor').astype(int)


iris


# Feature columns used by the iris models below.
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']


from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing.  The seed is fixed (as it is for the
# breast cancer split earlier in this file) so that the fitted coefficients
# and scores below are reproducible from run to run.
iris_train, iris_test = train_test_split(iris, test_size = 0.2, random_state=42)


# Logistic regression on the four iris features with the penalty switched off.
# NOTE(review): the string 'none' is accepted by the scikit-learn version
# that produced the outputs below; newer releases may require penalty=None
# instead -- confirm against the installed version.
iris_model_no_reg = LogisticRegression(penalty = 'none', solver = 'lbfgs')
iris_model_no_reg.fit(iris_train[cols], iris_train['is_versicolor'])

LogisticRegression(penalty='none')

LogisticRegression(penalty='none')


# Fitted coefficients, in the order of `cols` (recorded output follows).
iris_model_no_reg.coef_

array([[ 0.31725338, -3.23742857,  0.81901076, -2.45756336]])


iris_model_no_reg.intercept_

array([6.91024654])


# Same model and training data, now with an L2 penalty.
iris_model_reg = LogisticRegression(penalty = 'l2', solver = 'lbfgs')
iris_model_reg.fit(iris_train[cols], iris_train['is_versicolor'])

LogisticRegression()

LogisticRegression()


# In the recorded outputs, every coefficient and the intercept are smaller
# in magnitude than those of the unpenalized model.
iris_model_reg.coef_

array([[ 0.04503933, -2.17556629,  0.47603956, -1.18916478]])


iris_model_reg.intercept_

array([5.17476961])


# Training-set score of the unpenalized model (recorded output follows).
iris_model_no_reg.score(iris_train[cols], iris_train['is_versicolor'])

0.7416666666666667


# Training-set score of the L2-penalized model.
iris_model_reg.score(iris_train[cols], iris_train['is_versicolor'])

0.7333333333333333


# Test-set score of the unpenalized model.
iris_model_no_reg.score(iris_test[cols], iris_test['is_versicolor'])

0.8


# Test-set score of the L2-penalized model.
iris_model_reg.score(iris_test[cols], iris_test['is_versicolor'])

0.7333333333333333


# No new split happens here: the training frame data_tr built earlier in the
# file is reused.
from sklearn.model_selection import train_test_split

# X, Y are training data
X = data_tr[['mean radius']].to_numpy()
Y = data_tr['malignant'].to_numpy()
X.shape, Y.shape

((512, 1), (512,))


from sklearn.linear_model import LogisticRegression

# Refit the one-feature breast cancer model used in the rest of the file.
model = LogisticRegression(fit_intercept=True)
model.fit(X, Y); # X, Y are training data


def accuracy(X, Y):
    """Fraction of rows of X for which the global ``model`` predicts the label in Y."""
    predictions = model.predict(X)
    is_correct = (predictions == Y)
    return np.mean(is_correct)

# Hand-written training accuracy (recorded output follows).
accuracy(X, Y)

0.869140625


# The model's own score() gives the same number on the same data.
model.score(X, Y)

0.869140625


from sklearn.metrics import confusion_matrix

# Be careful – confusion_matrix takes in y_true as the first parameter and y_pred as the second.
# Don't mix these up!
cm = confusion_matrix(Y, model.predict(X))
cm

array([[294,  23],
       [ 44, 151]])


# Same matrix drawn as an annotated heatmap: rows are the actual classes and
# columns the predicted ones.
cm = confusion_matrix(Y, model.predict(X))
plt.figure(figsize=(4,4))
sns.heatmap(cm, annot=True, fmt = 'd', cmap = 'Blues', annot_kws = {'size': 16})
plt.xlabel('Predicted')
plt.ylabel('Actual');
savefig("cm")


# Count the four prediction outcomes on the training set by hand.
Y_hat = model.predict(X)
tp = np.sum((Y_hat == 1) & (Y == 1))   # predicted 1, actually 1
tn = np.sum((Y_hat == 0) & (Y == 0))   # predicted 0, actually 0

fp = np.sum((Y_hat == 1) & (Y == 0))   # predicted 1, actually 0
fn = np.sum((Y_hat == 0) & (Y == 1))   # predicted 0, actually 1
tp, tn, fp, fn

(151, 294, 23, 44)


# The hand counts match the confusion matrix laid out as:
cm # [tn, fp]
   # [fn, tp]

array([[294,  23],
       [ 44, 151]])


# Precision: share of predicted positives that are actual positives.
precision = tp / (tp + fp)
precision

0.867816091954023


# Recall: share of actual positives that were predicted positive.
recall = tp / (tp + fn)
recall

0.7743589743589744


# Probability of class 1 for every training row.
Prob_hat_one = model.predict_proba(X)[:, 1]

# True labels as a jittered strip plot with the fitted probability curve on
# top; the legend is removed after plotting.
plt.figure(figsize=(4,4))
sns.stripplot(x=X.squeeze(), y=Y, 
              jitter = 0.1, orient='h');
sns.lineplot(x= X.squeeze(), y=Prob_hat_one,
             color='green', linewidth=5, label=r'$\hat{P}_{\theta}(Y = 1 | x)$')
plt.gca().invert_yaxis()
plt.xlabel('x: mean radius')
plt.ylabel('y: class')
plt.title("True classes and model fit")
plt.legend().remove()
savefig("true")


# predictions as well
Prob_hat_one = model.predict_proba(X)[:, 1]
Y_hat = model.predict(X) # threshold T = 0.5

# Same layout as the true-class plot, but the strip plot shows the predicted
# classes and a horizontal line marks the threshold at 0.5.
plt.figure(figsize=(4,4))
sns.stripplot(x=X.squeeze(), y=Y_hat, 
              jitter = 0.1, orient='h', facecolors=None, linewidth=1);
sns.lineplot(x= X.squeeze(), y=Prob_hat_one,
             color='green', linewidth=5, label=r'$\hat{P}_{\theta}(Y = 1 | x)$')
plt.gca().invert_yaxis()
plt.axhline(y=0.5, label='T = 0.5')
plt.xlabel('x: mean radius')
plt.ylabel(r'$\hat{y}$: predicted class')
plt.legend().remove()
plt.title("Predicted classes if T = 0.5")
savefig("predict_05")


bc_model = model        # alias for the logistic model fit above on the breast cancer training data (X, Y)
def predict_threshold(model, X, T):
    """Label each row of X as 1 when the model's class-1 probability is at least T, else 0.

    ``model`` must provide ``predict_proba``; its column 1 is taken as the
    class-1 probability.  Returns an integer array of 0s and 1s.
    """
    class_probs = model.predict_proba(X)
    meets_threshold = class_probs[:, 1] >= T
    return meets_threshold.astype(int)

def accuracy_threshold(X, Y, T):
    """Accuracy of ``bc_model`` on (X, Y) when class 1 is predicted at threshold T."""
    predicted = predict_threshold(bc_model, X, T)
    return np.mean(predicted == Y)


# compute accuracies for different thresholds on train set
thresholds = np.linspace(0, 1, 100)
accs = [accuracy_threshold(X, Y, t) for t in thresholds]


fig = px.line(x=thresholds, y=accs, title="Train Accuracy vs. Threshold")
fig.update_xaxes(title="threshold")
fig.update_yaxes(title="Accuracy")


# Index into `thresholds` of the accuracy-maximizing threshold.  np.argmax
# returns a position, not the threshold itself; the value is
# thresholds[np.argmax(accs)].
np.argmax(accs)

57


from sklearn import metrics

# used for sklearn's cross_val_score
def make_scorer(metric, T):
    """Build a ``scorer(model, X, Y)`` callable that applies ``metric`` at threshold T.

    The returned callable thresholds the model's class-1 probabilities with
    ``predict_threshold`` and calls ``metric(Y, predictions)``.
    """
    def scorer(model, X, Y):
        return metric(Y, predict_threshold(model, X, T))

    return scorer

def acc_scorer(T):
    """Scorer that reports ``metrics.accuracy_score`` at classification threshold T."""
    accuracy_metric = metrics.accuracy_score
    return make_scorer(accuracy_metric, T)


from sklearn.model_selection import cross_val_score
# Mean 5-fold cross-validated accuracy at each threshold.  cross_val_score
# is called once per threshold.
cv_accs = [
    np.mean(cross_val_score(bc_model, X, Y, 
                            scoring=acc_scorer(t), 
                            cv=5))
    for t in thresholds
]


fig = px.line(x=thresholds, y=cv_accs, title="Cross-Validated Accuracy vs. Threshold")
fig.update_xaxes(title="threshold")
fig.update_yaxes(title="Accuracy")


# Index into `thresholds` of the threshold with the highest cross-validated
# accuracy (a position, not the threshold value itself).
np.argmax(cv_accs)

56


# Re-declared for the ROC section; same alias and same function behavior as
# the earlier definitions in this file.
bc_model = model        # fit to breast cancer dataset
def predict_threshold(model, X, T): 
    # Column 1 of predict_proba is used as the class-1 probability.
    prob_one = model.predict_proba(X)[:, 1]
    # 1 where the probability is at least T, else 0.
    return (prob_one >= T).astype(int)


def tpr_threshold(X, Y, T): # this is recall
    """True positive rate of ``bc_model`` on (X, Y) at threshold T.

    Computed as (rows predicted 1 with Y == 1) / (rows with Y == 1).
    """
    Y_hat = predict_threshold(bc_model, X, T)
    actual_pos = (Y == 1)
    true_pos = np.sum((Y_hat == 1) & actual_pos)
    return true_pos / np.sum(actual_pos)

def fpr_threshold(X, Y, T):
    """False positive rate of ``bc_model`` on (X, Y) at threshold T.

    Computed as (rows predicted 1 with Y == 0) / (rows with Y == 0).
    """
    Y_hat = predict_threshold(bc_model, X, T)
    actual_neg = (Y == 0)
    false_pos = np.sum((Y_hat == 1) & actual_neg)
    return false_pos / np.sum(actual_neg)


# compute for different thresholds on train set
thresholds = np.linspace(0, 1, 100)
tprs = [tpr_threshold(X, Y, t) for t in thresholds]
fprs = [fpr_threshold(X, Y, t) for t in thresholds]


# TPR and FPR plotted against the threshold, one trace each.
fig = go.Figure()
fig.add_trace(go.Scatter(name = 'TPR', x = thresholds, y = tprs))
fig.add_trace(go.Scatter(name = 'FPR', x = thresholds, y = fprs))
fig.update_xaxes(title="Threshold")
fig.update_yaxes(title="Proportion")


# ROC curve: TPR against FPR, with the threshold available on hover.
fig = px.line(x=fprs, y = tprs, hover_name=thresholds, title="ROC Curve")
fig.update_xaxes(title="False Positive Rate")
fig.update_yaxes(title="True Positive Rate")
fig


# Re-declared for the precision-recall section; same alias and same function
# behavior as the earlier definitions in this file.
bc_model = model        # fit to breast cancer dataset
def predict_threshold(model, X, T): 
    # Column 1 of predict_proba is used as the class-1 probability.
    prob_one = model.predict_proba(X)[:, 1]
    # 1 where the probability is at least T, else 0.
    return (prob_one >= T).astype(int)

def precision_threshold(X, Y, T):
    """Precision of ``bc_model`` on (X, Y) at threshold T.

    Computed as (rows predicted 1 with Y == 1) / (sum of the predictions).
    """
    Y_hat = predict_threshold(bc_model, X, T)
    true_pos = np.sum((Y_hat == 1) & (Y == 1))
    predicted_pos = np.sum(Y_hat)
    return true_pos / predicted_pos

def recall_threshold(X, Y, T): # this is recall
    """Recall of ``bc_model`` on (X, Y) at threshold T.

    Computed as (rows predicted 1 with Y == 1) / (sum of Y).
    """
    Y_hat = predict_threshold(bc_model, X, T)
    true_pos = np.sum((Y_hat == 1) & (Y == 1))
    actual_pos = np.sum(Y)
    return true_pos / actual_pos


def precision_recall_curve(Y, prob):
    """Precision and recall at every distinct predicted probability.

    Each distinct value ``t`` in ``prob`` is used as a threshold; a row is
    predicted positive when its probability is at least ``t``.

    The previous body called ``precision_at_threshold`` and
    ``recall_at_threshold``, which are not defined in this file, so any call
    raised NameError.  Both quantities are now computed here.

    Parameters
    ----------
    Y : array-like of 0/1 labels.
    prob : array-like of class-1 probabilities, one per label.

    Returns
    -------
    (precision, recall, unique_thresh) : two lists and the sorted array of
    distinct thresholds, all of the same length.  Recall is NaN when ``Y``
    contains no positive label.
    """
    Y = np.asarray(Y)
    prob = np.asarray(prob)
    unique_thresh = np.unique(prob)
    is_pos = (Y == 1)
    n_pos = np.sum(is_pos)

    precision = []
    recall = []
    for t in unique_thresh:
        # t is itself one of the probabilities, so at least one row is
        # always predicted positive and the precision denominator is > 0.
        pred_pos = prob >= t
        true_pos = np.sum(pred_pos & is_pos)
        precision.append(true_pos / np.sum(pred_pos))
        recall.append(true_pos / n_pos)
    return precision, recall, unique_thresh


# `recs` and `precs` were never defined, so this cell raised NameError.
# Compute them on the training data at every value in `thresholds`, using
# the per-threshold helpers defined in this file.
precs = [precision_threshold(X, Y, t) for t in thresholds]
recs = [recall_threshold(X, Y, t) for t in thresholds]

# Precision against recall, with the threshold available on hover.
fig = px.line(x=recs, y=precs, hover_name=thresholds)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")


# From here on the name precision_recall_curve refers to sklearn's
# implementation, replacing the hand-written function defined above.
from sklearn.metrics import precision_recall_curve


# NOTE(review): `games` and `better_model` are not defined anywhere in this
# file -- presumably they come from another notebook; these cells raise
# NameError unless both are provided first.
precision, recall, threshold = precision_recall_curve(games['WON'], 
                                                              better_model.predict_proba(games[['FG_PCT_DIFF', 'PF_DIFF']])[:, 1])


# The last precision/recall entry is dropped before plotting.
fig = px.line(x=recall[:-1], y = precision[:-1], hover_name=threshold)
fig.update_xaxes(title="Recall")
fig.update_yaxes(title="Precision")
fig


from sklearn.metrics import roc_curve


# ROC curve for the same model and data.
fpr, tpr, threshold = roc_curve(games['WON'], 
                               better_model.predict_proba(games[['FG_PCT_DIFF', 'PF_DIFF']])[:, 1])


fig = px.line(x=fpr, y = tpr, hover_name=threshold)
fig.update_xaxes(title="False Positive Rate")
fig.update_yaxes(title="True Positive Rate")
fig


# ROC curve when the scores are uniform random numbers in place of model
# probabilities (unseeded, so the curve differs between runs).
fpr, tpr, threshold = roc_curve(games['WON'], np.random.uniform(0, 1, len(games['WON'])))


fig = px.line(x=fpr, y = tpr, hover_name=threshold)
fig.update_xaxes(title="False Positive Rate")
fig.update_yaxes(title="True Positive Rate")
fig

	X	Y	P(Y = 1 | x)	Y_hat
0	25.220	1	0.999965	1
1	13.480	1	0.226448	0
2	11.290	0	0.033174	0
3	12.860	0	0.137598	0
4	19.690	1	0.992236	1
...	...	...	...	...
507	8.888	0	0.003257	0
508	11.640	0	0.046105	0
509	14.290	0	0.392796	0
510	13.980	1	0.323216	0
511	12.180	0	0.075786	0

	sepal_length	sepal_width	petal_length	petal_width	species	is_versicolor
0	5.1	3.5	1.4	0.2	setosa	0
1	4.9	3.0	1.4	0.2	setosa	0
2	4.7	3.2	1.3	0.2	setosa	0
3	4.6	3.1	1.5	0.2	setosa	0
4	5.0	3.6	1.4	0.2	setosa	0
...	...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica	0
146	6.3	2.5	5.0	1.9	virginica	0
147	6.5	3.0	5.2	2.0	virginica	0
148	6.2	3.4	5.4	2.3	virginica	0
149	5.9	3.0	5.1	1.8	virginica	0

Lecture 22 – Data 100, Spring 2022

(Notebook setup) Obtaining the Data

sklearn

Fit

Prediction

Linear Separability and the Need for Regularization

Linearly separable plots

Regularization Demo

Performance Metric

Confusion matrix

Precision and Recall

Adjusting the Classification Threshold

Choosing a Threshold

Accuracy

ROC Curves

[Extra] Precision-Recall Curves