import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
In this notebook, we provide a very quick (shallow?) introduction to neural networks and deep learning. We review the basic challenge of binary classification and linear decision functions and then show how features can be composed to express more complex decision surfaces. We then build a basic neural network to learn the feature functions and ultimately build more complex models for image classification.
We start by reviewing logistic regression. We construct a linearly separable data set and show how a logistic regression model fits this data.
n = 50
np.random.seed(42)
x = np.random.randn(4*n, 2) + 3*np.tile([[1,1], [-1,1], [-1,-1], [1,-1]],(n, 1))
y = x[:,0]>0
data = pd.DataFrame(np.hstack([x,y[:,np.newaxis]]), columns=["X1", "X2", "Y"]).sample(frac=1)
pos_ind = data["Y"]==1.0
pos_scatter = go.Scatter(x=data.loc[pos_ind,"X1"], y=data.loc[pos_ind,"X2"],
mode="markers", marker_symbol="cross", name="Pos")
neg_scatter = go.Scatter(x=data.loc[~pos_ind,"X1"], y=data.loc[~pos_ind,"X2"],
mode="markers", name="Neg")
go.Figure([pos_scatter, neg_scatter])
We fit a logistic regression model.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(data[["X1", "X2"]], data['Y'])
LogisticRegression()
The following block of code generates the prediction surface.
def plot_predictions(predict_fn):
    u = np.linspace(-10, 10, 100)
    (x0, x1) = np.meshgrid(u, u)
    X = np.vstack([x0.flatten(), x1.flatten()]).T
    Y_hat = predict_fn(X)
    return go.Contour(x=X[:,0], y=X[:,1], z=Y_hat)
go.Figure([pos_scatter, neg_scatter,
plot_predictions(lambda X: model.predict_proba(X)[:,1])])
Notice that in the above plot we assign near-zero probability of being a "plus" to the region on the left and near-one probability to the region on the right. Also notice that in the middle there is a transition region where the probability of being a "plus" goes from zero to one.
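To make the linear decision boundary explicit, we can read off the fitted coefficients (a quick sketch using the model fit above); the boundary is the set of points where the linear score is zero, i.e., where the predicted probability is 0.5.
# The fitted model scores points with w1*X1 + w2*X2 + b; the decision boundary
# is the line where this score is zero (predicted probability 0.5).
w1, w2 = model.coef_[0]
b = model.intercept_[0]
print(f"decision boundary: {w1:.2f} X1 + {w2:.2f} X2 + {b:.2f} = 0")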
We can modify the above data slightly to construct a data set that is no longer linearly separable. Can you find a decision line that would separate this data into "plus" and "circle" regions?
n = 50
np.random.seed(42)
x = np.random.randn(4*n, 2) + 3*np.tile([[1,1], [-1,1], [-1,-1], [1,-1]],(n, 1))
y = np.logical_xor(x[:,0]>0, x[:,1]>0)
data = pd.DataFrame(np.hstack([x,y[:,np.newaxis]]), columns=["X1", "X2", "Y"]).sample(frac=1)
pos_ind = data["Y"]==1.0
pos_scatter = go.Scatter(x=data.loc[pos_ind,"X1"], y=data.loc[pos_ind,"X2"],
mode="markers", marker_symbol="cross", name="Pos")
neg_scatter = go.Scatter(x=data.loc[~pos_ind,"X1"], y=data.loc[~pos_ind,"X2"],
mode="markers", name="Neg")
go.Figure([pos_scatter, neg_scatter])
When we fit a logistic regression classifier to this data we no longer get an effective model.
model = LogisticRegression()
model.fit(data[["X1", "X2"]], data['Y'])
LogisticRegression()
go.Figure([pos_scatter, neg_scatter,
plot_predictions(lambda X: model.predict_proba(X)[:,1])])
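As a quick sanity check (a small sketch using sklearn's score method), we can compute the training accuracy; for this XOR-style data a linear classifier should do little better than chance, i.e., around 50%.
# Training accuracy of the linear model on the XOR-style data (expect roughly 0.5).
model.score(data[["X1", "X2"]], data["Y"])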
How could we improve the classifier performance? One standard solution would be to leverage feature engineering. What feature transformations would help with this classification problem?
Looking at the above figure, it seems like the class depends on which quadrant the point is drawn from. We could one-hot encode the quadrant for each point and fit a model using these features instead of the original $X_1$ and $X_2$ features.
def quadrant_features(X):
    df = pd.DataFrame()
    df["quad1"] = (X[:, 0] > 0) & (X[:, 1] > 0)
    df["quad2"] = (X[:, 0] < 0) & (X[:, 1] > 0)
    df["quad3"] = (X[:, 0] < 0) & (X[:, 1] < 0)
    df["quad4"] = (X[:, 0] > 0) & (X[:, 1] < 0)
    return df.to_numpy()
We again fit the logistic regression model, now using these 4 new features.
model = LogisticRegression()
model.fit(quadrant_features(data[["X1", "X2"]].to_numpy()), data['Y'])
LogisticRegression()
Again we plot the decision surface.
go.Figure([pos_scatter, neg_scatter,
plot_predictions(lambda X: model.predict_proba(quadrant_features(X))[:,1])])
This time we are able to accurately classify our data. The advantage of this approach is that we are able to use domain knowledge and intuition in the design of our model. However, for many real-world problems, it may be very difficult to manually create these kinds of highly informative features. We would like to learn the features themselves. Notice that in the above example the features were themselves binary functions of the inputs (just like the logistic regression classifier). Could we use logistic regression to build the features as well as the final classifier? This is where neural networks begin.
Classically, the standard approach to building models is to leverage domain knowledge to engineer features that capture the concepts we are trying to model. For example, if we want to detect cats in images we might want to look for edges, texture, and geometry that are unique to cats. These features are then fed into high-dimensional robust classification models like logistic regression.
Descriptive features (e.g., cat textures) are often used as inputs to increasingly higher-level features (e.g., cat ears). This composition of features results in "deep" pipelines of transformations producing increasingly abstract feature concepts. However, manually designing these features is challenging and may not actually produce the optimal feature representations.
The idea in Deep Learning is to automatically learn entire pipelines of feature transformations, together with the resulting classifier, from data. This is accomplished using neural networks. While neural networks were originally inspired by abstract models of neural computation, modern neural networks can be more accurately characterized as complex parametric functions expressed programmatically as the composition of mathematical primitives. In the following, we will first describe a simple neural network pictorially and then programmatically.
Neural networks originated from a simple mathematical abstraction of a "neuron" as a computational device that accumulates input signals and produces an output. The following is a very simple diagram of a neuron. Conceptually, signals arrive at the dendrites on the left, and when the combined firing is sufficient to trigger an action potential across the axon, an output is sent to the axon terminals on the right.
We can model this process (very abstractly) as a weighted summation of input values that is transmitted to the output when the inputs exceed some threshold. This activation threshold could be modeled by a sigmoid function like the one used in logistic regression. In this case, the behavior of a single artificial neuron is precisely the logistic regression model.
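As a tiny illustration (a sketch with made-up weights, not part of any model fit above), a single artificial neuron computes a sigmoid of a weighted sum of its inputs plus a bias, which is exactly the form of the logistic regression model.
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

def artificial_neuron(x, w, b):
    # weighted sum of the inputs plus a bias, passed through the sigmoid "activation"
    return sigmoid(x @ w + b)

# hypothetical weights and bias, chosen for illustration only
artificial_neuron(np.array([1.0, 2.0]), w=np.array([0.5, -0.25]), b=0.1)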
We can combine these single artificial neurons into larger networks of neurons to express more complex functions. For example, the following network has multiple layers of neurons:
This can be written as a mathematical expression:
\begin{align} A_1 & = \text{Sigmoid}(W X) \\ A_2 & = \text{Sigmoid}(U A_1) \\ \mathbb{P}(Y=1 \,|\, X) & = \text{Sigmoid}(V A_2) \end{align}
Here we assume that there is an implicit bias term added to each stage: the first layer weight matrix is $W \in \mathbb{R}^{4\times 3}$, the next layer weight matrix is $U \in \mathbb{R}^{3\times 5}$, and the final weight matrix is $V \in \mathbb{R}^{1\times 4}$. The vectors (often called activations) $A_1 \in \mathbb{R}^4$ and $A_2 \in \mathbb{R}^3$ correspond to the learned intermediate features.
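To make the composition concrete, here is a small NumPy sketch of this forward pass with randomly initialized weights (an illustration of the shapes only, not a trained model); the bias terms are folded in by appending a constant 1 to each layer's input.
def sigmoid(t):
    return 1 / (1 + np.exp(-t))

def with_bias(a):
    return np.append(a, 1.0)  # append a constant feature that acts as the bias input

W = np.random.randn(4, 3)  # (2 inputs + bias) -> 4 hidden units
U = np.random.randn(3, 5)  # (4 hidden + bias) -> 3 hidden units
V = np.random.randn(1, 4)  # (3 hidden + bias) -> 1 output

x = np.array([1.0, -2.0])          # an arbitrary input point
A1 = sigmoid(W @ with_bias(x))     # A_1 in R^4
A2 = sigmoid(U @ with_bias(A1))    # A_2 in R^3
sigmoid(V @ with_bias(A2))         # P(Y = 1 | X)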
One of the significant innovations in deep learning is the introduction of libraries to simplify the design and training of neural networks. These libraries allow users to easily describe complex network structures and then automatically derive the optimization procedures needed to train these networks.
In the following we will use PyTorch to implement such a network.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam, SGD
PyTorch is a lot like NumPy in that you can express interesting computation in terms of tensors.
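For example (a quick illustration, not used later), familiar NumPy-style operations work directly on tensors:
t = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
t.sum(), t.mean(dim=0), t @ t  # reductions and matrix multiplication, just like NumPy
Below we wrap our training data in tensors and a TensorDataset: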
tX = torch.from_numpy(data[["X1", "X2"]].to_numpy())
tY = torch.from_numpy(data["Y"].to_numpy()).long()
dataset = TensorDataset(tX, tY)
dataset[0]
(tensor([-1.5972, 1.5981], dtype=torch.float64), tensor(1))
def stochastic_gradient_descent(model, loss_fn, dataset, l2reg=1e-5, lr=1.0, nepochs=200, batch_size=10):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # Despite the function name, we use the Adam optimizer; weight_decay adds L2 regularization.
    opt = Adam(model.parameters(), lr=lr, weight_decay=l2reg)
    for i in range(nepochs):
        for (x, y) in loader:
            loss = loss_fn(model(x), y)  # forward pass on a mini-batch
            loss.backward()              # compute gradients
            opt.step()                   # update the parameters
            model.zero_grad()            # reset gradients before the next mini-batch
basicNN = nn.Sequential(
nn.Linear(2,4), nn.Sigmoid(), nn.Linear(4,2)
).double()
loss_fn = nn.CrossEntropyLoss()
stochastic_gradient_descent(basicNN, loss_fn, dataset, l2reg=1e-4, lr=.5)
def softmax_predict(model, npX):
    with torch.no_grad():
        return F.softmax(model.forward(torch.from_numpy(npX)), dim=1).numpy()
go.Figure([pos_scatter, neg_scatter,
plot_predictions(lambda X: softmax_predict(basicNN, X)[:,1])])
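As a quick check (a small sketch reusing softmax_predict), we can compare the network's predictions against the training labels:
# Fraction of training points the small network classifies correctly.
preds = softmax_predict(basicNN, data[["X1", "X2"]].to_numpy()).argmax(axis=1)
np.mean(preds == data["Y"].to_numpy())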
In this lecture we are going to introduce PyTorch. Learning PyTorch is sort of like learning how to use Thor's hammer: it is way overkill for basically everything you will do and is probably the wrong solution to most problems you will encounter. However, it is also really powerful and will give you the skills needed to take on very challenging problems.
import torch
Defining a variable $\theta$ with an initial value of 1.0:
theta = torch.tensor([1.0], requires_grad=True, dtype=torch.float64)
theta
tensor([1.], dtype=torch.float64, requires_grad=True)
Suppose we compute the following value from our tensor theta:
z = (1 - torch.log(1 + torch.exp(theta)))**2
z
tensor([0.0981], dtype=torch.float64, grad_fn=<PowBackward0>)
Notice that every derived value has an attached gradient function that is used to compute the backward pass.
z.grad_fn
<PowBackward0 at 0x7fd1c84151c0>
z.grad_fn.next_functions
((<RsubBackward1 at 0x7fd1c84150a0>, 0),)
z.grad_fn.next_functions[0][0].next_functions
((<LogBackward at 0x7fd1c8415af0>, 0),)
We can visualize these functions using torchviz.
# !pip install torchviz
# !brew install graphviz
from torchviz import make_dot
make_dot(z)
These backward functions tell Torch how to compute the gradient via the chain rule. This is done by invoking backward on the computed value.
z.backward()
theta.grad
tensor([0.4580], dtype=torch.float64)
We can use item to extract a single value.
theta.grad.item()
0.4580252880326174
We can compare this with the hand-computed derivative:
\begin{align} \frac{\partial z}{\partial\theta} &= \frac{\partial}{\partial\theta}\left(1 - \log\left(1 + \exp(\theta)\right)\right)^2 \\ & = 2\left(1 - \log\left(1 + \exp(\theta)\right)\right)\frac{\partial}{\partial\theta} \left(1 - \log\left(1 + \exp(\theta)\right)\right)\\ & = 2\left(1 - \log\left(1 + \exp(\theta)\right)\right) (-1) \frac{\partial}{\partial\theta} \log\left(1 + \exp(\theta)\right) \\ & = 2\left(1 - \log\left(1 + \exp(\theta)\right)\right) \frac{-1}{1 + \exp(\theta)}\frac{\partial}{\partial\theta}\left(1 + \exp(\theta)\right) \\ & = 2\left(1 - \log\left(1 + \exp(\theta)\right)\right) \frac{-1}{1 + \exp(\theta)}\exp(\theta) \\ & = -2\left(1 - \log\left(1 + \exp(\theta)\right)\right) \frac{\exp(\theta)}{1 + \exp(\theta)} \end{align}
def z_derivative(theta):
    return -2 * (1 - np.log(1 + np.exp(theta))) * np.exp(theta) / (1. + np.exp(theta))
z_derivative(1.)
0.45802528803261744
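One detail worth noting (a small sketch, not part of the derivation above): gradients accumulate across calls to backward, which is why the training loops in this notebook reset them with zero_grad between steps.
theta2 = torch.tensor([1.0], requires_grad=True, dtype=torch.float64)
for _ in range(2):
    z2 = (1 - torch.log(1 + torch.exp(theta2)))**2  # rebuild the graph each iteration
    z2.backward()
print(theta2.grad)   # roughly 2 * 0.4580, because the two backward passes accumulate
theta2.grad.zero_()  # reset the gradient, as an optimizer's zero_grad would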
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device
device(type='cpu')
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor()
)
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)
labels_map = {
0: "T-Shirt",
1: "Trouser",
2: "Pullover",
3: "Dress",
4: "Coat",
5: "Sandal",
6: "Shirt",
7: "Sneaker",
8: "Bag",
9: "Ankle Boot",
}
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(training_data), size=(1,)).item()
    img, label = training_data[sample_idx]
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()
# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=64)
test_dataloader = DataLoader(test_data, batch_size=64)
for X, y in train_dataloader:
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)
    break
Shape of X [N, C, H, W]:  torch.Size([64, 1, 28, 28])
Shape of y:  torch.Size([64]) torch.int64
class FashionCNN(nn.Module):
    def __init__(self):
        super(FashionCNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        # After the two conv/pool blocks each image is 64 channels of 6x6 (see the shape check below).
        self.fc1 = nn.Linear(in_features=64*6*6, out_features=600)
        self.drop = nn.Dropout2d(0.25)
        self.fc2 = nn.Linear(in_features=600, out_features=120)
        self.fc3 = nn.Linear(in_features=120, out_features=10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.view(out.size(0), -1)  # flatten the spatial dimensions
        out = self.fc1(out)
        out = self.drop(out)
        out = self.fc2(out)
        out = F.relu(out)
        out = self.fc3(out)
        return out
model = FashionCNN()
model.to(device)
error = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
print(model)
FashionCNN(
  (layer1): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc1): Linear(in_features=2304, out_features=600, bias=True)
  (drop): Dropout2d(p=0.25, inplace=False)
  (fc2): Linear(in_features=600, out_features=120, bias=True)
  (fc3): Linear(in_features=120, out_features=10, bias=True)
)
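To see where the 64*6*6 input size of fc1 comes from (a quick shape check, not part of the original training code): the 28x28 image stays 28x28 after the padded convolution, halves to 14x14 after pooling, shrinks to 12x12 after the unpadded 3x3 convolution, and halves again to 6x6.
# Trace the spatial dimensions through the two conv/pool blocks using a dummy batch.
with torch.no_grad():
    dummy = torch.zeros(1, 1, 28, 28, device=device)
    h1 = model.layer1(dummy)
    h2 = model.layer2(h1)
    print(h1.shape, h2.shape)  # expect (1, 32, 14, 14) and (1, 64, 6, 6)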
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def test(dataloader, model):
    size = len(dataloader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    model.train()
    # Note: test_loss sums per-batch mean losses and then divides by the number of examples,
    # so the reported "Avg loss" is scaled down by roughly the batch size.
    test_loss /= size
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
loss_fn = nn.CrossEntropyLoss()
epochs = 2
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model)
print("Done!")
Epoch 1
-------------------------------
loss: 2.332163 [    0/60000]
loss: 0.415294 [ 6400/60000]
loss: 0.201616 [12800/60000]
loss: 0.523889 [19200/60000]
loss: 0.402491 [25600/60000]
loss: 0.437899 [32000/60000]
loss: 0.241790 [38400/60000]
loss: 0.527170 [44800/60000]
loss: 0.332533 [51200/60000]
loss: 0.238288 [57600/60000]
Test Error: 
 Accuracy: 88.1%, Avg loss: 0.005095 

Epoch 2
-------------------------------
loss: 0.114494 [    0/60000]
loss: 0.383726 [ 6400/60000]
loss: 0.148336 [12800/60000]
loss: 0.298600 [19200/60000]
loss: 0.400109 [25600/60000]
loss: 0.375320 [32000/60000]
loss: 0.182450 [38400/60000]
loss: 0.407039 [44800/60000]
loss: 0.264054 [51200/60000]
loss: 0.195068 [57600/60000]
Test Error: 
 Accuracy: 89.8%, Avg loss: 0.004565 

Done!
Finally, we evaluate the trained model by collecting the test images it misclassifies:
error_images = torch.Tensor()
error_labels = torch.Tensor()
error_pred = torch.Tensor()
model.eval()
with torch.no_grad():
    for X, y in test_dataloader:
        X, y = X.to(device), y.to(device)
        pred = model(X).argmax(1)
        errors = pred != y
        error_images = torch.cat([error_images, X[errors,:,:,:]])
        error_labels = torch.cat([error_labels, y[errors]])
        error_pred = torch.cat([error_pred, pred[errors]])
figure = plt.figure(figsize=(8, 8))
cols, rows = 3, 3
for i in range(1, cols * rows + 1):
    sample_idx = torch.randint(len(error_labels), size=(1,)).item()
    img = error_images[sample_idx]
    label = error_labels[sample_idx].item()
    pred = error_pred[sample_idx].item()
    figure.add_subplot(rows, cols, i)
    plt.title(labels_map[label] + ", pred=" + labels_map[pred])
    plt.axis("off")
    plt.imshow(img.squeeze(), cmap="gray")
plt.show()