PyTorch


This lesson covers the Build the Neural Network section of the PyTorch Basics tutorial, https://pytorch.org/tutorials/beginner/basics/intro.html

Build Neural Network

  • Neural networks are composed of layers/modules that perform operations on data
  • The torch.nn namespace provides all the building blocks you need to build your own network
  • Every module in PyTorch subclasses nn.Module
  • A neural network is itself a module that consists of other modules (layers)
  • This nested structure makes it easy to build and manage complex architectures (a short sketch follows this list)
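
As a minimal sketch of this nesting (a toy example of my own, not part of the lesson code): assigning a module as an attribute of another module registers it, so the parameters of nested modules are tracked automatically.

import torch
from torch import nn

class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 2)    # submodule registered on assignment

    def forward(self, x):
        return self.fc(x)

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.block = TinyBlock()     # a module built from another module

    def forward(self, x):
        return self.block(x)

net = TinyNet()
print([name for name, _ in net.named_parameters()])
# ['block.fc.weight', 'block.fc.bias']
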
topic = "pytorch"
lesson = 5

from n import *
home, models_path = get_project_dir("FashionMNIST")
print(home)
/home/naneja/datasets/n/FashionMNIST
import os

import torch
from torch import nn
from torch.utils.data import DataLoader

from torchvision import datasets, transforms

Getting device for training

# Get device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print_(f"Using {device} device")

Using cuda device
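
If CUDA is not available, newer PyTorch builds (1.12 or later, an assumption about the installed version) also expose Apple's MPS backend; a hedged variant of the device check could look like this:

# variant device check (assumes PyTorch >= 1.12 so that torch.backends.mps exists)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")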

Defining Class

  • We define our neural network by subclassing nn.Module, and initialize the neural network layers in __init__
  • Every nn.Module subclass implements the operations on input data in the forward method.
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()

        # flatten each 28x28 image into a 784-value vector (batch dim is kept)
        self.flatten = nn.Flatten()

        # fully connected layers with ReLU activations; the last layer
        # outputs 10 raw scores (logits), one per FashionMNIST class
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork().to(device)
print(model)
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
  • To use the model, we pass it the input data
  • This executes the model’s forward, along with some background operations
  • Do not call model.forward() directly!
  • Calling the model on the input returns a tensor of 10 raw predicted values (logits) per sample, one for each class
  • We get the prediction probabilities by passing it through an instance of the nn.Softmax module
X = torch.rand(1, 28, 28, device=device)

logits = model(X)
print_(logits)

pred_probab = nn.Softmax(dim=1)(logits)
print_(pred_probab)

y_pred = pred_probab.argmax(1)

print_(f"Predicted class: {y_pred}")

tensor([[-0.1029, 0.0946, 0.0751, -0.0528, -0.0809, -0.0680, 0.0466, -0.1100, 0.0981, 0.0557]], device='cuda:0', grad_fn=<AddmmBackward0>)

tensor([[0.0903, 0.1100, 0.1079, 0.0950, 0.0923, 0.0935, 0.1049, 0.0897, 0.1104, 0.1058]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

Predicted class: tensor([8], device='cuda:0')
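
As a hedged aside (not from the tutorial): model(X) goes through nn.Module.__call__, which runs forward plus any registered hooks, which is why model.forward() should not be called directly. For pure inference you would also typically switch to eval mode and disable gradient tracking; this sketch reuses the model and X defined above:

model.eval()                  # inference mode for layers like Dropout/BatchNorm (none here)
with torch.no_grad():         # no gradient tracking needed for prediction
    pred = nn.Softmax(dim=1)(model(X)).argmax(1)
print(pred)
model.train()                 # restore training mode for the rest of the lesson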

Model Layers

# sample minibatch of 3 images of size 28x28

input_image = torch.rand(3,28,28)

print(input_image.size())
torch.Size([3, 28, 28])

nn.Flatten

  • We initialize the nn.Flatten layer to convert each 2D 28x28 image into a contiguous array of 784 pixel values (the minibatch dimension at dim=0 is maintained)
flatten = nn.Flatten()

flat_image = flatten(input_image)

print_(flat_image.size())

torch.Size([3, 784])
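
By default nn.Flatten flattens from start_dim=1 to end_dim=-1, which is why the minibatch dimension survives; as a quick check of my own (not in the tutorial), flattening from dim 0 collapses the batch as well:

flatten_all = nn.Flatten(start_dim=0)
print(flatten_all(input_image).size())
# torch.Size([2352])  since 3*28*28 = 2352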

nn.Linear

  • applies a linear transformation on the input using its stored weights and biases
input_image = torch.rand(3,28,28)

flat_image = nn.Flatten()(input_image)

layer1 = nn.Linear(in_features=28*28, out_features=20)


hidden1 = layer1(flat_image)

print_(f"input_image.shape={input_image.shape}")
print_(f"flat_image.shape={flat_image.shape}")
print_(f"hidden1.size()={hidden1.size()}")

input_image.shape = torch.Size([3, 28, 28])

flat_image.shape = torch.Size([3, 784])

hidden1.size() = torch.Size([3, 20])
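
To make the "stored weights and biases" concrete, here is a small check of my own (not part of the tutorial) that layer1 computes flat_image @ W.T + b, with weight shape (out_features, in_features) = (20, 784) and bias shape (20,):

print(layer1.weight.shape)   # torch.Size([20, 784])
print(layer1.bias.shape)     # torch.Size([20])

manual = flat_image @ layer1.weight.T + layer1.bias
print(torch.allclose(hidden1, manual, atol=1e-6))   # should print True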

nn.ReLU

  • non-linear activations are what create the complex mappings between the model’s inputs and outputs
  • applied after linear transformations to introduce nonlinearity
hidden1 = layer1(flat_image)

print_(f"Before ReLU: {hidden1}\n\n")

hidden1 = nn.ReLU()(hidden1)

print_(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 8.2914e-02, 6.5140e-01, -4.7872e-01, 4.2423e-01, 3.1512e-01, -4.6439e-01, -3.0369e-01, 3.0793e-01, -4.0984e-01, -5.4728e-02, -3.8561e-01, 4.6029e-01, -7.4019e-01, -4.5039e-01, -2.5392e-01, 3.7243e-01, -1.9209e-01, 2.3408e-01, -2.2391e-01, 6.9345e-02], [ 2.4752e-02, 6.1227e-01, -1.8831e-01, 2.3573e-01, -2.5565e-01, 7.4215e-05, -3.0860e-01, 1.2343e-01, -4.1598e-01, -2.2467e-02, -3.6999e-01, 6.1501e-01, -4.4200e-01, -3.9115e-01, 1.4411e-01, 4.6714e-03, 1.9971e-01, 1.3158e-01, -1.5105e-01, 1.5163e-01], [ 2.1116e-01, 2.4611e-01, -7.2255e-01, 5.3163e-01, 3.7469e-01, 9.7769e-03, -2.7197e-01, 2.4071e-01, -7.0653e-01, 4.4505e-02, -4.4111e-01, 1.3184e-01, -4.0745e-01, -4.7977e-01, -3.3783e-01, 8.8514e-03, -1.7382e-01, -2.4588e-01, -5.8584e-02, 1.9456e-01]], grad_fn = <AddmmBackward0>)

After ReLU: tensor([[8.2914e-02, 6.5140e-01, 0.0000e+00, 4.2423e-01, 3.1512e-01, 0.0000e+00, 0.0000e+00, 3.0793e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 4.6029e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.7243e-01, 0.0000e+00, 2.3408e-01, 0.0000e+00, 6.9345e-02], [2.4752e-02, 6.1227e-01, 0.0000e+00, 2.3573e-01, 0.0000e+00, 7.4215e-05, 0.0000e+00, 1.2343e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 6.1501e-01, 0.0000e+00, 0.0000e+00, 1.4411e-01, 4.6714e-03, 1.9971e-01, 1.3158e-01, 0.0000e+00, 1.5163e-01], [2.1116e-01, 2.4611e-01, 0.0000e+00, 5.3163e-01, 3.7469e-01, 9.7769e-03, 0.0000e+00, 2.4071e-01, 0.0000e+00, 4.4505e-02, 0.0000e+00, 1.3184e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 8.8514e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.9456e-01]], grad_fn = <ReluBackward0>)
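
ReLU simply zeroes out negative activations, i.e. ReLU(x) = max(x, 0); a one-line sanity check of my own (not from the tutorial):

before = layer1(flat_image)
print(torch.equal(nn.ReLU()(before), torch.clamp(before, min=0)))   # True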

nn.Sequential

  • nn.Sequential is an ordered container of modules
  • data is passed through all the modules in the same order as defined
  • You can use sequential containers to put together a quick network like seq_modules
flatten = nn.Flatten()

layer1 = nn.Linear(28*28, 20)

seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10)
)

input_image = torch.rand(3,28,28)

logits = seq_modules(input_image)

print_(f"input_image.shape={input_image.shape}")
print_(f"logits = {logits.shape}")
print(logits)

input_image.shape = torch.Size([3, 28, 28])

logits = torch.Size([3, 10])

tensor([[-0.4244, -0.2037, -0.0259, -0.1199,  0.0630,  0.0659, -0.2121,  0.1854,
         -0.1691,  0.1373],
        [-0.5047, -0.0754, -0.2049, -0.0444,  0.1256,  0.1779, -0.0863,  0.3558,
         -0.2717, -0.0196],
        [-0.4483,  0.0025, -0.1424, -0.0711,  0.0945,  0.0415, -0.1040,  0.2683,
         -0.2485,  0.1202]], grad_fn=<AddmmBackward0>)
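
If you prefer named sub-layers over numeric indices, nn.Sequential also accepts an OrderedDict; a small sketch (the layer names "flatten", "fc1", "relu", "fc2" are my own choices):

from collections import OrderedDict

named_seq = nn.Sequential(OrderedDict([
    ("flatten", nn.Flatten()),
    ("fc1", nn.Linear(28*28, 20)),
    ("relu", nn.ReLU()),
    ("fc2", nn.Linear(20, 10)),
]))

print(named_seq.fc1)
# Linear(in_features=784, out_features=20, bias=True)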

nn.Softmax

  • The last linear layer of the neural network returns logits, raw values in (-∞, ∞)
  • Logits are passed to the nn.Softmax module
  • The logits are scaled to values in [0, 1] representing the model’s predicted probabilities for each class
  • The dim parameter indicates the dimension along which the values must sum to 1
x = [[1, 2], [3,4]]
x = torch.tensor(x, dtype=torch.float)
print_("x")
print(x)

x_prob_dim_0 = nn.Softmax(dim=0)(x)
print_("x_prob_dim_0")
print(x_prob_dim_0)

x_prob_dim_1 = nn.Softmax(dim=1)(x)
print_("x_prob_dim_1")
print(x_prob_dim_1)

x

tensor([[1., 2.],
        [3., 4.]])

x_prob_dim_0

tensor([[0.1192, 0.1192],
        [0.8808, 0.8808]])

x_prob_dim_1

tensor([[0.2689, 0.7311],
        [0.2689, 0.7311]])
softmax = nn.Softmax(dim=1)

pred_probab = softmax(logits)

print_("pred_probab")
print(pred_probab)

pred_probab

tensor([[0.0691, 0.0861, 0.1029, 0.0937, 0.1125, 0.1128, 0.0854, 0.1271, 0.0892,
         0.1212],
        [0.0621, 0.0954, 0.0838, 0.0984, 0.1167, 0.1229, 0.0944, 0.1469, 0.0784,
         0.1009],
        [0.0659, 0.1034, 0.0894, 0.0960, 0.1133, 0.1075, 0.0929, 0.1348, 0.0804,
         0.1163]], grad_fn=<SoftmaxBackward0>)
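
Since dim=1 was used, every row of pred_probab sums to 1; a quick verification of my own:

print(pred_probab.sum(dim=1))
# each entry is (numerically) 1.0, one per sample in the batch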

Model Parameters

  • Many layers inside a neural network are parameterized, i.e. have associated weights and biases that are optimized during training
  • Subclassing nn.Module automatically tracks all fields defined inside your model object, and makes all parameters accessible using your model’s parameters() or named_parameters() methods

model = NeuralNetwork().to(device)

print_("Model structure")
print(model)
print("************")

for name, param in model.named_parameters():
    print_(f"Layer: {name}")
    print_(f"Size: {param.size()}") # size
    print(f"Values : {param[:2]}")
    print("************")

Model structure

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
************

Layer: linear_relu_stack.0.weight

Size: torch.Size([512, 784])

Values : tensor([[-0.0114, -0.0242, -0.0245,  ..., -0.0102,  0.0090,  0.0155],
        [ 0.0260,  0.0192,  0.0096,  ..., -0.0271,  0.0137,  0.0001]],
       device='cuda:0', grad_fn=<SliceBackward0>)
************

Layer: linear_relu_stack.0.bias

Size: torch.Size([512])

Values : tensor([0.0316, 0.0061], device='cuda:0', grad_fn=<SliceBackward0>)
************

Layer: linear_relu_stack.2.weight

Size: torch.Size([512, 512])

Values : tensor([[-0.0117, -0.0304,  0.0036,  ..., -0.0421,  0.0010, -0.0002],
        [ 0.0037, -0.0117, -0.0050,  ..., -0.0013,  0.0095,  0.0117]],
       device='cuda:0', grad_fn=<SliceBackward0>)
************

Layer: linear_relu_stack.2.bias

Size: torch.Size([512])

Values : tensor([-0.0376, -0.0053], device='cuda:0', grad_fn=<SliceBackward0>)
************

Layer: linear_relu_stack.4.weight

Size: torch.Size([10, 512])

Values : tensor([[-0.0398,  0.0318,  0.0068,  ...,  0.0177,  0.0350,  0.0077],
        [-0.0229, -0.0109,  0.0351,  ...,  0.0355,  0.0282,  0.0047]],
       device='cuda:0', grad_fn=<SliceBackward0>)
************

Layer: linear_relu_stack.4.bias

Size: torch.Size([10])

Values : tensor([0.0382, 0.0035], device='cuda:0', grad_fn=<SliceBackward0>)
************
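
A handy follow-up (my own addition, not in the tutorial) is to count the trainable parameters directly from parameters():

n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {n_params:,}")
# 784*512 + 512 + 512*512 + 512 + 512*10 + 10 = 669,706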