PyTorch
This lesson covers the PyTorch Tutorial: https://pytorch.org/tutorials/beginner/basics/intro.html
Build Neural Network
- Neural networks comprise layers/modules that perform operations on data
- The torch.nn namespace provides all the building blocks needed to build a neural network
- Every module in PyTorch subclasses nn.Module
- A neural network is a module itself that consists of other modules (layers)
- This nested structure allows building and managing complex architectures easily, as the small sketch below illustrates
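- A minimal sketch of this nesting (the TinyBlock and TinyNet names below are illustrative, not part of the tutorial): one nn.Module holds other modules as attributes and reuses them in its forward
import torch
from torch import nn

class TinyBlock(nn.Module):
    # a small reusable sub-module: Linear followed by ReLU
    def __init__(self, in_features, out_features):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.act = nn.ReLU()

    def forward(self, x):
        return self.act(self.linear(x))

class TinyNet(nn.Module):
    # a module built out of other modules (nested structure)
    def __init__(self):
        super().__init__()
        self.block = TinyBlock(8, 4)
        self.head = nn.Linear(4, 2)

    def forward(self, x):
        return self.head(self.block(x))

print(TinyNet())  # the printed repr shows the nested submodules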
topic = "pytorch"
lesson = 5
from n import *
home, models_path = get_project_dir("FashionMNIST")
print(home)
/home/naneja/datasets/n/FashionMNIST
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
Getting Device for Training
# Get device for training
device = "cuda" if torch.cuda.is_available() else "cpu"
print_(f"Using {device} device")
Using cuda device
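- On a machine without CUDA, the same idea extends to Apple-silicon GPUs; a minimal sketch, assuming a PyTorch build with MPS support (1.12+)
# hedged sketch: prefer CUDA, then Apple MPS, then fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")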
Defining Class
- We define our neural network by subclassing nn.Module and initialize the neural network layers in __init__
- Every nn.Module subclass implements the operations on input data in the forward method
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
model = NeuralNetwork().to(device)
print(model)
NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
- To use the model, we pass it the input data
- This executes the model’s forward, along with some background operations
- Do not call model.forward() directly!
- Calling the model on the input returns a tensor of 10 raw predicted values (logits), one for each class
- We get the prediction probabilities by passing the logits through an instance of the nn.Softmax module
X = torch.rand(1, 28, 28, device=device)
logits = model(X)
print_(logits)
pred_probab = nn.Softmax(dim=1)(logits)
print_(pred_probab)
y_pred = pred_probab.argmax(1)
print_(f"Predicted class: {y_pred}")
tensor([[-0.1029, 0.0946, 0.0751, -0.0528, -0.0809, -0.0680, 0.0466, -0.1100, 0.0981, 0.0557]], device = 'cuda:0', grad_fn = <AddmmBackward0>)
tensor([[0.0903, 0.1100, 0.1079, 0.0950, 0.0923, 0.0935, 0.1049, 0.0897, 0.1104, 0.1058]], device = 'cuda:0', grad_fn = <SoftmaxBackward0>)
Predicted class: tensor([8], device = 'cuda:0')
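- For pure inference it is common to switch the model to eval mode and disable gradient tracking; a minimal sketch of the same forward pass without autograd bookkeeping
model.eval()              # eval mode (matters for dropout/batchnorm layers; none here, but good practice)
with torch.no_grad():     # no grad_fn is recorded on the outputs
    logits = model(X)
    pred_probab = nn.Softmax(dim=1)(logits)
print(pred_probab.argmax(1))
model.train()             # restore training mode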
Model Layers
# sample minibatch of 3 images of size 28x28
input_image = torch.rand(3,28,28)
print(input_image.size())
torch.Size([3, 28, 28])
nn.Flatten
- We initialize the nn.Flatten layer to convert each 2D 28x28 image into a contiguous array of 784 pixel values (the minibatch dimension at dim=0 is maintained)
flatten = nn.Flatten()
flat_image = flatten(input_image)
print_(flat_image.size())
torch.Size([3, 784])
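- The same flattening can be done with torch.flatten or reshape; a quick sketch checking that the results match
# hedged sketch: nn.Flatten() with its default start_dim=1 is equivalent to these
alt_flat = torch.flatten(input_image, start_dim=1)
print(alt_flat.shape)                                        # torch.Size([3, 784])
print(torch.equal(flat_image, alt_flat))                     # True
print(torch.equal(flat_image, input_image.reshape(3, -1)))   # True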
nn.Linear
- nn.Linear applies a linear transformation to the input using its stored weights and biases
input_image = torch.rand(3,28,28)
flat_image = nn.Flatten()(input_image)
layer1 = nn.Linear(in_features=28*28, out_features=20)
hidden1 = layer1(flat_image)
print_(f"input_image.shape={input_image.shape}")
print_(f"flat_image.shape={flat_image.shape}")
print_(f"hidden1.size()={hidden1.size()}")
input_image.shape = torch.Size([3, 28, 28])
flat_image.shape = torch.Size([3, 784])
hidden1.size() = torch.Size([3, 20])
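- Under the hood, nn.Linear stores a weight of shape (out_features, in_features) and a bias of shape (out_features,), and computes x @ W.T + b; a sketch reproducing hidden1 manually
print(layer1.weight.shape)   # torch.Size([20, 784])
print(layer1.bias.shape)     # torch.Size([20])

# the manual affine transform should match the layer output (up to float tolerance)
manual = flat_image @ layer1.weight.T + layer1.bias
print(torch.allclose(manual, hidden1))   # True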
nn.ReLU
- Non-linear activations create the complex mappings between the model’s inputs and outputs
- They are applied after linear transformations to introduce nonlinearity
hidden1 = layer1(flat_image)
print_(f"Before ReLU: {hidden1}\n\n")
hidden1 = nn.ReLU()(hidden1)
print_(f"After ReLU: {hidden1}")
Before ReLU: tensor([[ 8.2914e-02, 6.5140e-01, -4.7872e-01, 4.2423e-01, 3.1512e-01, -4.6439e-01, -3.0369e-01, 3.0793e-01, -4.0984e-01, -5.4728e-02, -3.8561e-01, 4.6029e-01, -7.4019e-01, -4.5039e-01, -2.5392e-01, 3.7243e-01, -1.9209e-01, 2.3408e-01, -2.2391e-01, 6.9345e-02], [ 2.4752e-02, 6.1227e-01, -1.8831e-01, 2.3573e-01, -2.5565e-01, 7.4215e-05, -3.0860e-01, 1.2343e-01, -4.1598e-01, -2.2467e-02, -3.6999e-01, 6.1501e-01, -4.4200e-01, -3.9115e-01, 1.4411e-01, 4.6714e-03, 1.9971e-01, 1.3158e-01, -1.5105e-01, 1.5163e-01], [ 2.1116e-01, 2.4611e-01, -7.2255e-01, 5.3163e-01, 3.7469e-01, 9.7769e-03, -2.7197e-01, 2.4071e-01, -7.0653e-01, 4.4505e-02, -4.4111e-01, 1.3184e-01, -4.0745e-01, -4.7977e-01, -3.3783e-01, 8.8514e-03, -1.7382e-01, -2.4588e-01, -5.8584e-02, 1.9456e-01]], grad_fn = <AddmmBackward0>)
After ReLU: tensor([[8.2914e-02, 6.5140e-01, 0.0000e+00, 4.2423e-01, 3.1512e-01, 0.0000e+00, 0.0000e+00, 3.0793e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 4.6029e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 3.7243e-01, 0.0000e+00, 2.3408e-01, 0.0000e+00, 6.9345e-02], [2.4752e-02, 6.1227e-01, 0.0000e+00, 2.3573e-01, 0.0000e+00, 7.4215e-05, 0.0000e+00, 1.2343e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 6.1501e-01, 0.0000e+00, 0.0000e+00, 1.4411e-01, 4.6714e-03, 1.9971e-01, 1.3158e-01, 0.0000e+00, 1.5163e-01], [2.1116e-01, 2.4611e-01, 0.0000e+00, 5.3163e-01, 3.7469e-01, 9.7769e-03, 0.0000e+00, 2.4071e-01, 0.0000e+00, 4.4505e-02, 0.0000e+00, 1.3184e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 8.8514e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.9456e-01]], grad_fn = <ReluBackward0>)
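- nn.ReLU simply zeroes out negative values; the functional torch.relu and clamp(min=0) give identical results, as this sketch checks
z = layer1(flat_image)
print(torch.equal(nn.ReLU()(z), torch.relu(z)))    # True
print(torch.equal(nn.ReLU()(z), z.clamp(min=0)))   # True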
nn.Sequential
- nn.Sequential is an ordered container of modules
- Data is passed through all the modules in the same order as defined
- You can use sequential containers to put together a quick network like seq_modules
flatten = nn.Flatten()
layer1 = nn.Linear(28*28, 20)
seq_modules = nn.Sequential(
flatten,
layer1,
nn.ReLU(),
nn.Linear(20, 10)
)
input_image = torch.rand(3,28,28)
logits = seq_modules(input_image)
print_(f"input_image.shape={input_image.shape}")
print_(f"logits = {logits.shape}")
print(logits)
input_image.shape = torch.Size([3, 28, 28])
logits = torch.Size([3, 10])
tensor([[-0.4244, -0.2037, -0.0259, -0.1199, 0.0630, 0.0659, -0.2121, 0.1854,
-0.1691, 0.1373],
[-0.5047, -0.0754, -0.2049, -0.0444, 0.1256, 0.1779, -0.0863, 0.3558,
-0.2717, -0.0196],
[-0.4483, 0.0025, -0.1424, -0.0711, 0.0945, 0.0415, -0.1040, 0.2683,
-0.2485, 0.1202]], grad_fn=<AddmmBackward0>)
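- If named submodules are preferred over numeric indices, nn.Sequential also accepts an OrderedDict; a sketch of the same stack with named layers (fc1/fc2 are illustrative names)
from collections import OrderedDict

named_seq = nn.Sequential(OrderedDict([
    ("flatten", nn.Flatten()),
    ("fc1", nn.Linear(28*28, 20)),
    ("relu", nn.ReLU()),
    ("fc2", nn.Linear(20, 10)),
]))
print(named_seq.fc1)                            # Linear(in_features=784, out_features=20, bias=True)
print(named_seq(torch.rand(3, 28, 28)).shape)   # torch.Size([3, 10])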
nn.Softmax
- The last linear layer of the neural network returns logits, raw values in (-∞, ∞)
- The logits are passed to the nn.Softmax module
- They are scaled to values in [0, 1] representing the model’s predicted probabilities for each class
- The dim parameter indicates the dimension along which the values must sum to 1
x = [[1, 2], [3, 4]]
x = torch.tensor(x, dtype=torch.float)
print_("x")
print(x)
x_prob_dim_0 = nn.Softmax(dim=0)(x)
print_("x_prob_dim_0")
print(x_prob_dim_0)
x_prob_dim_1 = nn.Softmax(dim=1)(x)
print_("x_prob_dim_1")
print(x_prob_dim_1)
x
tensor([[1., 2.],
[3., 4.]])
x_prob_dim_0
tensor([[0.1192, 0.1192],
[0.8808, 0.8808]])
x_prob_dim_1
tensor([[0.2689, 0.7311],
[0.2689, 0.7311]])
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)
print_("pred_probab")
print(pred_probab)
pred_probab
tensor([[0.0691, 0.0861, 0.1029, 0.0937, 0.1125, 0.1128, 0.0854, 0.1271, 0.0892,
0.1212],
[0.0621, 0.0954, 0.0838, 0.0984, 0.1167, 0.1229, 0.0944, 0.1469, 0.0784,
0.1009],
[0.0659, 0.1034, 0.0894, 0.0960, 0.1133, 0.1075, 0.0929, 0.1348, 0.0804,
0.1163]], grad_fn=<SoftmaxBackward0>)
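- Because dim=1 was used, each row of pred_probab sums to 1; a quick sketch to verify, noting that the tensor method logits.softmax(dim=1) gives the same result
print(pred_probab.sum(dim=1))                              # each row sums to 1 (up to float error)
print(torch.allclose(pred_probab, logits.softmax(dim=1)))  # True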
Model Parameters
- Many layers inside a neural network are parameterized, i.e. have associated weights and biases that are optimized during training
- Subclassing nn.Module automatically tracks all fields defined inside your model object and makes all parameters accessible via your model’s parameters() or named_parameters() methods
model = NeuralNetwork().to(device)
print_("Model structure")
print(model)
print("************")
for name, param in model.named_parameters():
    print_(f"Layer: {name}")
    print_(f"Size: {param.size()}")
    print(f"Values : {param[:2]}")   # first two rows/entries of the parameter tensor
    print("************")
Model structure
NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
************
Layer: linear_relu_stack.0.weight
Size: torch.Size([512, 784])
Values : tensor([[-0.0114, -0.0242, -0.0245, ..., -0.0102, 0.0090, 0.0155],
[ 0.0260, 0.0192, 0.0096, ..., -0.0271, 0.0137, 0.0001]],
device='cuda:0', grad_fn=<SliceBackward0>)
************
Layer: linear_relu_stack.0.bias
Size: torch.Size([512])
Values : tensor([0.0316, 0.0061], device='cuda:0', grad_fn=<SliceBackward0>)
************
Layer: linear_relu_stack.2.weight
Size: torch.Size([512, 512])
Values : tensor([[-0.0117, -0.0304, 0.0036, ..., -0.0421, 0.0010, -0.0002],
[ 0.0037, -0.0117, -0.0050, ..., -0.0013, 0.0095, 0.0117]],
device='cuda:0', grad_fn=<SliceBackward0>)
************
Layer: linear_relu_stack.2.bias
Size: torch.Size([512])
Values : tensor([-0.0376, -0.0053], device='cuda:0', grad_fn=<SliceBackward0>)
************
Layer: linear_relu_stack.4.weight
Size: torch.Size([10, 512])
Values : tensor([[-0.0398, 0.0318, 0.0068, ..., 0.0177, 0.0350, 0.0077],
[-0.0229, -0.0109, 0.0351, ..., 0.0355, 0.0282, 0.0047]],
device='cuda:0', grad_fn=<SliceBackward0>)
************
Layer: linear_relu_stack.4.bias
Size: torch.Size([10])
Values : tensor([0.0382, 0.0035], device='cuda:0', grad_fn=<SliceBackward0>)
************
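- A common follow-up is counting the parameters; a short sketch using parameters() and numel()
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total}")        # 669706 = (784*512 + 512) + (512*512 + 512) + (512*10 + 10)
print(f"Trainable parameters: {trainable}")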