PyTorch

1. Tensor

import torch
x = torch.FloatTensor(2, 3)  # uninitialized 2x3 float tensor (legacy constructor)
torch.IntTensor(2, 3)        # uninitialized 2x3 int tensor
x.type()                     # returns the type string, e.g. 'torch.FloatTensor'

Generate random data:

torch.rand(2, 3)   # torch.FloatTensor 2*3, uniform distribution on [0, 1)
torch.randn(2, 3)  # torch.FloatTensor 2*3, standard normal N(0, 1)
torch.arange(1, 4) # 1, 2, 3 (torch.range is deprecated; arange excludes the endpoint)
torch.zeros(2, 3)  # torch.FloatTensor 2*3, all zeros

Operations:

torch.abs(a)
c = torch.add(a, b)
torch.clamp(a, -0.1, 0.1)   # clamp all elements into [-0.1, 0.1]
torch.div(a, b)  # element-wise division
torch.mul(a, b)  # element-wise multiplication
torch.pow(a, 2)  
torch.mm(a, b)   # matrix multiplication
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = torch.randn(3, 3, dtype=torch.float64, device=device)
x = torch.zeros(1, requires_grad=True)

x = x.to(device) # .to() is not in-place for tensors; assign the result (often used to move data to the GPU)
model = MyModule(...).to(device) # modules, in contrast, are moved in-place
labels = labels.to(device=device, dtype=torch.int64)

tensor.data       # raw data view; prefer tensor.detach() in new code
cpu_imgs.cuda()   # cpu tensor -> gpu tensor
gpu_imgs.cpu()    # gpu tensor -> cpu tensor
torch.from_numpy(imgs) # numpy -> cpu tensor
cpu_imgs.numpy()    # cpu tensor -> numpy
# a GPU tensor cannot be converted to numpy directly; move it to the CPU first
loss_output.item()  # if the tensor is a scalar

torch.tensor vs torch.Tensor: torch.tensor infers the tensor type (dtype) from the input data, while torch.Tensor always creates a torch.FloatTensor.
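
A quick check of the difference:

x1 = torch.tensor([1, 2])    # dtype inferred from the data: torch.int64
x2 = torch.tensor([1., 2.])  # torch.float32
x3 = torch.Tensor([1, 2])    # always torch.float32, regardless of the input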

Multiple GPU:

net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) # device_ids: list of GPU ids
out = net(input)

# Or
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
model = torch.nn.DataParallel(model)

# A model wrapped in DataParallel stores the real model in model.module; save its state_dict
if len(gpu_ids) > 1:
  torch.save(model.module.state_dict(), "model.pth")
else:
  torch.save(model.state_dict(), "model.pth")

How to reshape the tensor:

Change the shape (view never copies data and requires a compatible memory layout; reshape returns a view when possible and copies otherwise):

import torch

a = torch.rand(4, 1, 28, 28)
print(a.shape)
print(a.view(4 * 1, 28, 28).shape)
print(a.reshape(4 * 1, 28, 28).shape)
print(a.reshape(4, 1 * 28 * 28).shape)
print(a.view(-1, 8).shape)
# output
torch.Size([4, 1, 28, 28])
torch.Size([4, 28, 28])
torch.Size([4, 28, 28])
torch.Size([4, 784])
torch.Size([392, 8])

Add a dimension

a = torch.randn(4, 1, 28, 28)
print(a.shape)
print(a.unsqueeze(0).shape)
print(a.unsqueeze(-1).shape)
print(a.unsqueeze(-5).shape)
# similar to unsqueeze
print(torch.rand(3,4,5)[..., None, None].shape)

# output
torch.Size([4, 1, 28, 28])
torch.Size([1, 4, 1, 28, 28])
torch.Size([4, 1, 28, 28, 1])
torch.Size([1, 4, 1, 28, 28])
torch.Size([3, 4, 5, 1, 1]) # add two dimensions at the end without changing the data order

Delete a dimension:

a = torch.Tensor(1, 4, 1, 9)
print(a.shape)
print(a.squeeze().shape)  # delete all the dimensions with size 1
print(a.squeeze(0).shape)
print(a.squeeze(2).shape) 
# output
torch.Size([1, 4, 1, 9])
torch.Size([4, 9])
torch.Size([4, 1, 9])
torch.Size([1, 4, 9])

Repeat the dimension:

b = torch.Tensor(1, 32, 1, 1)
print(b.shape)
b = b.repeat(4, 1, 14, 14)
print(b.shape)
# output
torch.Size([1, 32, 1, 1])
torch.Size([4, 32, 14, 14])

Transpose the dimension:

d = torch.Tensor(6, 3, 1, 2)
print(d.transpose(1, 3).shape) # torch.Size([6, 2, 1, 3])

a = torch.rand(4, 3, 6, 7)
print(a.permute(0, 2, 3, 1).shape) # torch.Size([4, 6, 7, 3])

2. How to build a NN

import torch
# initial parameters 
batch_n = 100
hidden_unit = 100
input_data = 1000
output_data = 10
epoch_n = 20
learning_rate = 1e-6
# initial weights
x = torch.randn(batch_n,input_data)
y = torch.randn(batch_n,output_data)
w1 = torch.randn(input_data,hidden_unit)
w2 = torch.randn(hidden_unit,output_data)
# Gradient descent; train the model
for epoch in range(epoch_n):
    h = x.mm(w1)        # 100*100
    h1 = h.clamp(min=0) # ReLU
    y_pred = h1.mm(w2)  # 100*10
    loss = (y_pred - y).pow(2).sum()
    print("Epoch:{} , Loss:{:.4f}".format(epoch, loss.item()))
    # backward pass, computed by hand
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h1.t().mm(grad_y_pred)
    
    grad_h = grad_y_pred.mm(w2.t())
    grad_h[h < 0] = 0   # gradient of ReLU: zero where the pre-activation was negative
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

3. Autograd

import torch
import torch.nn as nn
from torch.autograd import Variable

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
    
    def forward(self,input,w1,w2):
        x = torch.mm(input,w1)
        x = torch.clamp(x,min=0)
        x = torch.mm(x,w2)
        return x

model = Model()

# initial parameters
batch_n = 100
hidden_layer = 100
input_data = 1000
output_data = 10
 
x = Variable(torch.randn(batch_n, input_data), requires_grad=False)
y = Variable(torch.randn(batch_n, output_data), requires_grad=False)

w1 = Variable(torch.randn(input_data, hidden_layer), requires_grad=True)
w2 = Variable(torch.randn(hidden_layer, output_data), requires_grad=True)

# set train parameters
epoch_n = 20
learning_rate = 1e-6

for epoch in range(epoch_n):
    y_pred = model(x, w1, w2)
    loss = (y_pred - y).pow(2).sum()
    print("Epoch:{} , Loss:{:.4f}".format(epoch, loss.data[0]))
		
    loss.backward()
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
 		
    w1.grad.data.zero_()
    w2.grad.data.zero_()
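
Since PyTorch 0.4, Variable has been merged into Tensor, so the same loop can be written with plain tensors; a minimal sketch:

# PyTorch >= 0.4: tensors with requires_grad=True replace Variable
w1 = torch.randn(input_data, hidden_layer, requires_grad=True)
w2 = torch.randn(hidden_layer, output_data, requires_grad=True)

for epoch in range(epoch_n):
    y_pred = model(x, w1, w2)
    loss = (y_pred - y).pow(2).sum()
    print("Epoch:{} , Loss:{:.4f}".format(epoch, loss.item()))

    loss.backward()
    with torch.no_grad():  # update the weights without tracking gradients
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()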

4. PyTorch Programming

import argparse
import os
import numpy as np
import math
import itertools

import torchvision.transforms as transforms
from torchvision.utils import save_image

from torch.utils.data import DataLoader
from torchvision import datasets
from torch.autograd import Variable

import torch.nn as nn
import torch.nn.functional as F
import torch

os.makedirs("results/images/", exist_ok=True)
# set the parameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_epochs", type=int, default=200, help="number of epochs of training")
parser.add_argument("--batch_size", type=int, default=64, help="size of the batches")
parser.add_argument("--lr", type=float, default=0.0002, help="adam: learning rate")
parser.add_argument("--b1", type=float, default=0.5, help="adam: decay of first order momentum of gradient")
parser.add_argument("--b2", type=float, default=0.999, help="adam: decay of first order momentum of gradient")
opt = parser.parse_args()

4.1 Coding Flow

  1. Data

    • Prepare the training and test data; inputs and outputs; mini-batches
    • Tensor type (CPU or GPU)
    • Parameters (.yaml or parser)
  2. Model (NN structure)

    • Define the model structure $f: X\rightarrow \hat{Y}$
  3. Loss function, optimizer

    • cross entropy loss …
    • Adam …
  4. Training the model

    • Mini-batch training
    • Backward
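
Putting the four steps together, a minimal skeleton (dataset, Net, device, and n_epochs are placeholders here; the sections below fill in each step):

dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)  # 1. data
model = Net().to(device)                                                        # 2. model
criterion = torch.nn.CrossEntropyLoss()                                         # 3. loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)                       #    and optimizer
for epoch in range(n_epochs):                                                   # 4. training
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        loss = criterion(model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()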

4.2 Data Processing

# mnist for example
# Configure data loader
os.makedirs("../../data/mnist", exist_ok=True)
dataloader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../../data/mnist",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.Resize(opt.img_size), transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
        ),
    ),
    batch_size=opt.batch_size,
    shuffle=True,
)

cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if cuda else torch.LongTensor

Normally, the data-loading pattern is:

dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=32)
for i, data in enumerate(dataloader, 0):
    x, y = data

4.3 Construct the model

Normally, the model is defined as follows:

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

    def forward(self, input):
        return ...

mymodel = Model()

More concretely:

class Net(nn.Module):
    def __init__(self, input_size=2, hidden_size=100):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, input):
        output = F.elu(self.fc1(input))
        output = F.elu(self.fc2(output))
        output = self.fc3(output)
        return output

or

models = torch.nn.Sequential(
    torch.nn.Linear(input_data, hidden_layer),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_layer, output_data)
)

Move the model to CUDA if a GPU is available:

model = Net()

if cuda:
    model.cuda()

4.4 Loss function & optimizer

# Loss functions
criterion = torch.nn.MSELoss()
criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))

4.5 Train the model

for epoch in range(opt.n_epochs):
    for i, (imgs, labels) in enumerate(dataloader):
        batch_size = imgs.shape[0]
        # Configure input (Variable is unnecessary in PyTorch >= 0.4)
        X_data = imgs.type(FloatTensor)
        labels = labels.type(LongTensor)  # CrossEntropyLoss expects class indices, not one-hot labels

        y_pred = model(X_data)
        loss = criterion(y_pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

5 How to save the model

chkpt_name = 'mymodel'+'.pt'
torch.save({
        'traindata': traindata,
        'model_state_dict': model.state_dict()
    }, chkpt_name)

# load the model
checkpoint = torch.load(
        chkpt_name, map_location='cuda' if torch.cuda.is_available() else 'cpu')

data = checkpoint['traindata']
model.load_state_dict(checkpoint['model_state_dict'])  # the checkpoint stores a state_dict, not a model object

6 nn.Conv2d

API: nn.Conv2d

torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, \
dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)

# NOTE: torch.nn.Conv2d is a class, we need to initialize it first before using it
# in_channels (int) – Number of channels in the input image
# out_channels (int) – Number of channels produced by the convolution
# kernel_size (int or tuple) – Size of the convolving kernel; if int, square kernel; if tuple, kernel size is (kernel_size[0], kernel_size[1])
# stride (int or tuple, optional) – Stride of the convolution. Default: 1
# padding (int or tuple, optional) – Zero-padding added to both sides of the input. Default: 0
# dilation (int or tuple, optional) – Spacing between kernel elements. Default: 1
# groups (int, optional) – Number of blocked connections from input channels to output channels. Default: 1
# bias (bool, optional) – If True, adds a learnable bias to the output. Default: True

torch.nn.functional.conv2d(input, weight, bias=None, stride=1, \
padding=0, dilation=1, groups=1)
# NOTE: torch.nn.functional.conv2d is a function, we can use it directly
# input (Tensor) – input tensor of shape (minibatch x in_channels x iH x iW)
# weight (Tensor) – filters of shape (out_channels x in_channels x kH x kW) e.g., conv_layer.weight

torch.nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=1, \
padding=0, output_padding=0, groups=1, bias=True, dilation=1, padding_mode='zeros', device=None, dtype=None)
# Note: stride controls the stride for the cross-correlation.
# in_channels (int) – Number of channels in the input image
# out_channels (int) – Number of channels produced by the convolution
# kernel_size (int or tuple) – Size of the convolving kernel
# stride (int or tuple, optional) – Stride of the convolution. Default: 1
# padding (int or tuple, optional) – dilation * (kernel_size - 1) - padding zero-padding will be added to both sides of each dimension in the input. Default: 0. Set padding = (kernel_size - 1)/2 if you want stride alone to control the output size, e.g. (4, 3, 10, 10) with stride=2 -> (4, 3, 20, 20)
# output_padding (int or tuple, optional) – Additional size added to one side of each dimension in the output shape. Default: 0. Set output_padding = stride - 1 (e.g. 1 when stride=2) so the output size is exactly stride times the input size
# groups (int, optional) – Number of blocked connections from input channels to output channels. Default: 1
# bias (bool, optional) – If True, adds a learnable bias to the output. Default: True
# dilation (int or tuple, optional) – Spacing between kernel elements. Default: 1
# E.g., AE
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 7)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 7),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

CNNs are also known as shift invariant or space invariant artificial neural networks (SIANN), based on their shared-weights architecture and translation invariance characteristics.

Core idea: Weight sharing.

An RGB image: (channels, height, width)

Training data shape: (batch_size, channels, height, width)

Convolution layer (animation source: https://waltyou.github.io/images/posts/CNN-kernel-mv.gif)

One convolution layer may have many kernels, the number of which depends on the number of input channels: $$(n\times 5\times 5)\xrightarrow{\ \text{kernel}(3\times 3)\ }(1\times 3\times 3)$$ where the kernel tensor has shape $(n,3,3)$.

If the output has $m$ channels, we use $m$ filters of shape $(n,3,3)$ and concatenate their outputs along the channel dimension. The full weight tensor therefore has shape $(m,n,3,3)$.

in_channel, out_channel = 5, 10
width, height = 100, 100
kernel_size = 3
batch_size = 1

input = torch.randn(batch_size, in_channel, width, height) # (B, C, H, W)
conv_layer = torch.nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size) 

output = conv_layer(input)

print(input.shape) # torch.Size([1, 5, 100, 100])
print(output.shape) #torch.Size([1, 10, 98, 98])
print(conv_layer.weight.shape) #torch.Size([10, 5, 3, 3])

Padding: pad the input with zeros before the convolution.

If we want the output to have the same width and height as the input, we can pad zeros around the input. For example, with kernel size 3, set padding = 3 // 2 = 1.

input = torch.randn(5,5)
input = input.view(1, 1, 5, 5)

conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1, bias=False)
# kernel shape: (out_channels, in_channels, kH, kW)
kernel = torch.Tensor([1,2,3,4,5,6,7,8,9]).view(1,1,3,3) 
conv_layer.weight.data = kernel.data
output = conv_layer(input)

print(output)

# output
tensor([[[[-19.8253, -27.7990, -34.2662,  -8.9551,  -1.4180],
          [-46.2386, -46.0063, -25.9754,   8.6659,  10.2826],
          [-32.1422, -30.5108,   1.8913,   1.4328,   3.5644],
          [-33.4447, -19.4800,   6.2470,  13.1204,   5.6143],
          [-13.3897,  -5.3116,   1.6651,   6.3115,   2.4066]]]],
       grad_fn=<MkldnnConvolutionBackward>)

Stride: step size of the sliding window

The step the kernel moves by at each position. By default, stride=1.

input = torch.randn(5,5)
input = input.view(1, 1, 5, 5)

conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, bias=False)

kernel = torch.Tensor([1,2,3,4,5,6,7,8,9]).view(1,1,3,3)
conv_layer.weight.data = kernel.data
output = conv_layer(input)

print(output)

# output
tensor([[[[  2.7587, -17.8712],
          [ 16.0750,   2.3064]]]], grad_fn=<MkldnnConvolutionBackward>)

Pooling layer (animation source: https://waltyou.github.io/images/posts/CNN-poolfig.gif)

The pooling layer does not change the number of channels.

input = torch.randn(4,4)
input = input.view(1, 1, 4, 4)

maxpooling_layer = torch.nn.MaxPool2d(kernel_size=2)

output = maxpooling_layer(input)

print(output)
# output 
tensor([[[[1.0808, 1.2647],
          [0.7619, 0.0221]]]])

How to compute the output dimension

  • What is the shape of the conv layer weights? $(\text{out\_channels},\ \text{in\_channels},\ \text{kernel\_size},\ \text{kernel\_size})$
  • What is the output shape after a conv layer? $(\text{batch\_size},\ \text{out\_channels},\ H_{out},\ W_{out})$ with $H_{out}=\lfloor(H_{in}+2\cdot\text{padding}-\text{kernel\_size})/\text{stride}\rfloor+1$ (and likewise for $W_{out}$)
  • What is the output shape after a transposed conv? $H_{out}=(H_{in}-1)\cdot\text{stride}-2\cdot\text{padding}+\text{dilation}\cdot(\text{kernel\_size}-1)+\text{output\_padding}+1$; with dilation=1, set output_padding = stride - 1 and padding = (kernel_size - 1)/2 so that stride alone controls the output size
  • What is the output shape after pooling? $H_{out}=\lfloor(H_{in}-\text{kernel\_size})/\text{stride}\rfloor+1$ (and likewise for $W_{out}$); a quick numerical check of these formulas follows below
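
For example (shapes chosen arbitrarily):

x = torch.randn(1, 3, 32, 32)
conv = torch.nn.Conv2d(3, 8, kernel_size=3, stride=2, padding=1)
print(conv(x).shape)          # floor((32 + 2*1 - 3)/2) + 1 = 16 -> torch.Size([1, 8, 16, 16])

deconv = torch.nn.ConvTranspose2d(8, 3, kernel_size=3, stride=2, padding=1, output_padding=1)
print(deconv(conv(x)).shape)  # (16-1)*2 - 2*1 + (3-1) + 1 + 1 = 32 -> torch.Size([1, 3, 32, 32])

pool = torch.nn.MaxPool2d(kernel_size=2)  # stride defaults to kernel_size
print(pool(x).shape)          # floor((32 - 2)/2) + 1 = 16 -> torch.Size([1, 3, 16, 16])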

Grouped convolution (source: "A guide to convolution arithmetic for deep learning", Dumoulin & Visin)

6 kernels divided into 3 groups, each group containing 2 kernels.
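
A small sketch (the shapes here are illustrative):

conv = torch.nn.Conv2d(6, 6, kernel_size=3, groups=3, padding=1)  # 3 groups of 6/3 = 2 channels each
print(conv.weight.shape)  # torch.Size([6, 2, 3, 3]): each filter only sees 2 input channels
x = torch.randn(1, 6, 8, 8)
print(conv(x).shape)      # torch.Size([1, 6, 8, 8])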

Flatten layer

Flatten a tensor to a vector.

data = data.view(batch_size, -1) # flatten
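
The built-in nn.Flatten module does the same thing and fits inside nn.Sequential; by default it flattens dimensions 1 through -1, keeping the batch dimension:

flatten = torch.nn.Flatten()
print(flatten(torch.randn(8, 3, 4, 4)).shape)  # torch.Size([8, 48])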

Example

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10,20, kernel_size=5)
        self.pooling = nn.MaxPool2d(2)
        self.fc = nn.Linear(320, 10)

    def forward(self, x):
        batch_size = x.size(0)
        x = F.relu(self.pooling(self.conv1(x)))
        x = F.relu(self.pooling(self.conv2(x)))
        x = x.view(batch_size, -1)
        x = self.fc(x)
        return x
      
model = Net()

Some tips for Python programming

typing

In the function greeting, the argument name is expected to be of type str and the return type to be str. Subtypes are accepted as arguments.

# https://docs.python.org/3/library/typing.html
def greeting(name: str) -> str:
    return 'Hello ' + name