PyTorch
1. Tensor
import torch
torch.FloatTensor(2, 3) # uninitialized 2x3 float tensor
torch.IntTensor(2, 3)   # uninitialized 2x3 int tensor
x.type()                # type string of tensor x, e.g., 'torch.FloatTensor'
Generate random data:
torch.rand(2, 3) # torch.FloatTensor 2*3 uniform distribution [0,1]
torch.randn(2, 3) # torch.FloatTensor 2*3 normal N(0,1)
torch.arange(1, 4) # 1,2,3 (torch.range is deprecated; arange excludes the end point)
torch.zeros(2, 3) # torch.FloatTensor 2*3 all 0
Operations:
torch.abs(a)
c = torch.add(a, b)
torch.clamp(a, -0.1, 0.1) # clamp all elements into [-0.1, 0.1]
torch.div(a, b) # element-wise division
torch.mul(a, b) # element-wise multiplication
torch.pow(a, 2)
torch.mm(a, b) # matrix multiplication
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
x = torch.randn(3, 3, dtype=torch.float64, device=device)
x = torch.zeros(1, requires_grad=True)
x = x.to(device) # .to returns a new tensor, so reassign; often used to move data to the GPU
model = MyModule(...).to(device)
labels = labels.to(device=device, dtype=torch.int64)
tensor.data # raw data without autograd tracking (prefer .detach())
cpu_imgs.cuda() # cpu tensor -> gpu tensor
gpu_imgs.cpu() # gpu tensor -> cpu tensor
torch.from_numpy(imgs) # numpy -> cpu tensor
cpu_imgs.numpy() # cpu tensor -> numpy
# a GPU tensor cannot be converted to numpy directly; move it to the CPU first
loss_output.item() # if the tensor is a scalar
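For example, a minimal round trip (assuming a CUDA device is available):
import numpy as np

imgs = np.random.rand(2, 3).astype(np.float32)
t = torch.from_numpy(imgs)   # numpy -> cpu tensor (shares memory with the array)
t_gpu = t.cuda()             # cpu tensor -> gpu tensor
back = t_gpu.cpu().numpy()   # gpu tensor -> cpu tensor -> numpy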
torch.tensor vs torch.Tensor: torch.tensor infers the tensor type from the input data, while torch.Tensor always creates a torch.FloatTensor.
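For example:
a = torch.tensor([1, 2]) # dtype inferred from the data: torch.int64
b = torch.Tensor([1, 2]) # always torch.float32
print(a.dtype, b.dtype)  # torch.int64 torch.float32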
Multiple GPUs:
net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) # device_ids: list of GPU ids
out = net(input)
# Or
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
model = nn.DataParallel(model)
# DataParallel wraps the model in .module, so unwrap it when saving
if len(gpu_ids) > 1:
    torch.save(model.module.state_dict(), "model.pth")
else:
    torch.save(model.state_dict(), "model.pth")
How to reshape the tensor:
Change the shape
import torch
a = torch.rand(4, 1, 28, 28)
print(a.shape)
print(a.view(4 * 1, 28, 28).shape)
print(a.reshape(4 * 1, 28, 28).shape)
print(a.reshape(4, 1 * 28 * 28).shape)
print(a.view(-1, 8).shape)
# output
torch.Size([4, 1, 28, 28])
torch.Size([4, 28, 28])
torch.Size([4, 28, 28])
torch.Size([4, 784])
torch.Size([392, 8])
Add a dimension
a = torch.randn(4, 1, 28, 28)
print(a.shape)
print(a.unsqueeze(0).shape)
print(a.unsqueeze(-1).shape)
print(a.unsqueeze(-5).shape)
# similar to unsqueeze
print(torch.rand(3,4,5)[..., None, None].shape)
# output
torch.Size([4, 1, 28, 28])
torch.Size([1, 4, 1, 28, 28])
torch.Size([4, 1, 28, 28, 1])
torch.Size([1, 4, 1, 28, 28])
torch.Size([3, 4, 5, 1, 1]) # add two dimensions at the end without changing the data order
Delete a dimension:
a = torch.Tensor(1, 4, 1, 9)
print(a.shape)
print(a.squeeze().shape) # delete all the dimensions with size 1
print(a.squeeze(0).shape)
print(a.squeeze(2).shape)
# output
torch.Size([1, 4, 1, 9])
torch.Size([4, 9])
torch.Size([4, 1, 9])
torch.Size([1, 4, 9])
Repeat the dimension:
b = torch.Tensor(1, 32, 1, 1)
print(b.shape)
b = b.repeat(4, 1, 14, 14)
print(b.shape)
# output
torch.Size([1, 32, 1, 1])
torch.Size([4, 32, 14, 14])
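Note that repeat copies the data. For dimensions of size 1, expand returns a broadcast view without copying:
c = torch.rand(1, 32, 1, 1)
print(c.expand(4, 32, 14, 14).shape) # torch.Size([4, 32, 14, 14]), no data copied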
Transpose the dimension:
d = torch.Tensor(6, 3, 1, 2)
print(d.transpose(1, 3).shape) # torch.Size([6, 2, 1, 3])
a = torch.rand(4, 3, 6, 7)
print(a.permute(0, 2, 3, 1).shape) # torch.Size([4, 6, 7, 3])
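transpose and permute return non-contiguous views, so a subsequent view fails; call .contiguous() first, or use reshape, which copies if needed:
b = a.permute(0, 2, 3, 1)               # a is the (4, 3, 6, 7) tensor above
# b.view(4, -1)                         # RuntimeError: view on a non-contiguous tensor
print(b.contiguous().view(4, -1).shape) # torch.Size([4, 126])
print(b.reshape(4, -1).shape)           # torch.Size([4, 126])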
2. How to build a NN
import torch
# initial parameters
batch_n = 100
hidden_unit = 100
input_data = 1000
output_data = 10
epoch_n = 20
learning_rate = 1e-6
# initial weights
x = torch.randn(batch_n,input_data)
y = torch.randn(batch_n,output_data)
w1 = torch.randn(input_data,hidden_unit)
w2 = torch.randn(hidden_unit,output_data)
# Gradient descent: train the model
for epoch in range(epoch_n):
    h1 = x.mm(w1)          # (100,1000) x (1000,100) -> (100,100)
    h1 = h1.clamp(min=0)   # ReLU
    y_pred = h1.mm(w2)     # (100,100) x (100,10) -> (100,10)
    loss = (y_pred - y).pow(2).sum()
    print("Epoch:{} , Loss:{:.4f}".format(epoch, loss.item()))
    # backward pass, derived by hand
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h1.t().mm(grad_y_pred)
    grad_h = grad_y_pred.clone()
    grad_h = grad_h.mm(w2.t())
    grad_h[h1 <= 0] = 0    # ReLU backward: zero the gradient where the activation was clipped
    grad_w1 = x.t().mm(grad_h)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
3. Autograd
import torch
import torch.nn as nn
from torch.autograd import Variable  # Variable is deprecated since PyTorch 0.4; tensors with requires_grad work the same

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
    def forward(self, input, w1, w2):
        x = torch.mm(input, w1)
        x = torch.clamp(x, min=0)
        x = torch.mm(x, w2)
        return x
# initial parameters
batch_n = 100
hidden_layer = 100
input_data = 1000
output_data = 10
x = Variable(torch.randn(batch_n, input_data), requires_grad=False)
y = Variable(torch.randn(batch_n, output_data), requires_grad=False)
w1 = Variable(torch.randn(input_data, hidden_layer), requires_grad=True)
w2 = Variable(torch.randn(hidden_layer, output_data), requires_grad=True)
# set train parameters
epoch_n = 20
learning_rate = 1e-6
model = Model()
for epoch in range(epoch_n):
    y_pred = model(x, w1, w2)
    loss = (y_pred - y).pow(2).sum()
    print("Epoch:{} , Loss:{:.4f}".format(epoch, loss.item()))
    loss.backward()
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    w1.grad.data.zero_()
    w2.grad.data.zero_()
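Since PyTorch 0.4, Variable has been merged into Tensor, so the same loop can be written with plain tensors (a sketch of the modern equivalent, reusing x, y, and model from above):
w1 = torch.randn(input_data, hidden_layer, requires_grad=True)
w2 = torch.randn(hidden_layer, output_data, requires_grad=True)
for epoch in range(epoch_n):
    loss = (model(x, w1, w2) - y).pow(2).sum()
    loss.backward()
    with torch.no_grad():  # parameter updates should not be tracked by autograd
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()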
4. PyTorch Programming
import argparse
import os
import numpy as np
import math
import itertools
import torchvision.transforms as transforms
from torchvision.utils import save_image
from torch.utils.data import DataLoader
from torchvision import datasets
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch
os.makedirs("results/images/", exist_ok=True)
# set the parameters
parser = argparse.ArgumentParser()
parser.add_argument("--n_epochs", type=int, default=200, help="number of epochs of training")
parser.add_argument("--batch_size", type=int, default=64, help="size of the batches")
parser.add_argument("--lr", type=float, default=0.0002, help="adam: learning rate")
parser.add_argument("--b1", type=float, default=0.5, help="adam: decay of first order momentum of gradient")
parser.add_argument("--b2", type=float, default=0.999, help="adam: decay of first order momentum of gradient")
opt = parser.parse_args()
4.1 Coding Flow
Data
- Prepare the training and test data; inputs, outputs; mini-batches
- Tensor type (CPU or GPU)
- Parameters (.yaml or parser)
Model (NN structure)
- Define the model structure $f: X\rightarrow \hat{Y}$
Loss function, optimizer
- cross entropy loss …
- Adam …
Training the model
- Mini-batch training
- Backward
4.2 Data Processing
# mnist for example
# Configure data loader
os.makedirs("../../data/mnist", exist_ok=True)
dataloader = torch.utils.data.DataLoader(
    datasets.MNIST(
        "../../data/mnist",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.Resize(opt.img_size), transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
        ),
    ),
    batch_size=opt.batch_size,
    shuffle=True,
)
cuda = True if torch.cuda.is_available() else False
FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if cuda else torch.LongTensor
Normally, the skeleton is:
dataloader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=32)
for i, data in enumerate(dataloader, 0):
    x, y = data
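The dataset above can be any object implementing the Dataset protocol; a minimal custom dataset (the class name and data here are illustrative) looks like:
from torch.utils.data import Dataset

class MyDataset(Dataset):  # hypothetical example dataset
    def __init__(self, x, y):
        self.x, self.y = x, y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

dataset = MyDataset(torch.randn(256, 2), torch.randn(256, 1))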
4.3 Construct the model
Normally, the model is defined as follows:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
    def forward(self, input):
        return ...
mymodel = Model()
More concretely:
class Net(nn.Module):
    def __init__(self, input_size=2, hidden_size=100, sigma=0.02):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
    def forward(self, input):
        output = F.elu(self.fc1(input))
        output = F.elu(self.fc2(output))
        output = self.fc3(output)
        return output
or
model = torch.nn.Sequential(
    torch.nn.Linear(input_data, hidden_layer),
    torch.nn.ReLU(),
    torch.nn.Linear(hidden_layer, output_data),
)
Move the model to the GPU if available:
model = Net()
if cuda:
    model.cuda()
4.4 Loss function & optimizer
# Loss functions
criterion = torch.nn.MSELoss()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))
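Note that torch.nn.CrossEntropyLoss combines LogSoftmax and NLLLoss, so it takes raw logits and integer class indices:
logits = torch.randn(4, 10)         # raw model outputs, no softmax applied
target = torch.tensor([1, 0, 9, 3]) # class indices, dtype int64
loss = torch.nn.CrossEntropyLoss()(logits, target)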
4.5 Train the model
for epoch in range(opt.n_epochs):
    for i, (imgs, labels) in enumerate(dataloader):
        batch_size = imgs.shape[0]
        # Configure input
        X_data = imgs.type(FloatTensor)
        labels = labels.type(LongTensor)  # CrossEntropyLoss expects integer class indices
        y_pred = model(X_data)
        loss = criterion(y_pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
5 How to save the model
chkpt_name = 'mymodel'+'.pt'
torch.save({
'traindata': traindata,
'MODEL_PATH': model.state_dict()
}, chkpt_name)
# load the model
checkpoint = torch.load(
    chkpt_name, map_location='cuda' if torch.cuda.is_available() else 'cpu')
data = checkpoint['traindata']
model.load_state_dict(checkpoint['MODEL_PATH'])  # the checkpoint stores a state_dict, not the model object
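A fuller checkpoint usually stores the optimizer state and the epoch as well, so training can resume (a sketch; the key names are a common convention, not required):
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.pt')

checkpoint = torch.load('checkpoint.pt')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1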
6 nn.Conv2d
API: nn.Conv2d
torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, \
dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)
# NOTE: torch.nn.Conv2d is a class, we need to initialize it first before using it
# in_channels (int) – Number of channels in the input image
# out_channels (int) – Number of channels produced by the convolution
# kernel_size (int or tuple) – Size of the convolving kernel; if int, square kernel; if tuple, kernel size is (kernel_size[0], kernel_size[1])
# stride (int or tuple, optional) – Stride of the convolution. Default: 1
# padding (int or tuple, optional) – Zero-padding added to both sides of the input. Default: 0
# dilation (int or tuple, optional) – Spacing between kernel elements. Default: 1
# groups (int, optional) – Number of blocked connections from input channels to output channels. Default: 1
# bias (bool, optional) – If True, adds a learnable bias to the output. Default: True
torch.nn.functional.conv2d(input, weight, bias=None, stride=1, \
padding=0, dilation=1, groups=1)
# NOTE: torch.nn.functional.conv2d is a function, we can use it directly
# input (Tensor) – input tensor of shape (minibatch x in_channels x iH x iW)
# weight (Tensor) – filters of shape (out_channels x in_channels x kH x kW) e.g., conv_layer.weight
torch.nn.ConvTranspose2d(in_channels, out_channels, kernel_size, stride=1, \
padding=0, output_padding=0, groups=1, bias=True, dilation=1, padding_mode='zeros', device=None, dtype=None)
# Note: stride controls the stride for the cross-correlation.
# in_channels (int) – Number of channels in the input image
# out_channels (int) – Number of channels produced by the convolution
# kernel_size (int or tuple) – Size of the convolving kernel
# stride (int or tuple, optional) – Stride of the convolution. Default: 1
# padding (int or tuple, optional) – dilation * (kernel_size - 1) - padding zero-padding will be added to both sides of each dimension in the input. Default: 0. Set padding = (kernel_size - 1) / 2 if you want the stride alone to control the output size, e.g., (4, 3, 10, 10) with stride=2 -> (4, 3, 20, 20)
# output_padding (int or tuple, optional) – Additional size added to one side of each dimension in the output shape. Default: 0. Set output_padding = stride - 1 (e.g., 1 when stride=2) so the output size is exactly stride times the input size
# groups (int, optional) – Number of blocked connections from input channels to output channels. Default: 1
# bias (bool, optional) – If True, adds a learnable bias to the output. Default: True
# dilation (int or tuple, optional) – Spacing between kernel elements. Default: 1
# E.g., AE
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(  # like the Composition layer you built
            nn.Conv2d(1, 16, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, 7)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 7),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
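For a 28x28 MNIST image, the shapes can be checked quickly; each stride-2 layer halves the spatial size and the decoder mirrors the encoder:
ae = Autoencoder()
x = torch.randn(1, 1, 28, 28)
print(ae.encoder(x).shape) # torch.Size([1, 64, 1, 1]): 28 -> 14 -> 7 -> 1
print(ae(x).shape)         # torch.Size([1, 1, 28, 28]): reconstruction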
CNNs are also known as shift invariant or space invariant artificial neural networks (SIANN), based on their shared-weights architecture and translation invariance characteristics.
Core idea: Weight sharing.
An image in RGB format: (channels, height, width)
Training data shape: (batch_size, channels, height, width)
Convolution layer source: https://waltyou.github.io/images/posts/CNN-kernel-mv.gif
A single kernel has one slice per input channel, so its channel count is determined by the input: $$(n\times 5\times 5)\rightarrow \text{kernel}(3\times3)\rightarrow(1\times 3\times 3)$$ where the shape of the kernel tensor is $(n,3,3)$.
If the output channel is $m$, we can use $m$ filters with the shape $(n,3,3)$ and concatenate the outputs by the channel dimension. Therefore, the kernel can be viewed as $(m, n, 3, 3)$.
in_channel, out_channel = 5, 10
width, height = 100, 100
kernel_size = 3
batch_size = 1
input = torch.randn(batch_size, in_channel, width, height) # (B, C, H, W)
conv_layer = torch.nn.Conv2d(in_channel, out_channel, kernel_size=kernel_size)
output = conv_layer(input)
print(input.shape) # torch.Size([1, 5, 100, 100])
print(output.shape) #torch.Size([1, 10, 98, 98])
print(conv_layer.weight.shape) #torch.Size([10, 5, 3, 3])
padding: pad the input with zeros before the convolution
If we want the output to have the same width and height as the input, we can pad zeros around the input. For example, if the kernel size is 3, we should set padding = 3 // 2 = 1.
input = torch.randn(5,5)
input = input.view(1, 1, 5, 5)
conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1, bias=False)
# kernel : (out_channel, in_channel, w, h)
kernel = torch.Tensor([1,2,3,4,5,6,7,8,9]).view(1,1,3,3)
conv_layer.weight.data = kernel.data
output = conv_layer(input)
print(output)
# output
tensor([[[[-19.8253, -27.7990, -34.2662, -8.9551, -1.4180],
[-46.2386, -46.0063, -25.9754, 8.6659, 10.2826],
[-32.1422, -30.5108, 1.8913, 1.4328, 3.5644],
[-33.4447, -19.4800, 6.2470, 13.1204, 5.6143],
[-13.3897, -5.3116, 1.6651, 6.3115, 2.4066]]]],
grad_fn=<MkldnnConvolutionBackward>)
Stride: step size
The step by which the kernel moves across the input at each position. By default, stride=1.
input = torch.randn(5,5)
input = input.view(1, 1, 5, 5)
conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, bias=False)
kernel = torch.Tensor([1,2,3,4,5,6,7,8,9]).view(1,1,3,3)
conv_layer.weight.data = kernel.data
output = conv_layer(input)
print(output)
# output
tensor([[[[ 2.7587, -17.8712],
[ 16.0750, 2.3064]]]], grad_fn=<MkldnnConvolutionBackward>)
Pooling layer source: https://waltyou.github.io/images/posts/CNN-poolfig.gif
A pooling layer does not change the number of channels.
input = torch.randn(4,4)
input = input.view(1, 1, 4, 4)
maxpooling_layer = torch.nn.MaxPool2d(kernel_size=2)
output = maxpooling_layer(input)
print(output)
# output
tensor([[[[1.0808, 1.2647],
[0.7619, 0.0221]]]])
How to compute the output dimension
- What is the shape of the conv layer's weight?
  (out_channels, in_channels, kernel_size, kernel_size)
- What is the shape of the output after conv?
  (batch_size, out_channels, floor((width + 2*padding - kernel_size)/stride) + 1, floor((height + 2*padding - kernel_size)/stride) + 1)
  Important: floor((width + 2*padding - kernel_size)/stride) + 1
- What is the shape of the output after a transposed conv?
  H_out = (H_in - 1)*stride - 2*padding + dilation*(kernel_size - 1) + output_padding + 1
  If dilation=1, set output_padding = stride - 1 and padding = (kernel_size - 1)/2 so the output size is exactly stride times the input size.
- What is the shape of the output after pooling?
  (batch_size, channels, floor((width - kernel_size)/stride) + 1, floor((height - kernel_size)/stride) + 1)
  Note: for MaxPool2d, the default stride equals kernel_size.
source: A-guide-to-convolution-arithmetic-for-deep-learning-Dumoulin-Visin
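These formulas are easy to verify against a forward pass:
x = torch.randn(1, 3, 32, 32)
conv = torch.nn.Conv2d(3, 8, kernel_size=3, stride=2, padding=1)
print(conv(x).shape)  # torch.Size([1, 8, 16, 16]): floor((32+2*1-3)/2)+1 = 16
tconv = torch.nn.ConvTranspose2d(8, 3, kernel_size=3, stride=2, padding=1, output_padding=1)
print(tconv(conv(x)).shape)  # torch.Size([1, 3, 32, 32]): (16-1)*2-2+1*(3-1)+1+1 = 32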
groups: with groups=3, 6 kernels are divided into 3 groups of 2 kernels each, and each group convolves only its own slice of the input channels.
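A small sketch of grouped convolution with 6 input channels, 6 output channels, and groups=3:
g = torch.nn.Conv2d(6, 6, kernel_size=3, groups=3, padding=1)
print(g.weight.shape) # torch.Size([6, 2, 3, 3]): each kernel sees 6/3 = 2 input channels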
Flatten layer
Flatten a tensor into a batch of vectors.
data = data.view(batch_size, -1) # flatten; equivalently, use nn.Flatten() inside nn.Sequential
Example
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.pooling = nn.MaxPool2d(2)
        self.fc = nn.Linear(320, 10)  # 320 = 20 * 4 * 4 for 28x28 MNIST inputs
    def forward(self, x):
        batch_size = x.size(0)
        x = F.relu(self.pooling(self.conv1(x)))
        x = F.relu(self.pooling(self.conv2(x)))
        x = x.view(batch_size, -1)    # flatten
        x = self.fc(x)
        return x

model = Net()
Some tips for Python programming
typing
In the function greeting, the argument name is expected to be of type str and the return type is str. Subtypes are accepted as arguments.
# https://docs.python.org/3/library/typing.html
def greeting(name: str) -> str:
    return 'Hello ' + name