PyTorch
Wanho Choi
PyTorch
Open source and free machine learning library based on the Torch library
Developed by FAIR’s (Facebook AI Research) Group
PyTorch vs TensorFlow
•
PyTorch
‣
Define-by-Run (dynamic computation graph)
•
TensorFlow
‣
Define-and-Run (static computation graph)
: peculiar framework
PyTorch vs TensorFlow
Version
import torch
print(torch.__version__) # 1.1.0
Facebook CTO Mike Schroepfer announces the release of PyTorch 1.0 at Facebook developer conference F8 on May 2, 2018 at the McEnery Convention Center in San Jose, California (Image Credit: Facebook)
Tensor
•
Simply
‣
Multi-dimensional array
‣
Generalized matrix
•
Strictly
‣
Matrix: just a collection of numbers inside brackets
‣
Tensors have some transformation properties when changing coordinate system.
•
In PyTorch
‣
Unit of data
Tensor
0D tensor
1D tensor
4D tensor
2D tensor
3D tensor
scalar
vector
matrix
cube
a vector of cubes
Tensor
0D tensor
1D tensor
4D tensor
2D tensor
3D tensor
variable
array
gray scale image
RGB image
Tensor: shape
import torch
a = torch.tensor(3.14)
print(a) # tensor(3.1400)
print(a.shape, a.size()) # torch.Size([]) torch.Size([]) b = torch.tensor([1.414])
print(b) # tensor([1.4140])
print(b.shape, b.size()) # torch.Size([1]) torch.Size([1]) c = torch.tensor([1., 2., 3.])
print(c) # tensor([1., 2., 3.])
print(c.shape, c.size()) # torch.Size([3]) torch.Size([3]) d = torch.tensor([[1, 2], [3, 4], [5, 6]])
print(d) # tensor([[1, 2], [3, 4], [5, 6]])
print(d.shape, d.size()) # torch.Size([3, 2]) torch.Size([3, 2]) e = torch.tensor([[[1, 2, 3], [3, 4, 5]], [[5, 6, 7], [7, 8, 9]]])
print(e) # tensor([[[1, 2, 3], [3, 4, 5]], [[5, 6, 7], [7, 8, 9]]]) print(e.shape, e.size()) # torch.Size([2, 2, 3]) torch.Size([2, 2, 3])
print(e.shape[0], e.shape[1], e.shape[2]) # 2 2 3 print(e.size(0), e.size(1), e.size(2)) # 2 2 3
Tensor: dtype
import numpy as np import torch a = np.array([[1, 2], [3, 4]]) print(a.dtype) # int64 b = np.array([[1., 2.], [3., 4.]]) print(b.dtype) # float64 aa = torch.from_numpy(a) print(aa.dtype) # torch.int64 bb = torch.from_numpy(b) print(bb.dtype) # torch.float64 aa = torch.from_numpy(a).float() print(aa.dtype) # torch.float32 aa = torch.FloatTensor(a) print(aa.dtype) # torch.float32 a = aa.int() print(a.dtype) # torch.int32Tensor: data
import torch
a = torch.tensor([[1, 2], [3, 4]])
print(type(a)) # <class ‘torch.Tensor'> print(a) # tensor([[1, 2], [3, 4]]) print(a[0][0], a[0][1]) # tensor(1) tensor(2)
print(a[1][0], a[1][1]) # tensor(3) tensor(4) print(a[0][0].item()) # 1
print(a[0][1].item()) # 2 print(a[1][0].item()) # 3 print(a[1][1].item()) # 4
b = a.data
print(type(b)) # <class ‘torch.Tensor'> print(b) # tensor([[1, 2], [3, 4]]) print(b[0][0], b[0][1]) # tensor(1) tensor(2)
print(b[1][0], b[1][1]) # tensor(3) tensor(4) print(b[0][0].item()) # 1
print(b[0][1].item()) # 2 print(b[1][0].item()) # 3 print(b[1][1].item()) # 4
Interoperability with NumPy
import numpy as np import torch
a = [[1, 2], [3, 4]]
print(type(a)) # <class 'list'> b = np.array(a)
print(type(b)) # <class 'numpy.ndarray'> c = torch.tensor(b)
print(type(c)) # <class 'torch.Tensor'> c = torch.from_numpy(b)
print(type(c)) # <class 'torch.Tensor'> c = torch.as_tensor(b)
print(type(c)) # <class 'torch.Tensor'> d = c.numpy()
print(type(d)) # <class 'numpy.ndarray'>
NumPy
Array
PyTorch
Tensor
b = torch.
from_numpy
(a)
a = b.
numpy
()
b = torch.
tensor
(a)
Copy vs Reference
•
PyTorch tensors share the memory buffer of NumPy ndarrays.
•
Thus, changing one will affect the other.
•
torch.tensor() allocates a new memory by copying the ndarray.
•
torch.Tensor() is an alias for the default tensor type, e.g. torch.FloatTensor().
•
If you want to avoid a copy, use torch.from_numpy() or torch.as_tensor().
Copy vs Reference
import numpy as np import torch a = np.array([[1.]]) b = torch.tensor(a) # copy c = torch.FloatTensor(a) # copy d = torch.from_numpy(a) # reference e = torch.as_tensor(a) # reference print(b.item()) # 1.0 print(c.item()) # 1.0 print(d.item()) # 1.0 print(e.item()) # 1.0 a[0][0] = 2 print(b.item()) # 1.0 print(c.item()) # 1.0 print(d.item()) # 2.0 print(e.item()) # 2.0 a[0][0] = 0 b[0][0] = 111 print(a[0][0]) # 0.0 c[0][0] = 222 print(a[0][0]) # 0.0 d[0][0] = 333 print(a[0][0]) # 333 e[0][0] = 444 print(a[0][0]) # 444a = np.array([[12345.]]) # a new object print(b.item()) # 111.0 print(c.item()) # 222.0 print(d.item()) # 333.0 print(e.item()) # 444.0
2/2
1/2
Tensors on GPU
import numpy as np import torch a = np.array([1, 2, 3]) c = torch.from_numpy(a).float() print(c) # tensor([1., 2., 3.]) c = torch.from_numpy(a).float().to(‘cpu’) print(c) # tensor([1., 2., 3.]) g = torch.from_numpy(a).float().to(‘cuda’)print(g) # tensor([1., 2., 3.], device=‘cuda:0’)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' g = torch.from_numpy(a).float().to(DEVICE)
print(g) # tensor([1., 2., 3.], device='cuda:0') if torch.cuda.is_available():
print(torch.cuda.device_count()) # 1: the number of GPUs available
print(torch.cuda.current_device()) # 0: the index of a currently selected device
Random Tensor
import torch a = torch.randn(1) # 1 by 1 tensor b = torch.randn(1) # 1 by 1 tensor print(a, b) c = torch.randn(2, 3) # 2 by 3 tensor print(c)# above this you will get different results for each run torch.manual_seed(123)
# below this will always produce the same results a = torch.randn(1) # 1 by 1 tensor b = torch.randn(1) # 1 by 1 tensor print(a, b) c = torch.randn(2, 3) # 2 by 3 tensor print(c)
Uniform Distribution
frequency
torch.rand()
68%
95%
Normal Distribution
frequency
torch.randn()
Matrix Multiplication
[
1 2
3 4
5 6] [
1 2 3
4 5 6] =
9 12 15
19 26 33
29 40 51
3 × 3
3 × 2
2 × 3
import torch a = torch.tensor([[1, 2], [3, 4], [5, 6]]) print(a.shape) # torch.Size([3, 2]) b = torch.tensor([[1, 2, 3], [4, 5, 6]]) print(b.shape) # torch.Size([2, 3]) c = torch.mm(a, b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = a.mm(b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = torch.matmul(a, b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = a @ b print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]])Autograd: Automatic Differentiation
An engine for computing gradients (or Jacobians).
Autograd automatically calculates the gradients (or Jacobians) by applying chain rules,
tracing from the root to the leaves of the graph.
If you set a Tensor’s attribute .requires_grad as True, it starts to track all operations on it.
When you finish your computation you can call backward() and have all the gradients (or
Jacobians) computed automatically.
The result will be accumulated into .grad attribute.
.requires_grad
Every Tensor has
a flag: .requires_grad.
Default value: False
If
False, the tensor will be excluded from the gradient computation.
Tensors that must be
included in the gradient computation must explicitly set this flag to True.
input.requires_grad = False ➜ output.requires_grad = False (automatically)
input.requires_grad = True ➜ output.requires_grad = True (automatically)
This flag can also be set when the tensor is first created, and
can be changed later.
.requires_grad
import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' a = torch.rand(1, requires_grad=True).to(DEVICE)
b = torch.rand(1, requires_grad=True).to(DEVICE)
print(a.requires_grad, b.requires_grad) # True True a = torch.rand(1, dtype=torch.float).to(DEVICE)
b = torch.rand(1, dtype=torch.float).to(DEVICE)
print(a.requires_grad, b.requires_grad) # False False a.requires_grad_()
b.requires_grad_()
.requires_grad
import torch
x = torch.tensor([1.0]) # requires_grad=False by default y = torch.tensor([2.0]) # requires_grad=False by default z = torch.tensor([3.0], requires_grad=True) a = x + y print(a.requires_grad) # False b = a + z print(b.requires_grad) # True a.requires_grad_(True) print(a.requires_grad) # True
1
x
2
y
3
z
3
a
6
b
False
True
Autograd
import torch
x = torch.tensor(2.0, requires_grad=True)
print(x) # tensor(2., requires_grad=True) y = 3*x*x + 4*x + 5
print(y) # tensor(25., grad_fn=<AddBackward0>) print(x.grad) # None
y.backward() # compute gradients print(x.grad) # tensor(16.)
x = 2.0
16
x = 2
y = 3x
2
+ 4x + 5
∂y
∂x
= 6x + 4
x = 2
25
Autograd
import torch x = torch.ones(2, 2, requires_grad=True) y = x + 3 z = y*y + 1 f = z.mean()print(f) # tensor(17., grad_fn=<AddBackward0>) f.backward() # gradient computation
print(x.grad) # tensor([[2., 2.], [2., 2.]])
[
1 1
1 1]
[
4 4
4 4]
[
17 17
17 17]
17
y = x + 3
z = y
2+ 1
f = 1
4
∑
4 i=1z
i x y z f∂f
∂x
i
= ∂f
∂z
i
∂z
i
∂y
i
∂y
i
∂x
i
= 1
4
× 2y × 1 = 1
2
y = 1
2
(x + 3)
when
x = 1
∂f
∂x
i
= 2
Autograd
import torch x = torch.tensor(3.0) w = torch.tensor(4.0, requires_grad=True) b = torch.tensor(5.0, requires_grad=True) print(x.item()) # 3.0 print(w.item()) # 4.0 print(b.item()) # 5.0 y = w * x + b print(y.item()) # 17.0y.backward() # gradient computation print(w.grad.item()) # 3.0 print(b.grad.item()) # 1.0
y = wx + b
∂y
∂w
= x
∂y
∂b
= 1
17 = 4 × 3 + 5
no_grad()
import torch x = torch.randn(3, requires_grad=True) print(x.requires_grad) # True y = x + 1 print(y.requires_grad) # True# not to calculate the gradient for the variable z
with torch.no_grad():
z = x + 1
Tensor vs Variable
Variables are wrappers for Tensors.
Variable = Tensor + grad (gradient computation support)
Variables are the part of the autograd package.
The Variable API has been deprecated:
Variables are no longer necessary to use autograd with Tensors.
Autograd automatically supports Tensors with .requires_grad to True.
In-Place Operations
In PyTorch, many methods exist in two versions:
: with / without an underscore(_) suffix
ex) add(…), add_(…)
The underscore(_) indicates in-place operations in PyTorch.
Methods that end in an underscore(_) change the tensor in-place.
In-place operations can save memory, but they can cause problems with autograd and even
worse performance in PyTorch.
It is therefore recommended to avoid in-place operations in most cases.
In-Place Operations
In-Place Operations
import torch
a = torch.tensor([1, 2, 3])
print(id(a)) # THE RESULT: (1) a += 10
print(id(a)) # THE RESULT: (2) a = a + 10
print(id(a)) # THE RESULT: (3)
# (1) and (2) are same, but (3) different from them. # (2): in-place operator (same object)
# (3): a new object was created a = torch.tensor([1, 2, 3]) b = a + 10 print(b) # tensor([11, 12, 13]) print(a is b) # False b = a.add(10) print(b) # tensor([11, 12, 13]) print(a is b) # False b = a.add_(10) print(b) # tensor([11, 12, 13]) print(a is b) # True
Formulation
y
i= wx
i+ b (i = 1,2,3,⋯, N)
Model:
Error:
L
i= 1
2
( ̂y
i− y
i)
2 known predicted= ̂y
i− (wx
i+ b) = ̂y
i− wx
i− b
∂E
i∂w
= − x
i∂E
i∂b
= − 1
Loss:
E
i= ̂y
i− y
i= 1
2
E
i2∂L
i∂w
=
∂L
i∂E
i∂E
i∂w
= E
i⋅ (−x
i) = − x
iE
i∂L
i∂b
=
∂L
i∂E
i∂E
i∂b
= E
i⋅ (−1) = − E
iC = 1
N
N∑
i=1L
iCost:
∂C
∂w
= 1
N
N∑
i=1∂L
i∂w
= 1
N
N∑
i=1(−x
iE
i)
∂C
∂b
= 1
N
N∑
i=1∂L
i∂b
= 1
N
N∑
i=1(−E
i)
Pure NumPy Implementation
import numpy as np # y = w*x+b: w=2, b=1 x = np.array([1, 2, 3, 4, 5], dtype=‘float32') y = np.array([3, 5, 7, 9, 11], dtype=‘float32') w = np.random.randn(1) # weight b = np.random.randn(1) # biasfor epoch in range(10000): # iteration # prediction
y_predicted = w * x + b error = y - y_predicted
# gradient computation (manually) w_grad = (-x * error).mean() b_grad = (-error).mean() # update w -= 0.01 * w_grad b -= 0.01 * b_grad print(w, b) # [2.00000001] [0.99999998]
∂C
∂w
=
N∑
i=1(−x
iE
i)
∂C
∂b
=
N∑
i=1(−E
i)
known predictedE
i= ̂y
i− y
iPure NumPy Implementation
import numpy as np # y = w*x+b: w=2, b=1 x = np.array([1, 2, 3, 4, 5], dtype=‘float32') y = np.array([3, 5, 7, 9, 11], dtype=‘float32') w = np.random.randn(1) # weight b = np.random.randn(1) # biasfor epoch in range(10000): # iteration # prediction
y_predicted = w * x + b error = y - y_predicted
# gradient computation (manually) w_grad = (-x * error).mean() b_grad = (-error).mean() # update w -= 0.01 * w_grad b -= 0.01 * b_grad print(w, b) # [2.00000001] [0.99999998]
w
n+1= w
n− α ∂E
∂w
b
n+1= b
n− α ∂E
∂b
How does it work?
•
Exactly the same process as solving Ax = b with the steepest descent method
y
i= ax
i+ b (i = 1,2,3,⋯, N)
Model:
E =
∑
N i=1(y
i− ax
i− b)
2argmin
a,bE
∂E
∂a
= 2
N∑
i=1(y
i− ax
i− b) ⋅ (−x
i) = 0
∂E
∂b
= 2
N∑
i=1(y
i− ax
i− b) ⋅ (−1) = 0
a
(
N∑
i=1x
i2)
+ b
(
N∑
i=1x
i)
=
(
N∑
i=1x
iy
i)
a
(
N∑
i=1x
i)
+ bN =
(
N∑
i=1y
i)
∑
Ni=1x
i2∑
Ni=1x
i∑
Ni=1x
iN
[
a
b]
=
∑
N i=1x
iy
i∑
Ni=1y
iAx
= b
f(x) = 1
2
x
T
Ax − b
T
x + c
∂f
∂x
= 0
Ax = b
a
b
(2,1)
energy contours steepest directioninitial random guess
LinearRegressionCommon.py
import numpy as np import torch
# how many points? N = 1000
# the ground-truth values W = 2.0 # weight
B = 1.0 # bias
def Data():
np.random.seed(13) # random seed
x = np.random.rand(N, 1) # input: 1D array
noise = 0.1 * np.random.rand(N, 1) # noise: 1D array
y = (W * x + B) + noise # output that has some noise
indices = np.arange(N) # point indices
np.random.shuffle(indices) # shuffled indices
train_indices = indices[:round(0.8*N)] # the first 80% of the shuffled indices for the train set
valid_indices = indices[round(0.8*N):] # the remaining indices for the validation set x_train, y_train = x[train_indices], y[train_indices]
x_valid, y_valid = x[valid_indices], y[valid_indices] return x_train, y_train, x_valid, y_valid
Example #1
from LinearRegressionCommon import *
w = np.random.randn(1) # weight
b = np.random.randn(1) # bias
x_train, y_train, _, _ = Data()
for epoch in range(10000): # iteration # prediction
y_predicted = w * x_train + b error = y_train - y_predicted
# gradient computation (manually) w_grad = (-x_train * error).mean() b_grad = (-error).mean()
# update
w -= 0.01 * w_grad b -= 0.01 * b_grad
Example #2
from LinearRegressionCommon import *
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)
x_train, y_train, _, _ = Data()
x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)
for epoch in range(10000):
y_predicted = w * x_train + b error = y_train - y_predicted cost = (error*error).mean()
cost.backward() # gradient computation (automatically)
with torch.no_grad(): w -= 0.01 * w.grad b -= 0.01 * b.grad w.grad.zero_() b.grad.zero_() print(w.item(), b.item()) # 2.003204107284546 1.048282265663147
Example #3
from LinearRegressionCommon import *
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) x_train, y_train, _, _ = Data()
x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)
optimizer = torch.optim.SGD([w, b], lr=0.01)
for epoch in range(10000):
y_predicted = w * x_train + b error = y_train - y_predicted cost = (error*error).mean()
cost.backward() optimizer.step()
optimizer.zero_grad()
Example #4
from LinearRegressionCommon import *
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) x_train, y_train, _, _ = Data()
x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)
CostFunc = torch.nn.MSELoss()
optimizer = torch.optim.SGD([w, b], lr=0.01) for epoch in range(10000):
y_predicted = w * x_train + b
cost = CostFunc(y_train, y_predicted) cost.backward()
optimizer.step()
optimizer.zero_grad()
Example #5
from LinearRegressionCommon import *
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
class Model(torch.nn.Module): def __init__(self):
super().__init__()
self.w = torch.nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)) self.b = torch.nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)) def forward(self, x):
return self.w * x + self.b
x_train, y_train, _, _ = Data()
x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE) model = Model().to(DEVICE) CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
1/2
Example #5
for epoch in range(10000): model.train()
y_predicted = model(x_train)
cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() print(model.state_dict()) print(model.w, model.b) print(model.w.item(), model.b.item()) # 2.003204107284546 1.048282265663147
2/2
Example #6
from LinearRegressionCommon import *
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' class Model(torch.nn.Module): def __init__(self): super().__init__() self.layer = torch.nn.Linear(1, 1) def forward(self, x): return self.layer(x)
x_train, y_train, _, _ = Data()
x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE) model = Model().to(DEVICE) CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
1/2
Example #6
for epoch in range(10000): model.train()
y_predicted = model(x_train)
cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() print(model.state_dict()) print(model.layer.weight, model.layer.bias) print(model.layer.weight.item(), model.layer.bias.item()) # 2.003204107284546 1.048282265663147
2/2
Practical Example
https://medium.com/dsnet/linear-regression-with-pytorch-3dde91d60b50Input
Layer
Hidden
Layer
Output
Layer
import numpy as np import torch
class Model(torch.nn.Module): def __init__(self):
super().__init__()
self.layer = torch.nn.Linear(3, 2) # 3: inputs, 2: outputs def forward(self, x):
return self.layer(x)
# input data: (temperature, rainfall, humidity)
x_train = np.array([[73, 67, 43], [91, 88, 64], [87, 134, 58], [102, 43, 37], [69, 96, 70]])
# output data: (apples, oranges)
y_train = np.array([[56, 70], [81, 101], [119, 133], [22, 37], [103, 119]]) x_train = torch.from_numpy(x_train).float() y_train = torch.from_numpy(y_train).float() model = Model() CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
1/2
Practical Example
for epoch in range(10000): model.train()
y_predicted = model(x_train)
cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() # test print(model(x_train)) # prediction x_test = np.array([[80, 70, 50]]) x_test = torch.from_numpy(x_test).float(); print(model(x_test))
2/2
Practical Example
XOR Problem
https://mc.ai/intro-to-deep-learning-with-pytorch-part-1/
XOR Problem
•
Minsky and Papert proved that a single-layer perceptron cannot solve the XOR problem, because XOR is not linearly separable.
torch.nn.Sequential
•
A container that contains other modules
•
It concatenates a series of modules.
import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1 model = torch.nn.Sequential( torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), torch.nn.ReLU(), torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM),
torch.nn.Sigmoid()) # MUST for non-linearity
CostFunc = torch.nn.BCELoss() # Binary Cross Entropy Loss optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(10000): Y_predicted = model(X) cost = CostFunc(Y_predicted, Y) cost.backward() optimizer.step() model.zero_grad() Y_predicted = model(X) print(np.squeeze(Y_predicted.detach().numpy())) # [0.01351878 0.98831743 0.9887106 0.01278798] print(np.squeeze((Y_predicted+0.5).int().detach().numpy())) # [0 1 1 0]
import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1
linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) actfnc1 = torch.nn.ReLU()
linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM)
actfnc2 = torch.nn.Sigmoid() # MUST for non-linearity
model = torch.nn.Sequential(linear1, actfnc1, linear2, actfnc2)
CostFunc = torch.nn.BCELoss() # Binary Cross Entropy Loss optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
for epoch in range(10000): Y_predicted = model(X) cost = CostFunc(Y_predicted, Y) cost.backward() optimizer.step() model.zero_grad() Y_predicted = model(X) print(np.squeeze(Y_predicted.detach().numpy())) # [0.02886132 0.9477684 0.9471025 0.07047193] print(np.squeeze((Y_predicted+0.5).int().detach().numpy())) # [0 1 1 0]
class Model(torch.nn.Module)
•
It contains two main methods.
•
The first method(__init__) defines layers components of the network.
•
In the second method(forward) we wire the network and put every component in the desired
order.
import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1 class Model(torch.nn.Module):
def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()
self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfnc1 = torch.nn.ReLU()
self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) self.actfnc2 = torch.nn.Sigmoid()
def forward(self, x):
x = self.actfnc1( self.linear1(x) ) x = self.actfnc2( self.linear2(x) ) return x
model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)
CostFunc = torch.nn.BCELoss() # Binary Cross Entropy Loss optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
. . .
class Model(torch.nn.Module):
def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()
self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def forward(self, x):
x = torch.relu ( self.linear1(x) ) x = torch.sigmoid( self.linear2(x) ) return x
class Model(torch.nn.Module):
def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()
self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfnc1 = torch.nn.ReLU()
self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) self.actfnc2 = torch.nn.Sigmoid() def forward(self, x): x = self.actfnc1( self.linear1(x) ) x = self.actfnc2( self.linear2(x) ) return x
Method #1
Method #2
class Model(torch.nn.Module):
def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()
self.layer1 = torch.nn.Sequential( torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), torch.nn.ReLU() ) self.layer2 = torch.nn.Sequential( torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), torch.nn.Sigmoid() ) def forward(self, x):
x = self.layer1(x) x = self.layer2(x) return x
class Model(torch.nn.Module):
def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()
self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def layer1(self, x):
return torch.relu( self.linear1(x) ) def layer2(self, x):
return torch.sigmoid( self.linear2(x) ) def forward(self, x): x = self.layer1(x) x = self.layer2(x) return x
Method #3
Method #4
MNIST
torchvision
•
The torchvision package consists of popular datasets, model architectures, and common
image transformations for computer vision.
‣
torchvision.datasets: MNIST, Fashion-MNIST, CIFAR, etc.
‣
torchvision.io: video
‣
torchvision.models: classification, object detection, etc.
torchvision.transform.Compose
•
It creates a series of transformations.
•
It composes several transformations together.
•
All the transformations in the Compose are applied to the input data one by one.
import torchvisionnormalize = torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
transformations = torchvision.transforms.Compose( [torchvision.transforms.RandomHorizontalFlip(), torchvision.transforms.RandomVerticalFlip(),
torchvision.transforms.ToTensor(), normalize] )
Image Normalization
•
Normalization reduces the skewness, and helps to learn faster and better.
•
If the given dataset is already in range [0.0, 1.0], you can skip the normalization.
•
image = ( image - mean ) / std
•
(mean, std) = (0.5, 0.5) ➜ image = ( image − 0.5 ) / 0.5
: (0.0, 1.0) range ➜ (−1.0, +1.0) range
•
torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
: Each 0.5 for each channel (Red, Green, Blue)
import numpy as np
import torch, torchvision
pixels = np.random.randint(low=0, high=256, size=(5, 5, 3)) # 5x5 RGB image
print(type(pixels)) # <class ‘numpy.ndarray'> print(np.min(pixels), ' ~ ', np.max(pixels)) # 0 ~ 255
pixels = pixels.astype('float32') / 255 # normalization: [0, 255] to [0.0, 1.0] print(type(pixels)) # <class ‘numpy.ndarray'>
print(np.min(pixels), ' ~ ', np.max(pixels)) # 0.0 ~ 1.0
image = torch.from_numpy(pixels) # to a tensor
print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # 0.0 ~ 1.0
# transforms = torchvision.transforms.ToTensor()
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
image = transforms(pixels) # apply transformations print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # 0.0 ~ 1.0
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) image = transforms(pixels) # apply transformations
print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # -1.0 ~ 1.0
image = torchvision.transforms.functional.normalize(image, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
MNIST
import numpy as np
import torch, torchvision
from matplotlib import pyplot as plt
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()]) bs = 64 # batch size
train_dataset = torchvision.datasets.MNIST(root='./data/', train=True, transform=transforms, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data/', train=False, transform=transforms)
print(train_dataset.data.shape) # torch.Size([60000, 28, 28]) print(len(train_dataset), len(test_dataset)) # 60000 10000
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=bs, shuffle=True)
print(train_dataloader.dataset.data.shape) # torch.Size([60000, 28, 28])
print(len(train_dataloader), len(test_dataloader)) # 938=round(60000/bs), 157=round(10000/bs) print(len(train_dataloader.dataset), len(test_dataloader.dataset)) # 60000 10000
MNIST
for batch_index, (images, labels) in enumerate(train_dataloader): print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 1, 28, 28]) print(images[0].shape) # torch.Size([1, 28, 28])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 28, 28, 1])
print(images[0].shape) # torch.Size([28, 28, 1])
plt.imshow(images[0].reshape((28, 28)), cmap=‘gray') plt.show()
break
for batch_index in range(len(train_dataset)): itr = iter(train_dataloader)
images, labels = next(itr)
print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 1, 28, 28]) print(images[0].shape) # torch.Size([1, 28, 28])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 28, 28, 1]) print(images[0].shape) # torch.Size([28, 28, 1]) plt.imshow(images[0].reshape((28, 28)), cmap=‘gray') plt.show() break
2/2
CIFAR-10
import numpy as np
import torch, torchvision
from matplotlib import pyplot as plt
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()]) bs = 64 # batch size
train_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=True, transform=transforms, download=True)
test_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=False, transform=transforms)
print(train_dataset.data.shape) # (50000, 32, 32, 3)
print(len(train_dataset), len(test_dataset)) # 50000 10000
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=bs, shuffle=True)
print(train_dataloader.dataset.data.shape) # (50000, 32, 32, 3)
print(len(train_dataloader), len(test_dataloader)) # 782=round(50000/bs), 157=round(10000/bs) print(len(train_dataloader.dataset), len(test_dataloader.dataset)) # 50000 10000
CIFAR-10
for batch_index, (images, labels) in enumerate(train_dataloader): print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 3, 32, 32]) print(images[0].shape) # torch.Size([3, 32, 32])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3])
print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3)))
plt.show() break
for batch_index in range(len(train_dataset)): itr = iter(train_dataloader)
images, labels = next(itr)
print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 3, 32, 32]) print(images[0].shape) # torch.Size([3, 32, 32])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3]) print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3))) plt.show() break
2/2
CIFAR-100
import numpy as np
import torch, torchvision
from matplotlib import pyplot as plt
transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()]) bs = 64 # batch size
train_dataset = torchvision.datasets.CIFAR100(root='./data/CIFAR100', train=True, transform=transforms, download=True)
test_dataset = torchvision.datasets.CIFAR100(root='./data/CIFAR100', train=False, transform=transforms)
print(train_dataset.data.shape) # (50000, 32, 32, 3)
print(len(train_dataset), len(test_dataset)) # 50000 10000
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=bs, shuffle=True)
print(train_dataloader.dataset.data.shape) # (50000, 32, 32, 3)
print(len(train_dataloader), len(test_dataloader)) # 782=round(50000/bs), 157=round(10000/bs) print(len(train_dataloader.dataset), len(test_dataloader.dataset)) # 50000 10000
CIFAR-100
for batch_index, (images, labels) in enumerate(train_dataloader): print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 3, 32, 32]) print(images[0].shape) # torch.Size([3, 32, 32])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3])
print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3)))
plt.show() break
for batch_index in range(len(train_dataset)): itr = iter(train_dataloader)
images, labels = next(itr)
print(labels.shape) # torch.Size([64])
print(images.shape) # torch.Size([64, 3, 32, 32]) print(images[0].shape) # torch.Size([3, 32, 32])
images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3]) print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3))) plt.show() break
2/2
Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1.3683
3 × 10 tensor
-3.7282 -0.7150 -0.0891 -0.1889 -0.7913 -0.8726 -0.1097 1.9349 0.1923 -0.4194 -1.9478 0.7762 0.9239 0.3441 0.0412 0.5557 0.9953 2.2492 -0.4234 1.8282 0.1751 1.1866 0.7951 -0.8284 0.1223 -1.005 -1.3221 1.8195 -0.6615Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1.3683
3 × 10 tensor
-3.7282 -0.7150 -0.0891 -0.1889 -0.7913 -0.8726 -0.1097 1.9349 0.1923 -0.4194 -1.9478 0.7762 0.9239 0.3441 0.0412 0.5557 0.9953 2.2492 -0.4234 1.8282 0.1751 1.1866 0.7951 -0.8284 0.1223 -1.005 -1.3221 1.8195 -0.6615 1.8282 2values
indices
0.1751 2 1.1866 2 0.9239 1 0.3441 1 0.1223 2 0.5557 1 0.9953 1 2.2492 1 0.1923 0max.
Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1.3683
3 × 10 tensor
-3.7282 -0.7150 -0.0891 -0.1889 -0.7913 -0.8726 -0.1097 1.9349 0.1923 -0.4194 -1.9478 0.7762 0.9239 0.3441 0.0412 0.5557 0.9953 2.2492 -0.4234 1.8282 0.1751 1.1866 0.7951 -0.8284 0.1223 -1.005 -1.3221 1.8195 -0.6615 1.9349 8values
indices
2.2492 8 1.8282 0max.
Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1
a
b
0 1 1 1 0 1 1 1 0 1Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1
a
b
0 1 1 1 0 1 1 1 0 1 1c
c.sum().item()
➜
3
0 1 0 1 0Accuracy Evaluation
import torch
data = torch.randn(3, 10) print(data.numpy())
values, indices = torch.max(data.data, 0) print(values.numpy(), indices.numpy())
values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())
accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1
a
b
0 1 1 1 0 1 1 1 0 1 1c
c.sum().item()
➜
3
accuracy
=
50.0 %
0 1 0 1 0MNIST Classification
import torch, torchvision

# Hyper-parameters.
# NOTE: the device string uses plain ASCII quotes; the original slide's
# curly quote (‘cpu') is a SyntaxError in real source.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
INPUT_DIM = 784        # = 28 x 28, flattened MNIST image
HIDDEN_DIM = 100
OUTPUT_DIM = 10        # the number of classes
TOTAL_EPOCHS = 10
LEARNING_RATE = 0.01
BATCH_SIZE = 2000

transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
train_dataset = torchvision.datasets.MNIST(root='./data/', train=True, transform=transforms, download=True)
test_dataset = torchvision.datasets.MNIST(root='./data/', train=False, transform=transforms)
# DataLoader's keyword is `batch_size` (lowercase); `BATCH_SIZE=` would raise TypeError.
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)
MNIST Classification
class Model(torch.nn.Module):
    """Two-layer fully-connected classifier: input -> ReLU hidden -> logits."""

    def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM):
        super(Model, self).__init__()
        # Hidden layer and its activation.
        self.layer1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM)
        self.actfn1 = torch.nn.ReLU()
        # Output layer: raw logits, one per class (no softmax here —
        # the file pairs this model with CrossEntropyLoss, which
        # applies log-softmax internally).
        self.layer2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM)

    def forward(self, x):
        hidden = self.actfn1(self.layer1(x))
        return self.layer2(hidden)
# Instantiate the network and move its parameters to the selected device.
model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)
# Cross-entropy loss: expects raw logits plus integer class labels.
CostFunc = torch.nn.CrossEntropyLoss()
# Adam optimizer over all trainable parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
MNIST Classification
# Training loop.
for epoch in range(TOTAL_EPOCHS):
    for images, labels in train_dataloader:
        images = images.reshape(-1, 784).to(DEVICE)  # flatten 28x28 -> 784
        labels = labels.to(DEVICE)
        output = model(images)                # forward pass: logits
        cost = CostFunc(output, labels)
        cost.backward()                       # accumulate gradients
        optimizer.step()                      # apply the update
        optimizer.zero_grad()                 # clear grads for the next batch
    # Report the last batch's cost once per epoch.
    # NOTE: closed with an ASCII quote — the slide's curly quote ('...’)
    # would be a SyntaxError in real source.
    print('Cost: {:.4f}'.format(cost.item()))
# For the test, you don't need to do the gradient computation.
with torch.no_grad():
    correct = 0
    for images, labels in test_dataloader:
        images = images.reshape(-1, 784).to(DEVICE)  # flattening
        labels = labels.to(DEVICE)
        output = model(images)
        # Index of the maximum logit along dim 1 is the predicted class.
        _, predicted = torch.max(output.data, 1)
        correct += (predicted == labels).sum().item()
    # Derive the denominator from the dataset instead of hard-coding 10000,
    # so the code stays correct for any test-set size.
    print('Accuracy: {} %'.format(100 * correct / len(test_dataloader.dataset)))  # Accuracy: 97.36 %
CIFAR-10 Classification
import torch, torchvision

# Hyper-parameters.
# NOTE: plain ASCII quotes throughout — the slide's curly quotes
# (‘cpu', ‘./data/CIFAR10') are SyntaxErrors in real source.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
INPUT_DIM = 3072       # = 32 x 32 x 3, flattened CIFAR-10 image
HIDDEN_DIM = 100
OUTPUT_DIM = 10        # the number of classes
TOTAL_EPOCHS = 10
LEARNING_RATE = 0.01
BATCH_SIZE = 2000

transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
train_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=True, transform=transforms, download=True)
test_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=False, transform=transforms)
# DataLoader's keyword is `batch_size` (lowercase); `BATCH_SIZE=` would raise TypeError.
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)
CIFAR-10 Classification
class Model(torch.nn.Module):
    """Fully-connected classifier with one ReLU hidden layer.

    Maps a flattened image to OUTPUT_DIM raw class logits.
    """

    def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM):
        super(Model, self).__init__()
        self.layer1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM)  # hidden layer
        self.actfn1 = torch.nn.ReLU()                         # non-linearity
        self.layer2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) # logits

    def forward(self, x):
        activated = self.actfn1(self.layer1(x))
        logits = self.layer2(activated)
        return logits
# Instantiate the network and move its parameters to the selected device.
model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)
# Cross-entropy loss: expects raw logits plus integer class labels.
CostFunc = torch.nn.CrossEntropyLoss()
# Adam optimizer over all trainable parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
CIFAR-10 Classification
# Training loop.
for epoch in range(TOTAL_EPOCHS):
    for images, labels in train_dataloader:
        images = images.reshape(-1, 3072).to(DEVICE)  # flatten 32x32x3 -> 3072
        labels = labels.to(DEVICE)
        output = model(images)                # forward pass: logits
        cost = CostFunc(output, labels)
        cost.backward()                       # accumulate gradients
        optimizer.step()                      # apply the update
        optimizer.zero_grad()                 # clear grads for the next batch
    # Report the last batch's cost once per epoch.
    # NOTE: closed with an ASCII quote — the slide's curly quote ('...’)
    # would be a SyntaxError in real source.
    print('Cost: {:.4f}'.format(cost.item()))
# For the test, you don't need to do the gradient computation.
with torch.no_grad():
    correct = 0
    for images, labels in test_dataloader:
        images = images.reshape(-1, 3072).to(DEVICE)  # flattening
        labels = labels.to(DEVICE)
        output = model(images)
        # Index of the maximum logit along dim 1 is the predicted class.
        _, predicted = torch.max(output.data, 1)
        correct += (predicted == labels).sum().item()
    # Derive the denominator from the dataset instead of hard-coding 10000.
    print('Accuracy: {} %'.format(100 * correct / len(test_dataloader.dataset)))  # Accuracy: 21.15 %
Problems
•
Slow convergence
•
Too low accuracy
•
Flattened input data
•
In this process, the spatial information is lost.
CNN: Convolutional Neural Network
input image
feature maps
(n channels)
feature maps
(m channels)
activation maps
(m channels)
layer softmax
FC
dog
cat
bird
!
convolution
with n kernels
pooling
convolution
with m kernels
pooling
activation maps
(n channels)
CNN for MNIST
class Model(torch.nn.Module):
    """CNN for MNIST: two conv/pool/dropout stages followed by three FC layers.

    Args:
        keep_ratio: probability of KEEPING a unit in dropout. The original
            slide read an undefined global `keep_ratio`; it is now a
            constructor argument with a default, which is backward-compatible
            for any caller that also defined the global.
    """

    def __init__(self, keep_ratio=0.7):
        super(Model, self).__init__()
        # (bs, 1, 28, 28) -> conv(5x5, pad 2) -> (bs, 6, 28, 28) -> pool -> (bs, 6, 14, 14)
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Dropout(1 - keep_ratio))  # Dropout takes the DROP probability
        # (bs, 6, 14, 14) -> conv(3x3, pad 1) -> (bs, 9, 14, 14) -> pool -> (bs, 9, 7, 7)
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(in_channels=6, out_channels=9, kernel_size=3, stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool2d(kernel_size=2, stride=2),
            torch.nn.Dropout(1 - keep_ratio))
        # FC head: 441 (= 7*7*9) -> 256 -> 128 -> 10 class logits.
        self.layer3 = torch.nn.Sequential(torch.nn.Linear(7 * 7 * 9, 256), torch.nn.ReLU())
        self.layer4 = torch.nn.Sequential(torch.nn.Linear(256, 128), torch.nn.ReLU())
        self.layer5 = torch.nn.Linear(128, 10)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = x.reshape(x.size(0), -1)  # flatten (bs, 9, 7, 7) -> (bs, 441)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        return x
CNN for CIFAR-10
class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() # (bs, 32, 32, 3) -> conv -> (bs, 32, 32, 6) -> pool -> (bs, 16, 16, 6) self.layer1 = torch.nn.Sequential(torch.nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5, stride=1, padding=2), torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=2, stride=2), torch.nn.Dropout(1 - keep_ratio))
# (bs, 16, 16, 6) -> conv -> (bs, 16, 16, 9) -> pool -> (bs, 8, 8, 9)
self.layer2 = torch.nn.Sequential(
torch.nn.Conv2d(in_channels=6, out_channels=9, kernel_size=3, stride=1, padding=1), torch.nn.ReLU(),
torch.nn.MaxPool2d(kernel_size=2, stride=2), torch.nn.Dropout(1 - keep_ratio))