PyTorch

(1)

PyTorch

Wanho Choi

(2)

(3)

(4)

PyTorch

Open source and free machine learning library based on the Torch library

Developed by Facebook’s AI Research (FAIR) group

(5)

PyTorch vs TensorFlow

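• PyTorch

‣

Define-by-Run (the computation graph is built dynamically while the code runs)

• TensorFlow

‣

Define-and-Run (the computation graph is defined first and then executed; TensorFlow 1.x)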

: peculiar framework

(6)

PyTorch vs TensorFlow

(7)

(8)

Version

import torch

print(torch.__version__) # 1.1.0

Facebook CTO Mike Schroepfer announces the release of PyTorch 1.0 at Facebook developer conference F8 on May 2, 2018 at the McEnery Convention Center in San Jose, California (Image Credit: Facebook)

(9)

Tensor

• Simply

‣

Multi-dimensional array

‣

Generalized matrix

• Strictly

‣

Matrix: just a collection of numbers inside brackets

‣

Tensors have some transformation properties when changing coordinate system.

• In PyTorch

‣

Unit of data

(10)

Tensor

0D tensor

1D tensor

4D tensor

2D tensor

3D tensor

scalar

vector

matrix

cube

a vector of cubes

(11)

Tensor

0D tensor

1D tensor

4D tensor

2D tensor

3D tensor

variable

array

gray scale image

RGB image

(12)

Tensor: shape

import torch

a = torch.tensor(3.14)

print(a) # tensor(3.1400)

print(a.shape, a.size()) # torch.Size([]) torch.Size([]) b = torch.tensor([1.414])

print(b) # tensor([1.4140])

print(b.shape, b.size()) # torch.Size([1]) torch.Size([1]) c = torch.tensor([1., 2., 3.])

print(c) # tensor([1., 2., 3.])

print(c.shape, c.size()) # torch.Size([3]) torch.Size([3]) d = torch.tensor([[1, 2], [3, 4], [5, 6]])

print(d) # tensor([[1, 2], [3, 4], [5, 6]])

print(d.shape, d.size()) # torch.Size([3, 2]) torch.Size([3, 2]) e = torch.tensor([[[1, 2, 3], [3, 4, 5]], [[5, 6, 7], [7, 8, 9]]])

print(e) # tensor([[[1, 2, 3], [3, 4, 5]], [[5, 6, 7], [7, 8, 9]]]) print(e.shape, e.size()) # torch.Size([2, 2, 3]) torch.Size([2, 2, 3])

print(e.shape[0], e.shape[1], e.shape[2]) # 2 2 3 print(e.size(0), e.size(1), e.size(2)) # 2 2 3

(13)

Tensor: dtype

import numpy as np import torch a = np.array([[1, 2], [3, 4]]) print(a.dtype) # int64 b = np.array([[1., 2.], [3., 4.]]) print(b.dtype) # float64 aa = torch.from_numpy(a) print(aa.dtype) # torch.int64 bb = torch.from_numpy(b) print(bb.dtype) # torch.float64 aa = torch.from_numpy(a).float() print(aa.dtype) # torch.float32 aa = torch.FloatTensor(a) print(aa.dtype) # torch.float32 a = aa.int() print(a.dtype) # torch.int32

(14)

Tensor: data

import torch

a = torch.tensor([[1, 2], [3, 4]])

print(type(a)) # <class ‘torch.Tensor'> print(a) # tensor([[1, 2], [3, 4]]) print(a[0][0], a[0][1]) # tensor(1) tensor(2)

print(a[1][0], a[1][1]) # tensor(3) tensor(4) print(a[0][0].item()) # 1

print(a[0][1].item()) # 2 print(a[1][0].item()) # 3 print(a[1][1].item()) # 4

b = a.data

print(type(b)) # <class ‘torch.Tensor'> print(b) # tensor([[1, 2], [3, 4]]) print(b[0][0], b[0][1]) # tensor(1) tensor(2)

print(b[1][0], b[1][1]) # tensor(3) tensor(4) print(b[0][0].item()) # 1

print(b[0][1].item()) # 2 print(b[1][0].item()) # 3 print(b[1][1].item()) # 4

(15)

Interoperability with NumPy

import numpy as np import torch

a = [[1, 2], [3, 4]]

print(type(a)) # <class ‘list> b = np.array(a)

print(type(b)) # <class ‘numpy.ndarray> c = torch.tensor(b)

print(type(c)) # <class ‘torch.Tensor> c = torch.from_numpy(b)

print(type(c)) # <class ‘torch.Tensor> c = torch.as_tensor(b)

print(type(c)) # <class ‘torch.Tensor> d = c.numpy()

print(type(d)) # <class 'numpy.ndarray>

NumPy

Array

PyTorch

Tensor

b = torch.

from_numpy

(a)

a = b.

numpy

()

b = torch.

tensor

(a)

(16)

Copy vs Reference

• PyTorch tensors created with torch.from_numpy() or torch.as_tensor() share the memory buffer of the source NumPy ndarray.

• Thus, changing one will affect the other.

• torch.tensor() allocates new memory by copying the ndarray.

• torch.Tensor() is an alias for the default tensor type, e.g. torch.FloatTensor().

• If you want to avoid a copy, use torch.from_numpy() or torch.as_tensor().

(17)

Copy vs Reference

import numpy as np import torch a = np.array([[1.]]) b = torch.tensor(a) # copy c = torch.FloatTensor(a) # copy d = torch.from_numpy(a) # reference e = torch.as_tensor(a) # reference print(b.item()) # 1.0 print(c.item()) # 1.0 print(d.item()) # 1.0 print(e.item()) # 1.0 a[0][0] = 2 print(b.item()) # 1.0 print(c.item()) # 1.0 print(d.item()) # 2.0 print(e.item()) # 2.0 a[0][0] = 0 b[0][0] = 111 print(a[0][0]) # 0.0 c[0][0] = 222 print(a[0][0]) # 0.0 d[0][0] = 333 print(a[0][0]) # 333 e[0][0] = 444 print(a[0][0]) # 444

a = np.array([[12345.]]) # a new object print(b.item()) # 111.0 print(c.item()) # 222.0 print(d.item()) # 333.0 print(e.item()) # 444.0

2/2

1/2

(18)

Tensors on GPU

import numpy as np import torch a = np.array([1, 2, 3]) c = torch.from_numpy(a).float() print(c) # tensor([1., 2., 3.]) c = torch.from_numpy(a).float().to(‘cpu’) print(c) # tensor([1., 2., 3.]) g = torch.from_numpy(a).float().to(‘cuda’)

print(g) # tensor([1., 2., 3.], device=‘cuda:0’)

DEVICE = ‘cuda’ if torch.cuda.is_available() else ‘cpu’ g = torch.from_numpy(a).float().to(DEVICE)

print(g) # tensor([1., 2., 3.], device=‘cuda:0’) if torch.cuda.is_available():

print(torch.cuda.device_count()) # 1: the number of GPUs available

print(torch.cuda.current_device()) # 0: the index of a currently selected device

(19)

(20)

Random Tensor

import torch a = torch.randn(1) # 1 by 1 tensor b = torch.randn(1) # 1 by 1 tensor print(a, b) c = torch.randn(2, 3) # 2 by 3 tensor print(c)

# above this you will get different results for each run torch.manual_seed(123)

# blow this will always produce the same results a = torch.randn(1) # 1 by 1 tensor b = torch.randn(1) # 1 by 1 tensor print(a, b) c = torch.randn(2, 3) # 2 by 3 tensor print(c)

Uniform Distribution

frequency

torch.rand()

68%

95%

Normal Distribution

frequency

torch.randn()

(21)

Matrix Multiplication

[

1 2

3 4

5 6] [

1 2 3

4 5 6] =

9 12 15

19 26 33

29 40 51

3 × 3

3 × 2

2 × 3

import torch a = torch.tensor([[1, 2], [3, 4], [5, 6]]) print(a.shape) # torch.Size([3, 2]) b = torch.tensor([[1, 2, 3], [4, 5, 6]]) print(b.shape) # torch.Size([2, 3]) c = torch.mm(a, b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = a.mm(b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = torch.matmul(a, b) print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]]) c = a @ b print(c) # tensor([[ 9, 12, 15], [19, 26, 33], [29, 40, 51]])

(22)

Autograd: Automatic Differentiation

An engine for computing gradients (or Jacobians).

Autograd automatically calculates the gradients (or Jacobians) by applying the chain rule,

tracing from the root to the leaves of the graph.

If you set a Tensor’s .requires_grad attribute to True, autograd starts tracking all operations on it.

When you finish your computation, you can call backward() and have all the gradients (or

Jacobians) computed automatically.

The results are accumulated into the tensor’s .grad attribute.

(23)

.requires_grad

Every Tensor has

a flag: .requires_grad.

Default value: False

If False, the tensor is excluded from the gradient computation.

Tensors that must be included in the gradient computation must have this flag explicitly set to True.

input.requires_grad = False ➜ output.requires_grad = False (automatically)

input.requires_grad = True ➜ output.requires_grad = True (automatically)

This flag can also be set when the tensor is first created, and

can be changed later.

(24)

.requires_grad

import torch

DEVICE = 'cuda' if torch.cuda.is_available() else ‘cpu' a = torch.rand(1, requires_grad=True).to(DEVICE)

b = torch.rand(1, requires_grad=True).to(DEVICE)

print(a.requires_grad, b.requires_grad) # True True a = torch.rand(1, dtype=torch.float).to(DEVICE)

b = torch.rand(1, dtype=torch.float).to(DEVICE)

print(a.requires_grad, b.requires_grad) # False False a.requires_grad_()

b.requires_grad_()

(25)

.requires_grad

import torch

x = torch.tensor([1.0]) # requires_grad=False by default y = torch.tensor([2.0]) # requires_grad=False by default z = torch.tensor([3.0], requires_grad=True) a = x + y print(a.requires_grad) # False b = a + z print(b.requires_grad) # True a.requires_grad_(True) print(a.requires_grad) # True

1 x

2 y

3 z

3 a

6 b

False

True

(26)

Autograd

import torch

x = torch.tensor(2.0, requires_grad=True)

print(x) # tensor(2., requires_grad=True) y = 3*x*x + 4*x + 5

print(y) # tensor(25., grad_fn=<AddBackward0>) print(x.grad) # None

y.backward() # compute gradients print(x.grad) # tensor(16.)

$x = 2$

$y = 3x^2 + 4x + 5 = 25$

$\frac{\partial y}{\partial x} = 6x + 4 = 16$ at $x = 2$

(27)

Autograd

import torch x = torch.ones(2, 2, requires_grad=True) y = x + 3 z = y*y + 1 f = z.mean()

print(f) # tensor(17., grad_fn=<AddBackward0>) f.backward() # gradient computation

print(x.grad) # tensor([[2., 2.], [2., 2.]])

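$x = \begin{bmatrix} 1 & 1 \\ 1 & 1 \end{bmatrix}$, $\quad y = x + 3 = \begin{bmatrix} 4 & 4 \\ 4 & 4 \end{bmatrix}$, $\quad z = y^2 + 1 = \begin{bmatrix} 17 & 17 \\ 17 & 17 \end{bmatrix}$, $\quad f = \frac{1}{4}\sum_{i=1}^{4} z_i = 17$

Computation graph: x → y → z → f

$\frac{\partial f}{\partial x_i} = \frac{\partial f}{\partial z_i}\,\frac{\partial z_i}{\partial y_i}\,\frac{\partial y_i}{\partial x_i} = \frac{1}{4} \times 2y_i \times 1 = \frac{1}{2}\,y_i = \frac{1}{2}(x_i + 3)$

When $x_i = 1$: $\frac{\partial f}{\partial x_i} = 2$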

(28)

Autograd

import torch x = torch.tensor(3.0) w = torch.tensor(4.0, requires_grad=True) b = torch.tensor(5.0, requires_grad=True) print(x.item()) # 3.0 print(w.item()) # 4.0 print(b.item()) # 5.0 y = w * x + b print(y.item()) # 17.0

y.backward() # gradient computation print(w.grad.item()) # 3.0 print(b.grad.item()) # 1.0

$y = wx + b$, $\quad \frac{\partial y}{\partial w} = x$, $\quad \frac{\partial y}{\partial b} = 1$

$17 = 4 \times 3 + 5$

(29)

no_grad()

import torch x = torch.randn(3, requires_grad=True) print(x.requires_grad) # True y = x + 1 print(y.requires_grad) # True

# not to calculate the gradient for the variable z

with torch.no_grad():

z = x + 1

(30)

Tensor vs Variable

Variables are wrappers for Tensors.

Variable = Tensor + autograd support (= gradient computation)

Variables are part of the autograd package.

The Variable API has been deprecated:

Variables are no longer necessary to use autograd with Tensors.

Autograd automatically supports Tensors with .requires_grad set to True.

(31)

In-Place Operations

In PyTorch, many methods exist in two versions:

: with / without an underscore (_) suffix

ex) add(…), add_(…)

The underscore (_) indicates an in-place operation in PyTorch.

Methods that end in an underscore (_) change the tensor in-place.

In general, in-place operations increase performance, but in PyTorch they can lead to problems (e.g. with autograd) and even

worse performance.

In most cases it is recommended not to use in-place operations, since they rarely improve efficiency.

(32)

In-Place Operations

(33)

In-Place Operations

import torch

a = torch.tensor([1, 2, 3])

print(id(a)) # THE RESULT: (1) a += 10

print(id(a)) # THE RESULT: (2) a = a + 10

print(id(a)) # THE RESULT: (3)

# (1) and (2) are same, but (3) different from them. # (2): in-place operator (same object)

# (3): a new object was created a = torch.tensor([1, 2, 3]) b = a + 10 print(b) # tensor([11, 12, 13]) print(a is b) # False b = a.add(10) print(b) # tensor([11, 12, 13]) print(a is b) # False b = a.add_(10) print(b) # tensor([11, 12, 13]) print(a is b) # True

(34)

(35)

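With $E_i = y_i - (w x_i + b)$ and $C = \frac{1}{2N}\sum_{i=1}^{N} E_i^2$:

$\frac{\partial}{\partial b}\left(\tfrac{1}{2}E_i^2\right) = E_i \cdot \frac{\partial E_i}{\partial b} = E_i \cdot (-1) = -E_i$

$\frac{\partial C}{\partial b} = \frac{1}{N}\sum_{i=1}^{N} (-E_i)$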

(37)

Pure NumPy Implementation

import numpy as np # y = w*x+b: w=2, b=1 x = np.array([1, 2, 3, 4, 5], dtype=‘float32') y = np.array([3, 5, 7, 9, 11], dtype=‘float32') w = np.random.randn(1) # weight b = np.random.randn(1) # bias

for epoch in range(10000): # iteration # prediction

y_predicted = w * x + b error = y - y_predicted

# gradient computation (manually) w_grad = (-x * error).mean() b_grad = (-error).mean() # update w -= 0.01 * w_grad b -= 0.01 * b_grad print(w, b) # [2.00000001] [0.99999998]

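$\frac{\partial C}{\partial w} = \frac{1}{N}\sum_{i=1}^{N} (-x_i E_i)$

$\frac{\partial C}{\partial b} = \frac{1}{N}\sum_{i=1}^{N} (-E_i)$

where $E_i = y_i - \hat{y}_i$ is the prediction error of the $i$-th sample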

(38)

Pure NumPy Implementation

import numpy as np # y = w*x+b: w=2, b=1 x = np.array([1, 2, 3, 4, 5], dtype=‘float32') y = np.array([3, 5, 7, 9, 11], dtype=‘float32') w = np.random.randn(1) # weight b = np.random.randn(1) # bias

y_predicted = w * x + b error = y - y_predicted

# gradient computation (manually) w_grad = (-x * error).mean() b_grad = (-error).mean() # update w -= 0.01 * w_grad b -= 0.01 * b_grad print(w, b) # [2.00000001] [0.99999998]

$w_{n+1} = w_n - \alpha \frac{\partial E}{\partial w}$

$b_{n+1} = b_n - \alpha \frac{\partial E}{\partial b}$

y

_i

)

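$\begin{bmatrix} \sum_{i=1}^{N} x_i^2 & \sum_{i=1}^{N} x_i \\ \sum_{i=1}^{N} x_i & N \end{bmatrix} \begin{bmatrix} a \\ b \end{bmatrix} = \begin{bmatrix} \sum_{i=1}^{N} x_i y_i \\ \sum_{i=1}^{N} y_i \end{bmatrix}$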

$Ax = b$

$f(x) = \frac{1}{2} x^T A x - b^T x + c$

$\frac{\partial f}{\partial x} = 0 \;\Rightarrow\; Ax = b$

_a

b

(2,1)

energy contours steepest direction

initial random guess

(40)

LinearRegressionCommon.py

# how many points? N = 1000

# the ground-truth values W = 2.0 # weight

B = 1.0 # bias

def Data():

np.random.seed(13) # random seed

x = np.random.rand(N, 1) # input: 1D array

noise = 0.1 * np.random.rand(N, 1) # noise: 1D array

y = (W * x + B) + noise # outout that has some noise

indices = np.arange(N) # point indices

np.random.shuffle(indices) # shuffled indices

train_indices = indices[:round(0.8*N)] # the first 80 random indices for train set

valid_indices = indices[round(0.8*N):] # the the remaining indices for validation set x_train, y_train = x[train_indices], y[train_indices]

x_valid, y_valid = x[valid_indices], y[valid_indices] return x_train, y_train, x_valid, y_valid

(41)

Example #1

from LinearRegressionCommon import *

w = np.random.randn(1) # weight

b = np.random.randn(1) # bias

x_train, y_train, _, _ = Data()

y_predicted = w * x_train + b error = y_train - y_predicted

# gradient computation (manually) w_grad = (-x_train * error).mean() b_grad = (-error).mean()

# update

w -= 0.01 * w_grad b -= 0.01 * b_grad

(42)

Example #2

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)

x_train, y_train, _, _ = Data()

x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)

for epoch in range(10000):

y_predicted = w * x_train + b error = y_train - y_predicted cost = (error*error).mean()

cost.backward() # gradient computation (automatically)

with torch.no_grad(): w -= 0.01 * w.grad b -= 0.01 * b.grad w.grad.zero_() b.grad.zero_() print(w.item(), b.item()) # 2.003204107284546 1.048282265663147

(43)

Example #3

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) x_train, y_train, _, _ = Data()

x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)

optimizer = torch.optim.SGD([w, b], lr=0.01)

for epoch in range(10000):

y_predicted = w * x_train + b error = y_train - y_predicted cost = (error*error).mean()

cost.backward() optimizer.step()

optimizer.zero_grad()

(44)

Example #4

w = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) b = torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE) x_train, y_train, _, _ = Data()

x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE)

CostFunc = torch.nn.MSELoss()

optimizer = torch.optim.SGD([w, b], lr=0.01) for epoch in range(10000):

y_predicted = w * x_train + b

cost = CostFunc(y_train, y_predicted) cost.backward()

optimizer.step()

optimizer.zero_grad()

(45)

Example #5

class Model(torch.nn.Module): def __init__(self):

super().__init__()

self.w = torch.nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)) self.b = torch.nn.Parameter(torch.randn(1, requires_grad=True, dtype=torch.float, device=DEVICE)) def forward(self, x):

return self.w * x + self.b

x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE) model = Model().to(DEVICE) CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

1/2

(46)

Example #5

for epoch in range(10000): model.train()

y_predicted = model(x_train)

cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() print(model.state_dict()) print(model.w, model.b) print(model.w.item(), model.b.item()) # 2.003204107284546 1.048282265663147

2/2

(47)

Example #6

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' class Model(torch.nn.Module): def __init__(self): super().__init__() self.layer = torch.nn.Linear(1, 1) def forward(self, x): return self.layer(x)

x_train = torch.from_numpy(x_train).float().to(DEVICE) y_train = torch.from_numpy(y_train).float().to(DEVICE) model = Model().to(DEVICE) CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

1/2

(48)

Example #6

for epoch in range(10000): model.train()

y_predicted = model(x_train)

cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() print(model.state_dict()) print(model.layer.weight, model.layer.bias) print(model.layer.weight.item(), model.layer.bias.item()) # 2.003204107284546 1.048282265663147

2/2

(49)

Practical Example

https://medium.com/dsnet/linear-regression-with-pytorch-3dde91d60b50

Input

Layer

Hidden

Layer

Output

Layer

(50)

class Model(torch.nn.Module): def __init__(self):

super().__init__()

self.layer = torch.nn.Linear(3, 2) # 3: inputs, 2: outputs def forward(self, x):

return self.layer(x)

# input data: (temperature, rainfall, humidity)

x_train = np.array([[73, 67, 43], [91, 88, 64], [87, 134, 58], [102, 43, 37], [69, 96, 70]])

# output data: (apples, oranges)

y_train = np.array([[56, 70], [81, 101], [119, 133], [22, 37], [103, 119]]) x_train = torch.from_numpy(x_train).float() y_train = torch.from_numpy(y_train).float() model = Model() CostFunc = torch.nn.MSELoss() optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

1/2

Practical Example

(51)

for epoch in range(10000): model.train()

y_predicted = model(x_train)

cost = CostFunc(y_train, y_predicted) cost.backward() optimizer.step() optimizer.zero_grad() # test print(model(x_train)) # prediction x_test = np.array([[80, 70, 50]]) x_test = torch.from_numpy(x_test).float(); print(model(x_test))

2/2

Practical Example

(52)

(53)

XOR Problem

https://mc.ai/intro-to-deep-learning-with-pytorch-part-1/

(54)

XOR Problem

• Minsky and Papert proved

(55)

torch.nn.Sequential

• A container that contains other modules

• It chains a series of modules, applying them in the order they are given.

(56)

import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1 model = torch.nn.Sequential( torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), torch.nn.ReLU(), torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM),

torch.nn.Sigmoid()) # MUST for non-linearity

CostFunc = torch.nn.BCELoss() # Binary Cross Entropy Loss optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

for epoch in range(10000): Y_predicted = model(X) cost = CostFunc(Y_predicted, Y) cost.backward() optimizer.step() model.zero_grad() Y_predicted = model(X) print(np.squeeze(Y_predicted.detach().numpy())) # [0.01351878 0.98831743 0.9887106 0.01278798] print(np.squeeze((Y_predicted+0.5).int().detach().numpy())) # [0 1 1 0]

(57)

import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1

linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) actfnc1 = torch.nn.ReLU()

linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM)

actfnc2 = torch.nn.Sigmoid() # MUST for non-linearity

model = torch.nn.Sequential(linear1, actfnc1, linear2, actfnc2)

for epoch in range(10000): Y_predicted = model(X) cost = CostFunc(Y_predicted, Y) cost.backward() optimizer.step() model.zero_grad() Y_predicted = model(X) print(np.squeeze(Y_predicted.detach().numpy())) # [0.02886132 0.9477684 0.9471025 0.07047193] print(np.squeeze((Y_predicted+0.5).int().detach().numpy())) # [0 1 1 0]

(58)

class Model(torch.nn.Module)

• It contains two main methods.

• The first method (__init__) defines the layer components of the network.

• In the second method (forward), we wire the network together and put every component in the desired

order.

(59)

import numpy as np import torch X = torch.FloatTensor([[0, 0], [0, 1], [1, 0], [1, 1]]) Y = torch.FloatTensor([[0], [1], [1], [0]]) INPUT_DIM = 2 HIDDEN_DIM = 10 OUTPUT_DIM = 1 class Model(torch.nn.Module):

def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super().__init__()

self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfnc1 = torch.nn.ReLU()

self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) self.actfnc2 = torch.nn.Sigmoid()

def forward(self, x):

x = self.actfnc1( self.linear1(x) ) x = self.actfnc2( self.linear2(x) ) return x

model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)

. . .

(60)

class Model(torch.nn.Module):

self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def forward(self, x):

x = torch.relu ( self.linear1(x) ) x = torch.sigmoid( self.linear2(x) ) return x

self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfnc1 = torch.nn.ReLU()

self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) self.actfnc2 = torch.nn.Sigmoid() def forward(self, x): x = self.actfnc1( self.linear1(x) ) x = self.actfnc2( self.linear2(x) ) return x

Method #1

Method #2

(61)

self.layer1 = torch.nn.Sequential( torch.nn.Linear(INPUT_DIM, HIDDEN_DIM), torch.nn.ReLU() ) self.layer2 = torch.nn.Sequential( torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM), torch.nn.Sigmoid() ) def forward(self, x):

x = self.layer1(x) x = self.layer2(x) return x

self.linear1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.linear2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def layer1(self, x):

return torch.relu( self.linear1(x) ) def layer2(self, x):

return torch.sigmoid( self.linear2(x) ) def forward(self, x): x = self.layer1(x) x = self.layer2(x) return x

Method #3

Method #4

(62)

MNIST

(63)

torchvision

• The torchvision package consists of popular datasets, model architectures, and common

image transformations for computer vision.

‣

torchvision.datasets: MNIST, Fashion-MNIST, CIFAR, etc.

‣

torchvision.io: video

‣

torchvision.models: classification, object detection, etc.

(64)

torchvision.transforms.Compose

• It creates a series of transformations.

• It composes several transformations together.

• All the transformations in the Compose are applied to the input data one by one.

import torchvision

normalize = torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

transformations = torchvision.transforms.Compose( [torchvision.transforms.RandomHorizontalFlip(), torchvision.transforms.RandomVerticalFlip(),

torchvision.transforms.ToTensor(), normalize] )

(65)

Image Normalization

• Normalization reduces the skewness, and helps to learn faster and better.

• If the given dataset is already in range [0.0, 1.0], you can skip the normalization.

• image = ( image - mean ) / std

• (mean, std) = (0.5, 0.5) ➜ image = ( image − 0.5 ) / 0.5

: (0.0, 1.0) range ➜ (−1.0, +1.0) range

• torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])

: one 0.5 value per channel (Red, Green, Blue)

(66)

import numpy as np

import torch, torchvision

pixels = np.random.randint(low=0, high=256, size=(5, 5, 3)) # 5x5 RGB image

print(type(pixels)) # <class ‘numpy.ndarray'> print(np.min(pixels), ' ~ ', np.max(pixels)) # 0 ~ 255

pixels = pixels.astype('float32') / 255 # normalization: [0, 255] to [0.0, 1.0] print(type(pixels)) # <class ‘numpy.ndarray'>

print(np.min(pixels), ' ~ ', np.max(pixels)) # 0.0 ~ 1.0

image = torch.from_numpy(pixels) # to a tensor

print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # 0.0 ~ 1.0

# transforms = torch vision.transforms.ToTensor()

transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

image = transforms(pixels) # apply transformations print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # 0.0 ~ 1.0

transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor(),

torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) image = transforms(pixels) # apply transformations

print(type(image)) # <class ‘torch.Tensor'> print(torch.min(image).item(), ' ~ ', torch.max(image).item()) # -1.0 ~ 1.0

image = torchvision.transforms.functional.normalize(image, mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))

(67)

MNIST

import numpy as np

import torch, torchvision

from matplotlib import pyplot as plt

transforms = torchvision.transforms.Compose([torchvision.transforms.ToTensor()]) bs = 64 # batch size

train_dataset = torchvision.datasets.MNIST(root='./data/', train=True, transform=transforms, download=True)

test_dataset = torchvision.datasets.MNIST(root='./data/', train=False, transform=transforms)

print(train_dataset.data.shape) # torch.Size([60000, 28, 28]) print(len(train_dataset), len(test_dataset)) # 60000 10000

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, BATCH_SIZE=bs, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, BATCH_SIZE=bs, shuffle=True)

print(train_dataloader.dataset.data.shape) # torch.Size([60000, 28, 28])

print(len(train_dataloader), len(test_dataloader)) # 938=round(60000/bs), 157=round(10000/bs) print(len(train_dataloader.dataset), len(test_dataloader.dataset)) # 60000 10000

(68)

MNIST

for batch_index, (images, labels) in enumerate(train_dataloader): print(labels.shape) # torch.Size([64])

print(images.shape) # torch.Size([64, 1, 28, 28]) print(images[0].shape) # torch.Size([1, 28, 28])

images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 28, 28, 1])

print(images[0].shape) # torch.Size([28, 28, 1])

plt.imshow(images[0].reshape((28, 28)), cmap=‘gray') plt.show()

break

for batch_index in range(len(train_dataset)): itr = iter(train_dataloader)

images, labels = itr.next()

print(labels.shape) # torch.Size([64])

images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 28, 28, 1]) print(images[0].shape) # torch.Size([28, 28, 1]) plt.imshow(images[0].reshape((28, 28)), cmap=‘gray') plt.show() break

2/2

(69)

(70)

CIFAR-10

import numpy as np

train_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=True, transform=transforms, download=True)

test_dataset = torchvision.datasets.CIFAR10(root='./data/CIFAR10', train=False, transform=transforms)

print(train_dataset.data.shape) # (50000, 32, 32, 3)

print(len(train_dataset), len(test_dataset)) # 50000 10000

print(train_dataloader.dataset.data.shape) # (50000, 32, 32, 3)

(71)

CIFAR-10

print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3)))

plt.show() break

images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3]) print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3))) plt.show() break

2/2

(72)

(73)

CIFAR-100

import numpy as np

test_dataset = torchvision.datasets.CIFAR100(root=‘./data/CIFAR100', train=False, transform=transforms)

print(train_dataset.data.shape) # (50000, 32, 32, 3)

print(len(train_dataset), len(test_dataset)) # 50000 10000

print(train_dataloader.dataset.data.shape) # (50000, 32, 32, 3)

(74)

CIFAR-100

print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3)))

plt.show() break

images = np.transpose(images, (0,2,3,1)) # channel first order -> channel last order print(images.shape) # torch.Size([64, 32, 32, 3]) print(images[0].shape) # torch.Size([32, 32, 3]) plt.imshow(images[0].reshape((32, 32, 3))) plt.show() break

2/2

(75)

(76)

(77)

Accuracy Evaluation

import torch

data = torch.randn(3, 10) print(data.numpy())

-3.7282 -0.7150 -0.0891 -0.1889 -0.7913 -0.8726 -0.1097 1.9349 0.1923 -0.4194 -1.9478 0.7762 0.9239 0.3441 0.0412 0.5557 0.9953 2.2492 -0.4234 1.8282 0.1751 1.1866 0.7951 -0.8284 0.1223 -1.005 -1.3221 1.8195 -0.6615 1.9349 8

values

indices

2.2492 8 1.8282 0

max.

(80)

Accuracy Evaluation

import torch

values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())

accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1

a

b

0 1 1 1 0 1 1 1 0 1

(81)

Accuracy Evaluation

import torch

values, indices = torch.max(data.data, 1) print(values.numpy(), indices.numpy()) a = torch.tensor([1, 0, 1, 0, 1, 0]) b = torch.tensor([1, 1, 1, 1, 1, 1]) c = ( a == b ) print(c.numpy()) print(c.sum().item())

accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1

a

b

0 1 1 1 0 1 1 1 0 1 1

c

c.sum().item()

➜

3

0 1 0 1 0

(82)

Accuracy Evaluation

import torch

accuracy = 100 * c.sum().item() / len(c) print(accuracy, '%') 1 1

a

b

0 1 1 1 0 1 1 1 0 1 1

c

c.sum().item()

➜

3 accuracy

=

50.0 %

0 1 0 1 0

(83)

MNIST Classification

DEVICE = 'cuda' if torch.cuda.is_available() else ‘cpu' INPUT_DIM = 784 # = 28 x 28

HIDDEN_DIM = 100

OUTPUT_DIM = 10 # the number of classes TOTAL_EPOCHS = 10

LEARNING_RATE = 0.01 BATCH_SIZE = 2000

train_dataset = torchvision.datasets.MNIST(root='./data/', train=True, transform=transforms, download=True)

test_dataset = torchvision.datasets.MNIST(root='./data/', train=False, transform=transforms)

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, BATCH_SIZE=BATCH_SIZE, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, BATCH_SIZE=BATCH_SIZE, shuffle=True)

(84)

MNIST Classification

def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super(Model, self).__init__()

self.layer1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfn1 = torch.nn.ReLU()

self.layer2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def forward(self, x):

y1 = self.actfn1(self.layer1(x)) y2 = self.layer2(y1)

return y2

model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)

CostFunc = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

(85)

MNIST Classification

for epoch in range(TOTAL_EPOCHS):

for images, labels in train_dataloader:

images = images.reshape(-1, 784).to(DEVICE) # flattening labels = labels.to(DEVICE)

output = model(images)

cost = CostFunc(output, labels) cost.backward()

optimizer.step()

print('Cost: {:.4f}’.format(cost.item()))

# for the test, you don't need to do the gradient computation. with torch.no_grad():

correct = 0

for images, labels in test_dataloader:

_, predicted = torch.max(output.data, 1)

correct += (predicted == labels).sum().item()

print('Accuracy: {} %'.format(100 * correct / 10000)) # Accuracy: 97.36 %

(86)

CIFAR-10 Classification

DEVICE = 'cuda' if torch.cuda.is_available() else ‘cpu' INPUT_DIM = 3072 # = 32 x 32 x 3

HIDDEN_DIM = 100

OUTPUT_DIM = 10 # the number of classes TOTAL_EPOCHS = 10

LEARNING_RATE = 0.01 BATCH_SIZE = 2000

test_dataset = torchvision.datasets.CIFAR10(root=‘./data/CIFAR10', train=False, transform=transforms)

train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, BATCH_SIZE=BATCH_SIZE, shuffle=True) test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, BATCH_SIZE=BATCH_SIZE, shuffle=True)

(87)

CIFAR-10 Classification

def __init__(self, INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM): super(Model, self).__init__()

self.layer1 = torch.nn.Linear(INPUT_DIM, HIDDEN_DIM) self.actfn1 = torch.nn.ReLU()

self.layer2 = torch.nn.Linear(HIDDEN_DIM, OUTPUT_DIM) def forward(self, x):

y1 = self.actfn1(self.layer1(x)) y2 = self.layer2(y1)

return y2

model = Model(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM).to(DEVICE)

CostFunc = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

(88)

CIFAR-10 Classification

for epoch in range(TOTAL_EPOCHS):

for images, labels in train_dataloader:

cost = CostFunc(output, labels) cost.backward()

optimizer.step()

print('Cost: {:.4f}’.format(cost.item()))

# for the test, you don't need to do the gradient computation. with torch.no_grad():

correct = 0

for images, labels in test_dataloader:

_, predicted = torch.max(output.data, 1)

correct += (predicted == labels).sum().item()

print('Accuracy: {} %'.format(100 * correct / 10000)) # Accuracy: 21.15 %

(89)

Problems

• Slow convergence

• Too low accuracy

• Flattened input data

• In this process, the spatial information is lost.

(90)

(91)

CNN: Convolutional Neural Network

input image

feature maps

_{(n channels)}

feature maps

_{(m channels)}

activation maps

_{(m channels)}

_{layer softmax}

FC

dog

cat

bird

!

convolution

with n kernels

pooling

convolution

with m kernels

pooling

activation maps

(n channels)

(92)

CNN for MNIST

class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() # (bs, 28, 28, 1) -> conv -> (bs, 28, 28, 6) -> pool -> (bs, 14, 14, 6) self.layer1 = torch.nn.Sequential(

torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2), torch.nn.ReLU(),

torch.nn.MaxPool2d(kernel_size=2, stride=2), torch.nn.Dropout(1 - keep_ratio))

# (bs, 14, 14, 6) -> conv -> (bs, 14, 14, 9) -> pool -> (bs, 7, 7, 9)

self.layer2 = torch.nn.Sequential(

(93)

CNN for MNIST

# FC: 7x7x9 -> 256 self.layer3 = torch.nn.Sequential( torch.nn.Linear(7*7*9, 256), torch.nn.ReLU()) # FC: 256 -> 128 self.layer4 = torch.nn.Sequential( torch.nn.Linear(256, 128), torch.nn.ReLU()) # FC: 128 -> 10 self.layer5 = torch.nn.Linear(128, 10) def forward(self, x): x = self.layer1(x) x = self.layer2(x) x = x.reshape(x.size(0), -1) # flattening: 7x7x9 -> 441(=7x7x9) x = self.layer3(x) x = self.layer4(x) x = self.layer5(x) return x

2/2

(94)

CNN for CIFAR-10

class Model(torch.nn.Module): def __init__(self): super(Model, self).__init__() # (bs, 32, 32, 3) -> conv -> (bs, 32, 32, 6) -> pool -> (bs, 16, 16, 6) self.layer1 = torch.nn.Sequential(

# (bs, 16, 16, 6) -> conv -> (bs, 16, 16, 9) -> pool -> (bs, 8, 8, 9)

self.layer2 = torch.nn.Sequential(

(95)

CNN for CIFAR-10

# FC: 8x8x9 -> 256 self.layer3 = torch.nn.Sequential( torch.nn.Linear(8*8*9, 256), torch.nn.ReLU()) # FC: 256 -> 128 self.layer4 = torch.nn.Sequential( torch.nn.Linear(256, 128), torch.nn.ReLU()) # FC: 128 -> 10 self.layer5 = torch.nn.Linear(128, 10) def forward(self, x): x = self.layer1(x) x = self.layer2(x) x = x.reshape(x.size(0), -1) # flattening: 7x7x9 -> 441(=7x7x9) x = self.layer3(x) x = self.layer4(x) x = self.layer5(x) return x

2/2

(96)

(97)

(98)

(99)

(100)