
PyTorch

1 PyTorch in brief

1.1 Creating tensors

  • empty

    import torch
    x1 = torch.empty(5, 3)  # uninitialized 5x3 matrix
  • rand

    x2 = torch.rand(5, 3)  # 5x3 matrix with uniform random values in [0, 1)
  • randn

    a = torch.randn(3, 3)  # standard normal distribution (mean 0, std 1)
  • arange

    a = torch.arange(0, 10, 2)  # values from 0 to 10 (exclusive) with step 2

  • linspace

    a = torch.linspace(0, 10, steps=5)  # 5 evenly spaced values from 0 to 10
  • zeros & ones & eye

    x3 = torch.zeros(5, 3)  # default dtype is float32
    x4 = torch.zeros(5, 3, dtype=torch.long)

    a = torch.ones(3, 3)  # 3x3 matrix of ones

    a = torch.eye(3, 3)  # 3x3 identity matrix
  • full

    a = torch.full([2, 3], 7)  # 2x3 tensor filled with 7
  • From data

    x5 = torch.tensor([5.4, 3])  # create from data
    x6 = x5.new_ones(5, 3)  # same dtype as x5
    x7 = torch.randn_like(x6, dtype=torch.float)  # normal tensor with the same shape as x6

  • From NumPy

    import numpy as np

    a = np.array([2, 5.5])
    print(a)
    b = torch.from_numpy(a)
    print(b)

1.2 Operations

x = torch.rand(5, 3)
y = torch.rand(5, 3)

1.2.1 Attributes

  • Type

    # check the type of x
    xType1 = x.type()
    xType2 = x.dtype
    print('first: {}, second: {}'.format(xType1, xType2))

  • Shape

    # check the shape of a Tensor
    print(x.size())
    print(x.shape)

1.2.2 Numerical operations

  • Add

    # addition
    x + y
    x / y  # division
    torch.add(x, y)  # an output tensor can optionally be given: torch.add(x, y, out=result)
    y.add_(x)  # in-place addition

    In-place operations end with an underscore, e.g. x.copy_(y), x.t_(); such operations modify x itself.
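    A minimal sketch contrasting out-of-place and in-place addition (the values are only illustrative):

    import torch
    x = torch.ones(2, 2)
    y = torch.ones(2, 2)
    z = x + y    # out-of-place: x and y are unchanged, z is a new tensor
    y.add_(x)    # in-place: y itself now holds y + x
    print(z)     # tensor of 2s
    print(y)     # also a tensor of 2s; the original contents of y are overwritten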

  • Matrix multiplication

    a = torch.ones(2, 2) * 2
    b = torch.ones(2, 3)
    print(torch.mm(a, b))  # only works on 2-D tensors
    print(a @ b)
    a = torch.rand(4, 32, 28, 28)
    b = torch.rand(4, 32, 28, 16)
    print(torch.matmul(a, b).shape)  # works on higher-dimensional tensors; only the last two dimensions are multiplied

    mm only does matrix multiplication on 2-D tensors, while matmul also handles higher-dimensional tensors, multiplying only the last two dimensions and treating the leading ones as batch dimensions. Result:

    tensor([[4., 4., 4.],
            [4., 4., 4.]])
    tensor([[4., 4., 4.],
            [4., 4., 4.]])
    torch.Size([4, 32, 28, 16])

  • pow & sqrt

    a = torch.ones(2, 2) * 2
    # pow
    a.pow(2)
    a**2

    # sqrt
    a.sqrt()
    a**0.5

    Result:

    tensor([[4., 4.],
            [4., 4.]])
    tensor([[4., 4.],
            [4., 4.]])
    tensor([[1.4142, 1.4142],
            [1.4142, 1.4142]])
    tensor([[1.4142, 1.4142],
            [1.4142, 1.4142]])
  • exp & log

    # exp
    a = torch.exp(torch.ones(2, 2))  # element-wise e^x
    print(a)

    # log
    print(torch.log(a))  # logarithm, base e by default
  • Round

    a = torch.tensor(3.14)
    print(a.floor())  # round down
    print(a.ceil())   # round up
    print(a.trunc())  # integer part
    print(a.frac())   # fractional part

    • floor: round down;
    • ceil: round up;
    • trunc: keep the integer part;
    • frac: keep the fractional part.
  • clamp

    clamp limits the values of a tensor to a given range.

    a = torch.rand(2, 3) * 15
    print(a)
    print(a.clamp(2))      # minimum value limited to 2
    print(a.clamp(2, 10))  # values limited to the range [2, 10]

    Result:

    tensor([[ 0.7791,  4.7365,  4.2215],
            [12.7793, 11.7283, 13.1722]])
    tensor([[ 2.0000,  4.7365,  4.2215],
            [12.7793, 11.7283, 13.1722]])
    tensor([[ 2.0000,  4.7365,  4.2215],
            [10.0000, 10.0000, 10.0000]])
  • transpose

    # transpose
    y = torch.randn(2, 4)
    z = y.transpose(1, 0)

  • Converting to/from NumPy

    # conversion between numpy and tensor
    a = torch.ones(4)
    b = a.numpy()            # tensor to numpy
    c = np.ones(5)
    c = torch.from_numpy(c)  # numpy to tensor
    b[2] = 3  # shallow copy: a and b share memory, so both change
    # every Tensor on the CPU can be converted to and from a numpy array

  • Get values

    x.item()  # convert a single-element tensor to a Python number

1.2.3 Dimension operations

  • Slicing

    # slicing
    print(x[:, 1])
  • Concatenation: cat

    a = torch.rand(4, 32, 8)
    b = torch.rand(5, 32, 8)
    c = torch.cat([a, b], dim=0)
    print(c.shape)

    Concatenate along dimension 0; all dimensions other than the concatenation dimension must match. Result:

    torch.Size([9, 32, 8])
  • stack

    stack can also combine tensors; unlike cat, it creates a new dimension.

    a = torch.rand(5, 32, 8)
    b = torch.rand(5, 32, 8)
    c = torch.stack([a, b], dim=0)
    print(c.shape)

    A new dimension is created; the tensors being stacked must have identical shapes. Result:

    torch.Size([2, 5, 32, 8])
  • split

    Split a tensor into pieces of a specified length.

    a = torch.rand(6, 32, 8)
    b, c = a.split(3, dim=0)  # the argument is the size of each piece, along the given dimension
    print(b.shape)
    print(c.shape)

    split takes the size of each piece after splitting, along the specified dimension. Result:

    torch.Size([3, 32, 8])
    torch.Size([3, 32, 8])
  • chunk

    Split a tensor into a given number of chunks.

    a = torch.rand(6, 32, 8)
    b, c, d = a.chunk(3, dim=0)  # the argument is the number of chunks to split into, along the given dimension
    print(b.shape)
    print(c.shape)

    The argument is the number of chunks, i.e. how many pieces to split into. Result:

    torch.Size([2, 32, 8])
    torch.Size([2, 32, 8])

  • view / reshape

    x = torch.randn(4, 4)
    y = x.view(16)
    z = x.view(-1, 8)  # the size -1 is inferred from the other dimensions
    print(x.size(), y.size(), z.size())
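    Note that view requires the tensor's memory to be contiguous; reshape gives the same result here and copies the data only when necessary. A minimal sketch:

    z2 = x.reshape(-1, 8)  # same result as view above; copies only if x is not contiguous
    print(z2.size())       # torch.Size([2, 8])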

1.3 GPU usage

1.3.1 Basic functions

  • is_available

    torch.cuda.is_available()  # check whether CUDA is available
  • device

    device = torch.device('cuda')  # a CUDA device object; tensors can be moved onto it
  • Transfer tensor into CUDA

    x = torch.randn(4, 4)
    y = torch.ones_like(x, device=device)  # create a tensor directly on the GPU
    x = x.to(device)
    # or just use strings: .to("cuda")

    z = x + y
    print(z)
    print(z.to("cpu", torch.double))
    # move to the CPU before converting to numpy, because numpy only works with CPU memory

    .to() can also change the dtype at the same time; other ways to move a tensor are z.cuda() or z.to('cuda:0'). Before converting to numpy, the data must be on the CPU, because numpy runs on the CPU.

  • Full code

    if torch.cuda.is_available():
        device = torch.device("cuda")          # a CUDA device object
        x = torch.randn(4, 4)
        y = torch.ones_like(x, device=device)  # directly create a tensor on the GPU
        x = x.to(device)                       # or just use strings: .to("cuda")
        z = x + y
        print(z)
        print(z.to("cpu", torch.double))
        # .to can also change dtype at the same time, e.g. z.cuda() or z.to('cuda:0')

        y.to('cpu').data.numpy()
        y.cpu().data.numpy()  # move to the CPU before converting to numpy, since numpy runs on the CPU

1.4 Gradient

import torch
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)

y = w*x + b

y.backward()                   # compute gradients of y w.r.t. x, w and b
print(x.grad, w.grad, b.grad)  # dy/dx = w = 2, dy/dw = x = 1, dy/db = 1
  • e.g. a simple neural network
import numpy as np
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold the input and output data.
# requires_grad defaults to False, so no gradients are computed for x and y.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)
# print (x)

# Randomly initialize the weight Tensors.
# requires_grad=True tells autograd to compute gradients for them in the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for it in range(500):
    # Forward pass: compute predicted y.
    # This is the same forward pass as in an ordinary network, but we do not
    # need to keep the intermediate values, because autograd handles the
    # backward pass for us.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute loss.
    # loss is a Tensor of shape (1,); loss.item() returns it as a Python scalar.
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())

    # Backward pass.
    # autograd computes the gradient of loss with respect to every Tensor that
    # has requires_grad=True; afterwards w1.grad and w2.grad hold those gradients.
    loss.backward()

    # Update the weights without tracking gradients.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating the weights.
        w1.grad.zero_()
        w2.grad.zero_()

1.5 PyTorch methods

1.5.1 PyTorch: nn

import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),   # h = w_1 * x + b_1
    torch.nn.ReLU(),            # a = max(0, h)
    torch.nn.Linear(H, D_out),  # y_hat = w_2 * a + b_2
)

model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for it in range(1000):
    y_pred = model(x.cuda())

    # compute loss
    loss = loss_fn(y_pred.cuda(), y.cuda())
    print(it, loss.cpu().item())

    model.zero_grad()

    loss.backward()

    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

1.5.2 PyTorch: optim

import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),   # h = w_1 * x + b_1
    torch.nn.ReLU(),            # a = max(0, h)
    torch.nn.Linear(H, D_out),  # y_hat = w_2 * a + b_2
)

model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for it in range(1000):
    y_pred = model(x.cuda())

    # compute loss
    loss = loss_fn(y_pred.cuda(), y.cuda())
    print(it, loss.cpu().item())

    model.zero_grad()
    loss.backward()
    optimizer.step()

1.6 Variable

import torch
from torch.autograd import Variable

tensor = torch.FloatTensor([[1, 2], [3, 4]])
variable = Variable(tensor, requires_grad=True)

t_out = torch.mean(tensor * tensor)
v_out = torch.mean(variable * variable)

v_out.backward()

print(t_out, v_out, variable.grad, sep='\n')
variable_value = variable.data          # the underlying tensor
variable_numpy = variable.data.numpy()
# variable.numpy() would not work here, because a Variable is not a plain tensor
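Note that since PyTorch 0.4 Variable has been merged into Tensor, so the same computation can be written without Variable at all; a minimal sketch:

import torch

t = torch.tensor([[1., 2.], [3., 4.]], requires_grad=True)
out = torch.mean(t * t)
out.backward()
print(out, t.grad, sep='\n')  # t.grad equals 2 * t / 4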

2 NN

2.1 Quickly building a neural network

import torch
import torch.nn.functional as F

# method 1
class Net(torch.nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(n_feature, n_hidden)
        self.predict = torch.nn.Linear(n_hidden, n_output)

    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = self.predict(x)
        return x

net1 = Net(1, 10, 1)

# method 2
net2 = torch.nn.Sequential(
    torch.nn.Linear(1, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 1)
)

print(net1, net2, sep='\n')

Result:

Net (
  (hidden): Linear (1 -> 10)
  (predict): Linear (10 -> 1)
)
Sequential (
  (0): Linear (1 -> 10)
  (1): ReLU ()
  (2): Linear (10 -> 1)
)

2.2 load and save nn

  • Import modules
import torch
from torch.autograd import Variable
import matplotlib.pyplot as plt

torch.manual_seed(1) # reproducible
  • Generate fake data
x = torch.unsqueeze(torch.linspace(-1, 1, 100), dim=1)
# insert a dimension of size one; x data (tensor), shape=(100, 1)
y = x.pow(2) + 0.2*torch.rand(x.size())
# noisy y data (tensor), shape=(100, 1)
x, y = Variable(x, requires_grad=False), Variable(y, requires_grad=False)
  • Save net
def save():
    # save net1
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)
    loss_func = torch.nn.MSELoss()

    for t in range(100):
        prediction = net1(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # plot result
    plt.figure(1, figsize=(10, 3))
    plt.subplot(131)
    plt.title('Net1')
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)

    # 2 ways to save the net
    torch.save(net1, 'net.pkl')                      # save entire net
    torch.save(net1.state_dict(), 'net_params.pkl')  # save only the parameters
  • Reload net
def restore_net():
    # restore entire net1 to net2
    net2 = torch.load('net.pkl')
    prediction = net2(x)

    # plot result
    plt.subplot(132)
    plt.title('Net2')
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
  • Reload net parameters
def restore_params():
    # restore only the parameters in net1 to net3
    net3 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )

    # copy net1's parameters into net3
    net3.load_state_dict(torch.load('net_params.pkl'))
    prediction = net3(x)

    # plot result
    plt.subplot(133)
    plt.title('Net3')
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)
    plt.show()
  • Test
# save net1
save()
# restore entire net (may slow)
restore_net()
# restore only the net parameters
restore_params()

Result:

fig 1-1 save net

2.3 Batch training

import torch
import torch.utils.data as Data

torch.manual_seed(1)

BATCH_SIZE = 5

x = torch.linspace(1, 10, 10)  # this is x data (torch tensor)
y = torch.linspace(10, 1, 10)  # this is y data (torch tensor)

torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(
    dataset=torch_dataset,   # torch TensorDataset format
    batch_size=BATCH_SIZE,   # mini batch size
    shuffle=True,            # shuffle the data order
    num_workers=2,           # subprocesses for loading data
)

for epoch in range(3):  # train on the entire dataset 3 times
    for step, (batch_x, batch_y) in enumerate(loader):
        # train your data...
        print('Epoch: ', epoch, '| Step: ', step, '| batch x: ', batch_x.numpy(), '| batch y: ', batch_y.numpy())

2.4 Optimizer

2.4.1 Optimization methods:

  • Newton's method
  • Least squares method
  • Gradient descent (the method used for neural networks)

2.4.2 Gradient Descent:

  • Cost function (the error equation):

\[ \begin{align*} Cost = (predicted - real)^2 &= (Wx - y)^2 \\ &=(W - o)^2 \end{align*} \]
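To make the update rule concrete, here is a minimal sketch of plain gradient descent on this quadratic cost for a single scalar weight W (the data values and learning rate are illustrative):

x, y = 2.0, 6.0   # one training pair; the true weight is 3
W = 0.0           # initial guess
lr = 0.1          # learning rate

for step in range(50):
    cost = (W * x - y) ** 2      # Cost = (Wx - y)^2
    grad = 2 * (W * x - y) * x   # dCost/dW
    W -= lr * grad               # move against the gradient

print(W)  # converges towards 3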

2.4.3 Code

import torch
import torch.utils.data as Data
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
%matplotlib inline

torch.manual_seed(1)  # reproducible

# hyper parameters
LR = 0.01
BATCH_SIZE = 32
EPOCH = 12

# generate data
x = torch.unsqueeze(torch.linspace(-1, 1, 1000), dim=1)
y = x.pow(2) + 0.1*torch.normal(torch.zeros(*x.size()))
# unsqueeze: returns a new tensor with a dimension of size one inserted at the specified position

# plot dataset
plt.scatter(x.numpy(), y.numpy())
plt.show()

# batch training
torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True, num_workers=2,)

# define the neural network
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.hidden = torch.nn.Linear(1, 20)
        self.predict = torch.nn.Linear(20, 1)

    def forward(self, x):
        x = F.relu(self.hidden(x))  # activation function for hidden layer
        x = self.predict(x)         # linear output
        return x

# one net per optimizer
net_SGD = Net()
net_Momentum = Net()
net_RMSprop = Net()
net_Adam = Net()
nets = [net_SGD, net_Momentum, net_RMSprop, net_Adam]

opt_SGD = torch.optim.SGD(net_SGD.parameters(), lr=LR)
opt_Momentum = torch.optim.SGD(net_Momentum.parameters(), lr=LR, momentum=0.8)
opt_RMSprop = torch.optim.RMSprop(net_RMSprop.parameters(), lr=LR, alpha=0.9)
opt_Adam = torch.optim.Adam(net_Adam.parameters(), lr=LR, betas=(0.9, 0.99))

optimizers = [opt_SGD, opt_Momentum, opt_RMSprop, opt_Adam]

# loss function
loss_func = torch.nn.MSELoss()
losses_his = [[], [], [], []]  # record loss

# training
for epoch in range(EPOCH):
    print('Epoch: ', epoch)
    for step, (batch_x, batch_y) in enumerate(loader):  # for each training step
        b_x = Variable(batch_x)
        b_y = Variable(batch_y)

        for net, opt, l_his in zip(nets, optimizers, losses_his):
            output = net(b_x)              # get output for every net
            loss = loss_func(output, b_y)  # compute loss for every net
            opt.zero_grad()                # clear gradients for next train
            loss.backward()                # backpropagation, compute gradients
            opt.step()                     # apply gradients
            l_his.append(loss.item())      # record the loss

labels = ['SGD', 'Momentum', 'RMSprop', 'Adam']
for i, l_his in enumerate(losses_his):
    plt.plot(l_his, label=labels[i])
plt.legend(loc='best')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.ylim((0, 0.2))
plt.show()

Result:

fig 1-2 Different optimizer

2.5 Activation Function

import torch
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt

x = torch.linspace(-5, 5, 200)
x = Variable(x)
x_np = x.data.numpy()

# four activation function
y_relu = F.relu(x).data.numpy()
y_sigmoid = torch.sigmoid(x).data.numpy()
y_tanh = F.tanh(x).data.numpy()
y_softplus = F.softplus(x).data.numpy()

plt.figure(1, figsize=(8, 6))
plt.subplot(221)
plt.plot(x_np, y_relu, c='red', label='relu')
plt.ylim((-1, 5))
plt.legend(loc='best')

plt.subplot(222)
plt.plot(x_np, y_sigmoid, c='red', label='sigmoid')
plt.ylim((-0.2, 1.2))
plt.legend(loc='best')

plt.subplot(223)
plt.plot(x_np, y_tanh, c='red', label='tanh')
plt.ylim((-1.2, 1.2))
plt.legend(loc='best')

plt.subplot(224)
plt.plot(x_np, y_softplus, c='red', label='softplus')
plt.ylim((-0.2, 6))
plt.legend(loc='best')

plt.show()
fig.4-1 four activation function

\(Sigmoid\) activation function: \[ \sigma(x) = \frac{1}{1+e^{-x}} \] \(tanh\) activation function: \[ \tanh(x) = 2 \sigma(2x) - 1 \] \(ReLU\) activation function: \[ ReLU(x) = \max(0, x) \]

\(Softmax\) activation function: \[ z_i \rightarrow \frac{e^{z_i}}{\sum_{j=1}^{k} e^{z_j}} \]
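The softmax above does not appear in the plot code; a minimal sketch of computing it with PyTorch (the input values are illustrative):

import torch

z = torch.tensor([1.0, 2.0, 3.0])
p = torch.softmax(z, dim=0)  # exponentiate and normalize along dim 0
print(p, p.sum())            # probabilities that sum to 1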

3 Examples for NN

3.1 Numpy

A fully connected ReLU network with one hidden layer and no bias, trained to predict y from x using an L2 loss.

  • $ h = x W_1 $
  • $ a = max(0, h) $
  • $ y_{hat} = a W_2 $
  • $ loss = \sum (y_{hat} - y)^2 $

Goal: map a 1000-dimensional input vector to a 10-dimensional output vector.
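For reference, the backward pass in the code below follows from applying the chain rule to these definitions:

\[ \begin{align*} \frac{\partial loss}{\partial y_{hat}} &= 2(y_{hat} - y) \\ \frac{\partial loss}{\partial W_2} &= a^T \frac{\partial loss}{\partial y_{hat}} \\ \frac{\partial loss}{\partial a} &= \frac{\partial loss}{\partial y_{hat}} W_2^T \\ \frac{\partial loss}{\partial h} &= \frac{\partial loss}{\partial a} \odot \mathbb{1}[h > 0] \\ \frac{\partial loss}{\partial W_1} &= x^T \frac{\partial loss}{\partial h} \end{align*} \]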

import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
# print (x)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.rand(H, D_out)

learning_rate = 1e-6
for it in range(1000):
    # Forward pass: compute predicted y
    h = x.dot(w1)              # (N x D_in) times (D_in x H): the hidden layer maps the 1000-D input to 100-D
    h_relu = np.maximum(h, 0)  # activation function, N x H
    y_pred = h_relu.dot(w2)    # N x D_out

    # Compute loss
    # loss = (y_pred - y) ** 2
    loss = np.square(y_pred - y).sum()
    print(it, loss)

    # Backward pass
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)  # h_relu: N x H, grad_y_pred: N x D_out
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

3.2 Torch

  • Modified from the NumPy code above
import numpy as np
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# print (x)

# Randomly initialize weights
w1 = torch.randn(D_in, H)
w2 = torch.rand(H, D_out)

learning_rate = 1e-6
for it in range(1000):
    # Forward pass: compute predicted y
    h = x.mm(w1)             # (N x D_in) times (D_in x H): the hidden layer maps the 1000-D input to 100-D
    h_relu = h.clamp(min=0)  # activation function, N x H
    y_pred = h_relu.mm(w2)   # N x D_out

    # Compute loss
    # loss = (y_pred - y) ** 2
    loss = (y_pred - y).pow(2).sum().item()
    # tensor.item(): turn a single-element tensor into a Python value
    print(it, loss)

    # Backward pass
    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)  # h_relu: N x H, grad_y_pred: N x D_out
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights of w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

3.3 fizz_buzz demo

FizzBuzz is a simple game. The rules: count upward from 1; when a number is a multiple of 3 say fizz, when it is a multiple of 5 say buzz, when it is a multiple of 15 say fizzbuzz, and otherwise just say the number.

We can first write a small helper that decides whether to return the plain number, fizz, buzz, or fizzbuzz:

import numpy as np
import torch

def fizz_buzz_encode(i):
    if i % 15 == 0: return 3
    elif i % 5 == 0: return 2
    elif i % 3 == 0: return 1
    else: return 0

def fizz_buzz_decode(i, prediction):
    return [str(i), 'fizz', 'buzz', 'fizzbuzz'][prediction]

# for i in range(1, 15):
#     print(fizz_buzz_decode(i, fizz_buzz_encode(i)))

Define the training data for the model and move it to the GPU (if one is available):

# hyper parameters
NUM_DIGITS = 10
NUM_HIDDEN = 100

# def binary_encode(i, num_digits):
#     return np.array([i >> d & 1 for d in range(num_digits)])

def binary_encode(i, num_digits):
    binary = list()
    for i in bin(i)[2:]:
        binary.append(int(i))
    size = len(binary)
    for j in [0] * (num_digits - size):
        binary.insert(0, int(j))
    return np.array(binary)

trX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(101, 2 ** NUM_DIGITS)])
trY = torch.LongTensor([fizz_buzz_encode(i) for i in range(101, 2 ** NUM_DIGITS)])

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

trX = trX.to(device)  # .to() is not in-place, so assign the result back
trY = trY.to(device)

Define the neural network model:

model = torch.nn.Sequential(
    torch.nn.Linear(NUM_DIGITS, NUM_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(NUM_HIDDEN, 4)
)
model = model.to(device)  # keep the model on the same device as the data
  • To make the model learn the FizzBuzz game, we need to define a loss function and an optimization algorithm.
  • The optimizer keeps reducing the loss so that the model achieves as low a loss as possible on this task.
  • A low loss usually means the model performs well; a high loss means it performs poorly.
  • Since FizzBuzz is essentially a classification problem, we use the Cross Entropy Loss.
  • For the optimizer we use Stochastic Gradient Descent.
# loss function
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 5e-2)

Train the model:

BATCH_SIZE = 128
for epoch in range(10000):
    for start in range(0, len(trX), BATCH_SIZE):
        end = start + BATCH_SIZE
        batchX = trX[start:end]
        batchY = trY[start:end]

        y_pred = model(batchX)
        loss = loss_fn(y_pred, batchY)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss = loss_fn(model(trX), trY).item()
    print('Epoch:', epoch, 'Loss:', loss, sep='\t')

Finally, use the trained model to play FizzBuzz on the numbers 1-100:

testX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(1, 101)]).to(device)
with torch.no_grad():
    testY = model(testX)
predictions = zip(range(1, 101), list(testY.max(1)[1].data.tolist()))

# Output predictions
print([fizz_buzz_decode(i, x) for (i, x) in predictions])

# Count how many of the 100 predictions are correct
print(np.sum(testY.max(1)[1].cpu().numpy() == np.array([fizz_buzz_encode(i) for i in range(1, 101)])))

3.3.1 Full code

import numpy as np
import torch

def fizz_buzz_encode(i):
    if i % 15 == 0: return 3
    elif i % 5 == 0: return 2
    elif i % 3 == 0: return 1
    else: return 0

def fizz_buzz_decode(i, prediction):
    return [str(i), 'fizz', 'buzz', 'fizzbuzz'][prediction]

# for i in range(1, 15):
#     print(fizz_buzz_decode(i, fizz_buzz_encode(i)))

# hyper parameters
NUM_DIGITS = 10
NUM_HIDDEN = 100

# def binary_encode(i, num_digits):
#     return np.array([i >> d & 1 for d in range(num_digits)])

def binary_encode(i, num_digits):
    binary = list()
    for i in bin(i)[2:]:
        binary.append(int(i))
    size = len(binary)
    for j in [0] * (num_digits - size):
        binary.insert(0, int(j))
    # bin_array = np.array(binary) # array
    return np.array(binary)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

trX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(101, 2 ** NUM_DIGITS)])
trY = torch.LongTensor([fizz_buzz_encode(i) for i in range(101, 2 ** NUM_DIGITS)])
trX = trX.to(device)  # .to() is not in-place, so assign the result back
trY = trY.to(device)

model = torch.nn.Sequential(
    torch.nn.Linear(NUM_DIGITS, NUM_HIDDEN),
    torch.nn.ReLU(),
    torch.nn.Linear(NUM_HIDDEN, 4)
)
model = model.to(device)  # keep the model on the same device as the data

# loss function and optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=5e-2)

BATCH_SIZE = 128
for epoch in range(10000):
    for start in range(0, len(trX), BATCH_SIZE):
        end = start + BATCH_SIZE
        batchX = trX[start:end]
        batchY = trY[start:end]

        y_pred = model(batchX)
        loss = loss_fn(y_pred, batchY)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss = loss_fn(model(trX), trY).item()
    print('Epoch:', epoch, 'Loss:', loss, sep='\t')

testX = torch.Tensor([binary_encode(i, NUM_DIGITS) for i in range(1, 101)]).to(device)
with torch.no_grad():
    testY = model(testX)
predictions = zip(range(1, 101), list(testY.max(1)[1].data.tolist()))

print([fizz_buzz_decode(i, x) for (i, x) in predictions])
print(np.sum(testY.max(1)[1].cpu().numpy() == np.array([fizz_buzz_encode(i) for i in range(1, 101)])))