PyTorch LSTM not using the hidden layer
I'm using PyTorch's LSTM API and running into a problem. I'm using the LSTM to build a simple AI model. The model's task is: if the previous number is less than the current number, output 1.

For example, for the array [0.7, 0.3, 0.9, 0.99], the expected output is [1.0, 0.0, 1.0, 1.0]. The first output should always be 1.0.
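Written out as plain Python, the labeling rule I want the model to learn is roughly:
values = [0.7, 0.3, 0.9, 0.99]
# first label is always 1.0; afterwards the label is 1.0 iff the previous value is smaller
expected = [1.0] + [1.0 if values[i - 1] < values[i] else 0.0 for i in range(1, len(values))]
print(expected)  # [1.0, 0.0, 1.0, 1.0]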
To solve this, I designed the following network:
# network.py
import torch

N_INPUT = 1
N_STACKS = 1
N_HIDDEN = 3
LR = 0.001

class Network(torch.nn.Module):
    # params: self
    def __init__(self):
        super(Network, self).__init__()
        self.lstm = torch.nn.LSTM(
            input_size=N_INPUT,
            hidden_size=N_HIDDEN,
            num_layers=N_STACKS,
        )
        self.linear = torch.nn.Linear(N_HIDDEN, 1)
        self.relu = torch.nn.ReLU()
        self.optim = torch.optim.Adam(self.parameters(), lr=LR)
        self.loss = torch.nn.MSELoss()

    # params: self, predicted, expecteds
    def backprop(self, xs, es):
        # perform backprop
        self.optim.zero_grad()
        l = self.loss(xs, torch.tensor(es))
        l.backward()
        self.optim.step()
        return l

    # params: self, data (as a python array)
    def forward(self, dat):
        out, _ = self.lstm(torch.tensor(dat))
        out = self.relu(out)
        out = self.linear(out)
        return out
I call this network from the following file:
# main.py
import network
import numpy as np

# create a new network
n: network.Network = network.Network()

# create some data
def rand_array():
    # a bunch of random numbers
    a = [[np.random.uniform(0, 1)] for i in range(1000)]
    # now, our expected value is 0 if the previous number is greater, and 1 else
    expected = [0.0 if a[i - 1][0] > a[i][0] else 1.0 for i in range(len(a))]
    expected[0] = 1.0  # make the first element always just 1.0
    return [a, expected]

# a bunch of random arrays
data = [rand_array() for i in range(1000)]

# 100 epochs
for i in range(100):
    for i in data:
        pred = n(i[0])
        loss = n.backprop(pred, i[1])
        print("Loss: {:.5f}".format(loss))
Now, when I run this program, the loss sits at around 0.25 and stops changing once it gets there. My guess is that the model is just predicting the average of 0 and 1 (i.e. 0.5) for every input, which would indeed give an MSE of about 0.25 since the labels are roughly half 0s and half 1s (0.5² = 0.25).
This makes me think the model can't see the earlier data: the inputs are just random numbers (though the expected outputs are derived from those random numbers), and the model doesn't seem to remember what came before.
What is the problem I'm running into?
1 Answer
I don't see a problem with the hidden state. The setup you're using is a bit unusual, though, and it may be hurting learning.
First, we need a proper dataset and dataloader.
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn.functional as F

class RandDataset(Dataset):
    def __init__(self, sequence_length):
        self.sequence_length = sequence_length

    def __len__(self):
        return 10000  # data is generated so length is arbitrary

    def __getitem__(self, idx):
        sequence = torch.rand(self.sequence_length)
        labels = torch.ones_like(sequence)
        labels[1:] = sequence[:-1] < sequence[1:]
        sequence = sequence[None, :, None]  # shape (1, sequence_length, 1)
        labels = labels[None, :]            # shape (1, sequence_length)
        return sequence, labels

def collate_fn(batch):
    sequences = torch.cat([i[0] for i in batch])
    labels = torch.cat([i[1] for i in batch])
    return sequences, labels

dataset = RandDataset(1000)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)
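A quick shape check on one batch (optional, just to confirm the layout the model will expect):
seqs, labs = next(iter(dataloader))
print(seqs.shape)  # torch.Size([32, 1000, 1])
print(labs.shape)  # torch.Size([32, 1000])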
Next, the model. Add an input projection layer and drop the unnecessary relu activation after the LSTM module.
class LSTMModel(nn.Module):
    def __init__(self, d_in, d_proj, d_hidden, n_layers):
        super().__init__()
        self.input_layer = nn.Linear(d_in, d_proj)
        self.lstm = nn.LSTM(input_size=d_proj, hidden_size=d_hidden, num_layers=n_layers, batch_first=True)
        self.output_layer = nn.Linear(d_hidden, 1)

    def forward(self, x):
        x = self.input_layer(x)
        x = F.relu(x)
        x, _ = self.lstm(x)
        x = self.output_layer(x)
        return x
model = LSTMModel(1, 32, 64, 2)
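A forward pass on dummy data (optional sketch) confirms the model maps (batch, seq_len, 1) inputs to (batch, seq_len, 1) logits:
dummy = torch.rand(2, 10, 1)   # 2 sequences of length 10, 1 feature each
print(model(dummy).shape)      # expected: torch.Size([2, 10, 1])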
Now for training. We are predicting a binary outcome, so we should use BCEWithLogitsLoss rather than MSELoss; using MSE for a classification target doesn't make sense.
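Note that BCEWithLogitsLoss applies the sigmoid internally, which is why the model above returns raw logits with no final activation. A minimal sketch of the equivalence (my own illustration, assuming the imports above):
logits = torch.randn(4)
targets = torch.tensor([1.0, 0.0, 1.0, 0.0])
print(nn.BCEWithLogitsLoss()(logits, targets))                 # sigmoid + BCE in one numerically stable op
print(F.binary_cross_entropy(torch.sigmoid(logits), targets))  # nearly identical value
With that in mind, the training loop: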
device = 'cuda'
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_function = nn.BCEWithLogitsLoss()
epochs = 1

model.to(device)

for epoch in range(epochs):
    for i, batch in enumerate(dataloader):
        seqs, labs = batch
        seqs = seqs.to(device)
        labs = labs.to(device)

        preds = model(seqs)
        loss = loss_function(preds.reshape(-1), labs.reshape(-1))

        if i % 10 == 0:
            print(f'{loss.item():.3f}')

        opt.zero_grad()
        loss.backward()
        opt.step()
After training, check the performance:
model.eval()

seq, lab = dataset[0]
# no gradient tracking needed for evaluation
with torch.no_grad():
    pred = model(seq.to(device)).cpu()
pred = (torch.sigmoid(pred) > 0.5).float().squeeze()
lab = lab.squeeze()

acc = (pred == lab).float().mean()
print(acc)
> tensor(0.9990)