我正在imdb
dataset上训练一个基于BERT的模型。但是,我无法迭代Pytorch数据加载器
这是完全可复制的代码。培训文件为上述csv
import transformers
from sklearn import model_selection
import torch
import pandas as pd
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased", do_lower_case=True)
max_len = 512
train_batch_size = 8
"""
This class takes reviews and targets as arguments
- Split the reviews and tokenizes
"""
class BERTDataset:
def __init__(self, review, target):
self.review = review
self.target = target
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.review)
def __getitem__(self, item):
review = str(self.review[item])
review = " ".join(review.split())
tokenized_inputs = self.tokenizer.encode_plus(
review,
None,
add_special_tokens=True,
max_length=self.max_len,
padding=True,
truncation=True
)
ids = tokenized_inputs["input_ids"]
mask = tokenized_inputs["attention_mask"]
token_type_ids = tokenized_inputs["token_type_ids"]
return {
"ids": torch.tensor(ids, dtype=torch.long),
"mask": torch.tensor(mask, dtype=torch.long),
"token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
"targets": torch.tensor(self.target[item], dtype=torch.float),
}
dfx = pd.read_csv(training_file).fillna("none")
dfx['sentiment'] = dfx['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df_train, df_valid = model_selection.train_test_split(
dfx,
test_size=0.1,
random_state=42,
stratify=dfx['sentiment'].values
)
# reset indices
df_train = df_train.reset_index(drop=True)
# get ids, tokens, masks and targets
train_dataset = BERTDataset(review=df_train['review'], target=df_train['sentiment'])
# load into pytorch dataset object
# DataLoader inputs tensor dataset of Inputs and targets
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size=train_batch_size, num_workers=0)
# Iterating to the Data loader
train_iter = iter(train_data_loader)
print(type(train_iter))
review, labels = train_iter.next()
出现以下错误
RuntimeError Traceback (most recent call last)
<ipython-input-19-c99d0829d5d9> in <module>()
2 print(type(train_iter))
3
----> 4 images, labels = train_iter.next()
5 frames
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
53 storage = elem.storage()._new_shared(numel)
54 out = elem.new(storage)
---> 55 return torch.stack(batch, 0, out=out)
56 elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
57 and elem_type.__name__ != 'string_':
RuntimeError: stack expects each tensor to be equal size, but got [486] at entry 0 and [211] at entry 1
数据加载器中的num_workers=0修复了内存问题。有人能帮我弄清楚我遗漏了什么吗
目前没有回答
相关问题 更多 >
编程相关推荐