I am building a multi-class classifier in Python using BERT and TensorFlow 2.*, but I am running into two errors in my current code that I have been wrestling with all day. The first error I get is:
TypeError: Could not build a TypeSpec for [[<encoded text>], [...]] with type list
where [<encoded text>] looks like:
[101, 1997, 5142, 2000, 1996, 5438, 10994, 2003, 3465, 1997, 5438, 2004, 2092, 8521, 2011, 1011, 4031, 2030, 2512, 1011, 3151, 14172, 2029, 2097, 4254, 14266, 1998, 2740, 102]
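(For context, these are the WordPiece token IDs my tokenizer produces; 101 and 102 are BERT's [CLS] and [SEP] special tokens. A minimal reproduction, with a made-up sentence in place of mine:)

from transformers import BertTokenizer

tz = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
ids = tz.encode("some example sentence", add_special_tokens=True)
print(ids)  # starts with 101 ([CLS]) and ends with 102 ([SEP])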
The second error I get is:
During handling of the above exception, another exception occurred:
ValueError: Can't convert non-rectangular Python sequence to Tensor.
For the second error, I am considering using ragged tensors, so any help with that would be appreciated. However, since I have already spent so long on it, I am mainly focused on figuring out the first error.
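(To make that concrete, here is a minimal sketch of the ragged-tensor idea I have in mind; the two encodings are made-up stand-ins for my variable-length tokenizer output, and I have not verified that TFBertForSequenceClassification accepts ragged inputs:)

import tensorflow as tf

# Made-up variable-length encodings, like my un-padded tokenizer output
input_ids_list = [[101, 2023, 102], [101, 2023, 2003, 2936, 102]]
label_list = [0, 2]

# tf.ragged.constant accepts non-rectangular Python sequences directly,
# and from_tensor_slices can slice a RaggedTensor (TF 2.1+)
ragged_ids = tf.ragged.constant(input_ids_list)
ds = tf.data.Dataset.from_tensor_slices((ragged_ids, label_list))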
The error comes from my encode_examples function. My script is:
import csv
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
# Hyperparameters
BATCH_SIZE = 1
LEARNING_RATE = 2e-5  # recommended learning rates for Adam: 5e-5, 3e-5, 2e-5
NUMBER_OF_EPOCHS = 1
tf.compat.v1.enable_eager_execution()
def get_input_data(file_name, is_labelled=False):
    """
    Returns file data
    :param file_name: name of file to be read
    :param is_labelled: True if the file contains labelled data
    :return: input_list: list (or dict, if labelled) containing file data
    """
    input_list = []
    if is_labelled:
        input_list = {}
    with open(file_name, encoding='utf-8') as f:
        if is_labelled:
            file_list = list(f)
            label = file_list[0].rstrip()
            sentences = file_list[1:]
            for sentence in sentences:
                input_list[sentence] = label
        else:
            rows = iter(csv.reader(f))
            next(rows)  # skip header row
            for row in rows:
                if len(row) < 2:
                    continue
                input_list.append([e.replace("_", " ") for e in row])
    return input_list
def combine_files(*args) -> dict:
    """
    Combines labelled files and returns their data
    :return: combined_data: dict mapping each sentence to its label
    """
    combined_data = {}
    for file in args:
        combined_data.update(get_input_data(file, True))
    return combined_data
def encode_data(text_dict: dict) -> dict:
    """
    Converts labels into integers
    :param text_dict: mapping of text to label
    :return: mapping of text to integer label
    """
    for key in text_dict.keys():
        val = text_dict[key]
        if val == 'negative':
            text_dict[key] = 0
        elif val == 'neutral':
            text_dict[key] = 1
        else:
            text_dict[key] = 2
    return text_dict
def split_data(data: dict) -> (list, list, list, list):
    """
    Splits data into train/test sets
    :param data: maps text to integer label
    :return: X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(list(data.keys()),
                                                        list(data.values()),
                                                        test_size=0.33,
                                                        random_state=42)
    return X_train, X_test, y_train, y_test
def create_df(source: list, target: list) -> pd.DataFrame:
    """
    Creates a pandas DataFrame from the data as follows:
          Text          Sentiment
    0     <sentence>    <sentiment>
    ...   ...           ...
    N     <sentence>    <sentiment>
    """
    return pd.DataFrame(data=[[txt, sent] for txt, sent in zip(source, target)],
                        columns=['Text', 'Sentiment'])
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }, label
def convert_example_to_feature(txt):
    return tz.encode_plus(
        txt,
        add_special_tokens=True,  # add [CLS] and [SEP]
        max_length=160,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding=True,
    )
def encode_examples(ds, limit=-1):
    # prepare lists, so that we can build up the final TensorFlow dataset from slices
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if limit > 0:
        ds = ds.take(limit)
    for txt, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(str(txt[0].decode("UTF-8")))
        print(bert_input, "\n")
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)
if __name__ == "__main__":
    data = combine_files("neg_sentiment.txt", "neu_sentiment.txt", "pos_sentiment.txt")
    encoded = encode_data(data)
    X_train, X_test, y_train, y_test = split_data(encoded)
    train = create_df(X_train, y_train)
    test = create_df(X_test, y_test)
    # print(train["Text"].head())
    tz = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    train_target, test_target = train.pop('Sentiment'), test.pop('Sentiment')
    ds_train = tf.data.Dataset.from_tensor_slices((train.values, train_target.values))
    ds_test = tf.data.Dataset.from_tensor_slices((test.values, test_target.values))
    ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(BATCH_SIZE)
    ds_test_encoded = encode_examples(ds_test).batch(BATCH_SIZE)
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
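(One thing I suspect, written up as a sketch rather than a verified fix: with padding=True and a single sentence per call, encode_plus pads to the longest sequence in that call, i.e. not at all, so my lists end up with different lengths and from_tensor_slices fails. If I read the transformers docs right, padding='max_length' (transformers 3.x+; older versions used pad_to_max_length=True) together with max_length=160 should make every list the same length:)

def convert_example_to_feature_padded(txt):
    # Variant of convert_example_to_feature above: pads every example to a
    # fixed length of 160 so the lists fed to from_tensor_slices are rectangular
    return tz.encode_plus(
        txt,
        add_special_tokens=True,
        max_length=160,
        truncation=True,
        padding='max_length',
        return_token_type_ids=True,
        return_attention_mask=True,
    )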