How to fix TypeError: Could not build a TypeSpec for <encoded text> with type list

Published 2024-04-23 08:58:15


I'm building a multi-class classifier in Python with BERT and TensorFlow 2.*, and I've run into two errors in my current code that I've been wrestling with all day. The first one is:

TypeError: Could not build a TypeSpec for [[<encoded text>], [...]] with type list

where [<encoded text>] looks like:

[101, 1997, 5142, 2000, 1996, 5438, 10994, 2003, 3465, 1997, 5438, 2004, 2092, 8521, 2011, 1011, 4031, 2030, 2512, 1011, 3151, 14172, 2029, 2097, 4254, 14266, 1998, 2740, 102]
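
Those numbers look like raw WordPiece IDs straight out of encode_plus (101 and 102 are [CLS] and [SEP] in bert-base-uncased). A quick check that decodes them back to text, assuming the same tokenizer as in the script below:

from transformers import BertTokenizer

tz = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
ids = [101, 1997, 5142, 2000, 1996, 5438, 10994, 2003, 3465, 1997, 5438,
       2004, 2092, 8521, 2011, 1011, 4031, 2030, 2512, 1011, 3151, 14172,
       2029, 2097, 4254, 14266, 1998, 2740, 102]
print(tz.decode(ids))  # prints the sentence wrapped in [CLS] ... [SEP]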

The second error I get is:

During handling of the above exception, another exception occurred:
ValueError: Can't convert non-rectangular Python sequence to Tensor.
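
Both messages can be reproduced in isolation by handing from_tensor_slices nested Python lists whose rows have different lengths (a minimal sketch, assuming that is indeed the cause; the values are toy data, not from the script):

import tensorflow as tf

# Two rows of different lengths: TF cannot infer a rectangular shape,
# so it fails to build a TypeSpec for the nested list.
ids = [[101, 2023, 102], [101, 2023, 2003, 102]]
labels = [[0], [1]]
ds = tf.data.Dataset.from_tensor_slices((ids, labels))  # raises both errors above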

For the second error I'm considering ragged tensors, so any help with those would be appreciated (see the sketch just below). Since the first error is the one that has eaten my whole day, though, that's mainly the one I'm trying to pin down.
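
A minimal sketch of the ragged-tensor route: tf.ragged.constant accepts nested lists of varying length, and from_tensor_slices can slice a RaggedTensor row by row (same toy rows as above, so the values are illustrative only):

import tensorflow as tf

ids = tf.ragged.constant([[101, 2023, 102], [101, 2023, 2003, 102]])
labels = tf.constant([0, 1])
ds = tf.data.Dataset.from_tensor_slices((ids, labels))
for x, y in ds:
    print(x, y)  # each x is a variable-length 1-D tensor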

The error comes from my encode_examples function. My script is:

import csv
import pandas as pd
import tensorflow_datasets as tfds
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

# This is a path to an uncased (all lowercase) version of BERT
# (unused below; the tokenizer and model are loaded via transformers instead)
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Hyperparameters
BATCH_SIZE = 1
LEARNING_RATE = 2e-5  # recommended learning rates for Adam: 5e-5, 3e-5, 2e-5
NUMBER_OF_EPOCHS = 1

tf.compat.v1.enable_eager_execution()  # eager execution is already the default in TF 2.x


def get_input_data(file_name, is_labelled=False):
    """
    Returns file data

    :param file_name: name of file to be read
    :param is_labelled: True if the file's first line is a label for the sentences that follow
    :return: input_list: list of CSV rows, or a dict mapping sentence to label when is_labelled
    """
    input_list = []
    if is_labelled:
        input_list = {}
    with open(file_name, encoding='utf-8') as f:
        if is_labelled:
            file_list = list(f)
            label = file_list[0].rstrip()
            sentences = file_list[1:]
            for sentence in sentences:
                input_list[sentence] = label
        else:
            rows = iter(csv.reader(f))
            next(rows)
            for row in rows:
                if len(row) < 2:
                    continue
                input_list.append([e.replace("_", " ") for e in row])
    return input_list


def combine_files(*args) -> dict:
    """
    Combines labelled files into a single mapping of their data
    :return: combined_data: dict mapping each sentence to its label
    """
    combined_data = {}
    for file in args:
        combined_data.update(get_input_data(file, True))
    return combined_data


def encode_data(text_dict: dict) -> dict:
    """
    Converts labels into integers
    :param text_dict: mapping of text to label
    :return: mapping of text to integer label
    """
    for key in text_dict.keys():
        val = text_dict[key]
        if val == 'negative':
            text_dict[key] = 0
        elif val == 'neutral':
            text_dict[key] = 1
        else:
            text_dict[key] = 2
    return text_dict


def split_data(data: dict) -> (list, list, list, list):
    """
    Splits data into train/test
    :param data: dict mapping text to its integer label
    :return: X_train, X_test, y_train, y_test
    """
    X_train, X_test, y_train, y_test = train_test_split(list(data.keys()),
                                                        list(data.values()),
                                                        test_size=0.33,
                                                        random_state=42)
    return X_train, X_test, y_train, y_test


def create_df(source: list, target: list) -> pd.DataFrame:
    """
    Creates pandas DataFrame from data as follows:

           Text            Sentiment
    0      <sentence>      <sentiment>
    ...    ...             ...
    N      <sentence>      <sentiment>
    """
    return pd.DataFrame(data=[[txt, sent] for txt, sent in zip(source, target)],
                        columns=['Text', 'Sentiment'])


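# Shapes each element into the (features, label) pair Keras expects;
# the dict keys match the input names of TFBertForSequenceClassification.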
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
               "input_ids": input_ids,
               "token_type_ids": token_type_ids,
               "attention_mask": attention_masks,
           }, label


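# encode_plus returns a dict holding 'input_ids', 'token_type_ids' and
# 'attention_mask'; tz is the BertTokenizer created in __main__ below.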
def convert_example_to_feature(txt):
    return tz.encode_plus(
        txt,
        add_special_tokens=True,
        max_length=160,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding=True,  # NOTE: for a single sequence this pads to the longest in the call, i.e. not at all
    )


def encode_examples(ds, limit=-1):
    # prepare lists so that we can build up the final TensorFlow dataset from slices
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if limit > 0:
        ds = ds.take(limit)

    for txt, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(str(txt[0].decode("UTF-8")))
        print(bert_input, "\n")

        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)


if __name__ == "__main__":
    data = combine_files("neg_sentiment.txt", "neu_sentiment.txt", "pos_sentiment.txt")
    encoded = encode_data(data)
    X_train, X_test, y_train, y_test = split_data(encoded)
    train = create_df(X_train, y_train)
    test = create_df(X_test, y_test)
    # print(train["Text"].head())

    tz = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    train_target, test_target = train.pop('Sentiment'), test.pop('Sentiment')
    ds_train = tf.data.Dataset.from_tensor_slices((train.values, train_target.values))
    ds_test = tf.data.Dataset.from_tensor_slices((test.values, test_target.values))

    ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(BATCH_SIZE)
    ds_test_encoded = encode_examples(ds_test).batch(BATCH_SIZE)

    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
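
A sketch of the first fix worth testing, assuming the unequal sequence lengths are the culprit: padding='max_length' forces every example to exactly max_length tokens, so the lists handed to from_tensor_slices become rectangular:

def convert_example_to_feature(txt):
    return tz.encode_plus(
        txt,
        add_special_tokens=True,
        max_length=160,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        padding='max_length',  # pad to exactly max_length instead of "longest in call"
    )

With every row padded to 160 tokens, both the TypeSpec error and the non-rectangular ValueError should disappear.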
