How do I add GloVe word embeddings to a Keras POS tagger?

Posted 2024-04-19 12:25:51


I have been trying to work out how to train a POS tagger with Keras; specifically, I want it to use an LSTM architecture together with pre-trained word embeddings, namely GloVe. I drew on two blog posts: one uses an LSTM with pre-trained embeddings for POS tagging, the other uses an LSTM with word embeddings for text classification:

https://nlpforhackers.io/lstm-pos-tagger-keras/

https://nlpforhackers.io/keras-intro/

The script below "works" in the sense that it throws no errors; however, it massively over-predicts the padding cell and under-predicts every other tag. (When following the POS blog verbatim, accuracy was around 99%.) I don't understand why adding word embeddings would hurt performance so badly.

Data preprocessing:

import nltk
tagged_sentences = nltk.corpus.treebank.tagged_sents()

import numpy as np
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

from sklearn.model_selection import train_test_split

(train_sentences, test_sentences, 
 train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

def assemble_text(array):
    return ' '.join(array)

train_sentences = [assemble_text(arr) for arr in train_sentences]
test_sentences = [assemble_text(arr) for arr in test_sentences]

tags = set()
for ts in train_tags:
    for t in ts:
        tags.add(t)

tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

train_tags_y = []
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

test_tags_y= []    
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, 
                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
X_train_onehot = vectorizer.fit_transform(train_sentences)
word2idx = {word: idx for idx, word in enumerate(vectorizer.get_feature_names())}

tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()

def to_sequence(tokenizer, preprocessor, index, text):
    words = tokenizer(preprocessor(text))
    indexes = [index[word] for word in words if word in index]
    return indexes

X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in train_sentences]
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in test_sentences]
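A quick diagnostic worth adding at this point (my own check, not from either tutorial): to_sequence silently drops any word missing from word2idx (min_df and max_features prune the vocabulary), while the tag lists keep every token, so a word sequence can end up shorter than its tag sequence:

# Count training pairs whose word and tag sequences no longer line up
# because out-of-vocabulary words were dropped by to_sequence.
unaligned = sum(1 for seq, tag_seq in zip(X_train_sequences, train_tags_y)
                if len(seq) != len(tag_seq))
print(unaligned, "of", len(X_train_sequences), "training pairs are misaligned")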

# Compute the max length of a text
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)

from tensorflow.keras.preprocessing.sequence import pad_sequences
N_FEATURES = len(vectorizer.get_feature_names())

X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))
cat_test_tags_y = to_categorical(test_tags_y, len(tag2index))
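Because everything is post-padded to MAX_SEQ_LENGHT, most target positions end up as '-PAD-'. A quick check (my own addition, using the padded arrays above) quantifies that imbalance:

# Fraction of target positions that carry the padding class; a model can
# score high plain accuracy just by predicting '-PAD-' everywhere.
pad_fraction = np.mean(train_tags_y == tag2index['-PAD-'])
print("padding fraction:", round(float(pad_fraction), 3))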

Importing the word vectors:

GLOVE_PATH = '/Users/jdmoore7/Downloads/glove.6B/glove.6B.50d.txt'
GLOVE_VECTOR_LENGHT = 50

def read_glove_vectors(path, lenght):
    embeddings = {}
    with open(path) as glove_f:
        for line in glove_f:
            chunks = line.split()
            assert len(chunks) == lenght + 1
            embeddings[chunks[0]] = np.array(chunks[1:], dtype='float32')
    return embeddings

GLOVE_INDEX = read_glove_vectors(GLOVE_PATH, GLOVE_VECTOR_LENGHT)

# Init the embeddings layer with GloVe embeddings
embeddings_index = np.zeros((len(vectorizer.get_feature_names()) + 1, GLOVE_VECTOR_LENGHT))
for word, idx in word2idx.items():
    try:
        embedding = GLOVE_INDEX[word]
        embeddings_index[idx+1] = embedding
    except KeyError:
        pass  # word not in GloVe; leave its row as zeros
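Before wiring the matrix into the model, it is worth checking how much of the vocabulary GloVe actually covers (a quick sanity check of my own; words that are absent keep an all-zero row):

# Count vocabulary words that received a pre-trained vector.
in_glove = sum(1 for word in word2idx if word in GLOVE_INDEX)
print(in_glove, "of", len(word2idx), "vocabulary words found in GloVe")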

Model and accuracy metric:


from tensorflow.keras import backend as K

def ignore_class_accuracy(to_ignore=0):
    # Accuracy computed only over positions whose *predicted* class is not
    # `to_ignore`; the denominator is clamped to at least 1.
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy
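To make the metric concrete, here is a plain NumPy analogue on toy data (my own illustration): positions the model *predicts* as class 0 are dropped from the denominator, and because the Keras version clamps the denominator to at least 1, a model that predicts padding everywhere scores exactly 0.

y_true_cls = np.array([3, 5, 0, 0])  # true classes; 0 is '-PAD-'
y_pred_cls = np.array([3, 2, 0, 0])  # predicted classes
mask = y_pred_cls != 0               # keep positions not predicted as padding
print((y_true_cls[mask] == y_pred_cls[mask]).mean())  # 0.5: one match in two kept positions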

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Dropout    
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(InputLayer(input_shape=(MAX_SEQ_LENGHT, )))
model.add(Embedding(len(vectorizer.get_feature_names()) + 1,
                    GLOVE_VECTOR_LENGHT,  # Embedding size
                    weights=[embeddings_index],
                    input_length=MAX_SEQ_LENGHT,
                    trainable=False))

model.add(Bidirectional(LSTM(256, activation='relu', return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy',ignore_class_accuracy(0)])

model.fit(X_train_sequences, cat_train_tags_y, 
          epochs=40, batch_size=128, verbose=1, 
          validation_data=(X_test_sequences, cat_test_tags_y))
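After fitting, a quick look at which tags the model actually emits on the test set (my own diagnostic, reusing the objects above) makes the "over-predicts padding" symptom concrete:

# Tally the model's predicted tags across all test positions.
from collections import Counter
idx2tag = {i: t for t, i in tag2index.items()}
pred_ids = model.predict(X_test_sequences).argmax(axis=-1).ravel()
print(Counter(idx2tag[i] for i in pred_ids).most_common(5))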

def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
        token_sequences.append(token_sequence)
    return token_sequences

import string
def pipe(text):
    words = ''.join([char.lower() for char in text if char not in string.punctuation]).split(' ') 
    arr = [to_sequence(tokenize, preprocess, word2idx, text) ]
    arr = pad_sequences(arr, maxlen=MAX_SEQ_LENGHT, padding='post')
    pred = model.predict(arr)
    values = logits_to_tokens(pred,
            {i: t for t, i in tag2index.items()})[0]
    return [(w,t) for w,t in zip(words,values)]

pipe('the walk down the hill')
>>>
[('the', '-PAD-'),
 ('walk', '-PAD-'),
 ('down', '-PAD-'),
 ('the', '-PAD-'),
 ('hill', '-PAD-')]

The model fits with an accuracy of 0.00%. So I can only conclude that I'm somehow using the word embeddings incorrectly; the question is, is my model architecture flawed? Is the way I'm handling the word embeddings themselves flawed? Or is it something else?

