我一直在尝试研究如何使用Keras来训练POS(词性)标注器;具体来说,我希望它使用LSTM架构,并使用词嵌入,即GloVe。我从两篇博客中得到了灵感:一篇是使用带有预训练嵌入的LSTM来执行POS标注;另一篇是使用LSTM和词嵌入对文本进行分类。
https://nlpforhackers.io/lstm-pos-tagger-keras/
https://nlpforhackers.io/keras-intro/
下面的脚本“能运行”,意思是不会触发错误;但是,它会过多地预测“填充”(-PAD-)标记,而过少地预测其他标记。(完全按照POS博客的做法时,准确率约为99%。)我不明白为什么加入词嵌入会如此严重地影响性能。
数据预处理:
import nltk
import numpy as np
from sklearn.model_selection import train_test_split

# Tagged sentences of the NLTK treebank corpus; each sentence is unpacked
# below as a sequence of (word, tag) pairs.
# NOTE(review): presumably requires the 'treebank' corpus to be downloaded
# via nltk.download beforehand -- confirm for your environment.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Split every tagged sentence into two parallel arrays: its words and its tags.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

# Hold out 20% of the sentences (together with their tags) for evaluation.
(train_sentences, test_sentences,
 train_tags, test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)
def assemble_text(array):
    """Join a sequence of word strings into one space-separated string.

    Args:
        array: Iterable of strings (a list or a NumPy array of words).

    Returns:
        The words joined by single spaces; an empty string for empty input.
    """
    return ' '.join(array)
# Re-join each token array into one space-separated string, which is the
# input that the CountVectorizer below is fitted on.
train_sentences = [assemble_text(arr) for arr in train_sentences]
test_sentences = [assemble_text(arr) for arr in test_sentences]

# The tag inventory is collected from the training split only.
tags = {t for ts in train_tags for t in ts}

# Sort before numbering so tag ids are reproducible across runs (set
# iteration order is not). Real tags start at 1; id 0 is reserved below.
tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

train_tags_y = [[tag2index[t] for t in s] for s in train_tags]
# NOTE(review): a tag that occurs only in the test split is missing from
# tag2index and raises KeyError here.
test_tags_y = [[tag2index[t] for t in s] for s in test_tags]

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True,
                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)
X_train_onehot = vectorizer.fit_transform(train_sentences)

# vocabulary_ maps each term to its feature (column) index, i.e. the same
# mapping as enumerating the feature names, without relying on
# get_feature_names(), which newer scikit-learn releases no longer provide.
word2idx = {word: int(idx) for word, idx in vectorizer.vocabulary_.items()}

tokenize = vectorizer.build_tokenizer()
preprocess = vectorizer.build_preprocessor()
def to_sequence(tokenizer, preprocessor, index, text, offset=1):
    """Convert raw text into a list of integer token ids.

    The text is preprocessed, tokenized, and every token found in ``index``
    is replaced by ``index[token] + offset``. Tokens missing from ``index``
    are dropped.

    The default ``offset`` of 1 keeps id 0 free for padding: ``pad_sequences``
    fills with 0, and the embedding matrix in this script stores the vector
    of vocabulary entry ``idx`` in row ``idx + 1``. Without the shift the
    first vocabulary word is indistinguishable from padding and every word
    looks up its neighbour's embedding. Pass ``offset=0`` to get the raw
    values of ``index``.

    Args:
        tokenizer: Callable that splits a string into a list of tokens.
        preprocessor: Callable applied to ``text`` before tokenizing.
        index: Mapping from token to integer index.
        text: The raw input string.
        offset: Value added to every looked-up index.

    Returns:
        List of integer ids, one per token that is present in ``index``.
    """
    words = tokenizer(preprocessor(text))
    return [index[word] + offset for word in words if word in index]
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE(review): to_sequence re-tokenizes each sentence with the vectorizer's
# tokenizer and drops every word that is not in word2idx, whereas
# train_tags_y / test_tags_y hold one id per original token. The word ids
# and the tag ids of a sentence are therefore not guaranteed to line up
# position by position -- verify before trusting per-token accuracy.
X_train_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in train_sentences]
X_test_sequences = [to_sequence(tokenize, preprocess, word2idx, x) for x in test_sentences]

# Compute the max length of a text (measured on the filtered word sequences)
MAX_SEQ_LENGHT = len(max(X_train_sequences, key=len))
print("MAX_SEQ_LENGHT=", MAX_SEQ_LENGHT)

# Vocabulary size, taken from the fitted vocabulary rather than from
# get_feature_names(), which newer scikit-learn releases no longer provide.
N_FEATURES = len(vectorizer.vocabulary_)

# Pad at the end with 0 up to MAX_SEQ_LENGHT.
# NOTE(review): `truncating` is left at its default, so a tag sequence longer
# than MAX_SEQ_LENGHT loses items from its front -- confirm this is intended.
X_train_sequences = pad_sequences(X_train_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
X_test_sequences = pad_sequences(X_test_sequences, maxlen=MAX_SEQ_LENGHT, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_SEQ_LENGHT, padding='post')
def to_categorical(sequences, categories):
    """One-hot encode sequences of integer class ids.

    Args:
        sequences: Iterable of equally long sequences of integer class ids.
        categories: Number of classes, i.e. the length of each one-hot vector.

    Returns:
        Float NumPy array in which every class id is replaced by a vector of
        length ``categories`` holding 1.0 at the id's position and 0.0
        elsewhere.

    Raises:
        IndexError: If an id does not fit into ``categories`` classes.
    """
    cat_sequences = []
    for s in sequences:
        ids = np.asarray(s, dtype=np.intp)
        one_hot = np.zeros((len(ids), categories))
        # Set one entry per row: row i gets a 1.0 in column ids[i].
        one_hot[np.arange(len(ids)), ids] = 1.0
        cat_sequences.append(one_hot)
    return np.array(cat_sequences)
# One-hot encode the padded tag-id matrices. The number of classes is the
# size of the tag vocabulary, which includes the '-PAD-' entry.
cat_train_tags_y = to_categorical(sequences=train_tags_y, categories=len(tag2index))
cat_test_tags_y = to_categorical(sequences=test_tags_y, categories=len(tag2index))
导入词向量
import numpy as np
# Local path of the pre-trained GloVe vectors file (machine-specific; adjust as needed).
GLOVE_PATH = '/Users/jdmoore7/Downloads/glove.6B/glove.6B.50d.txt'
# Number of components per word vector; read_glove_vectors checks every line
# of the file against this length.
GLOVE_VECTOR_LENGHT = 50
def read_glove_vectors(path, lenght):
    """Load word vectors from a GloVe-style text file into a dict.

    Every line is expected to hold a token followed by ``lenght``
    whitespace-separated float components.

    Args:
        path: Path of the vectors text file.
        lenght: Expected number of components per vector.

    Returns:
        Dict mapping each token to a float32 NumPy array with ``lenght``
        components.

    Raises:
        ValueError: If a line does not contain exactly ``lenght + 1`` fields,
            or a component cannot be parsed as a float.
    """
    embeddings = {}
    # Read as UTF-8 explicitly rather than with the platform default
    # encoding (assumes the vectors file is UTF-8 encoded -- TODO confirm).
    with open(path, encoding='utf-8') as glove_f:
        for line_no, line in enumerate(glove_f, start=1):
            chunks = line.split()
            # Explicit check instead of `assert`, which is stripped under -O.
            if len(chunks) != lenght + 1:
                raise ValueError(
                    'line %d of %s has %d fields, expected %d'
                    % (line_no, path, len(chunks), lenght + 1))
            embeddings[chunks[0]] = np.array(chunks[1:], dtype='float32')
    return embeddings
GLOVE_INDEX = read_glove_vectors(GLOVE_PATH, GLOVE_VECTOR_LENGHT)

# Init the embeddings layer with GloVe embeddings.
# The matrix has one row more than the vocabulary: row 0 is never written
# and stays all-zero, and the vector of vocabulary entry `idx` goes into row
# `idx + 1`. Token ids fed to the Embedding layer must therefore be
# word2idx[word] + 1 for the rows to match.
embeddings_index = np.zeros((len(word2idx) + 1, GLOVE_VECTOR_LENGHT))
for word, idx in word2idx.items():
    embedding = GLOVE_INDEX.get(word)
    if embedding is None:
        # No pre-trained vector for this word: leave its row at zero.
        continue
    embeddings_index[idx + 1] = embedding
模型和准确率度量
from tensorflow.keras import backend as K


def ignore_class_accuracy(to_ignore=0):
    """Build an accuracy metric that skips positions of one class.

    Args:
        to_ignore: Class id whose positions are excluded from the metric
            (0 by default, the padding id used in this script).

    Returns:
        A metric function ``ignore_accuracy(y_true, y_pred)`` that takes the
        argmax over the last axis of both tensors and returns the fraction of
        matching positions among those whose *true* class is not
        ``to_ignore``.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # The mask must come from the ground truth. Masking on the prediction
        # would report 0 for a model that predicts only the ignored class and
        # would also hide every real token it mislabels as that class.
        keep_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'float32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'float32') * keep_mask

        # Float arithmetic throughout; the maximum guards against dividing by
        # zero when every position is ignored.
        accuracy = K.sum(matches) / K.maximum(K.sum(keep_mask), 1.0)
        return accuracy
    return ignore_accuracy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(InputLayer(input_shape=(MAX_SEQ_LENGHT, )))
# The lookup-table size is read from the weight matrix itself, so the layer
# and its initial weights cannot disagree (and get_feature_names(), which
# newer scikit-learn releases no longer provide, is not needed).
model.add(Embedding(embeddings_index.shape[0],
                    GLOVE_VECTOR_LENGHT,  # Embedding size
                    weights=[embeddings_index],
                    input_length=MAX_SEQ_LENGHT,
                    trainable=False))  # keep the pre-trained vectors frozen
# NOTE(review): activation='relu' replaces the LSTM's default activation;
# consider the default if training is unstable -- verify experimentally.
model.add(Bidirectional(LSTM(256, activation='relu', return_sequences=True)))
# One softmax over the tag set (including '-PAD-') per time step.
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])

model.fit(X_train_sequences, cat_train_tags_y,
          epochs=40, batch_size=128, verbose=1,
          validation_data=(X_test_sequences, cat_test_tags_y))
def logits_to_tokens(sequences, index):
    """Map per-position class scores to their labels.

    Args:
        sequences: Iterable of sequences; each inner item is a vector of
            class scores for one position.
        index: Mapping from class id to label.

    Returns:
        A list with one list per input sequence, holding for every position
        the label of its highest-scoring class.
    """
    return [
        [index[int(np.argmax(categorical))] for categorical in categorical_sequence]
        for categorical_sequence in sequences
    ]
import string


def pipe(text):
    """Tag the words of ``text`` with the trained model.

    Only words that survive the vectorizer's tokenizer and are present in
    ``word2idx`` are fed to the model, so only those words appear in the
    result; each one is paired with the tag predicted for its own position.

    Args:
        text: Raw input sentence.

    Returns:
        List of ``(word, tag)`` tuples.
    """
    # Build the word list with the same tokenizer and vocabulary filter that
    # to_sequence applies, so word i corresponds to model input position i.
    # Splitting the raw text separately would mis-pair words and tags as soon
    # as a single word is dropped.
    words = [w for w in tokenize(preprocess(text)) if w in word2idx]

    arr = [to_sequence(tokenize, preprocess, word2idx, text)]
    arr = pad_sequences(arr, maxlen=MAX_SEQ_LENGHT, padding='post')
    # With `truncating` left at its default, pad_sequences drops items from
    # the front of an over-long sequence; keep the same tail of the words.
    words = words[max(len(words) - MAX_SEQ_LENGHT, 0):]

    pred = model.predict(arr)
    index2tag = {i: t for t, i in tag2index.items()}
    values = logits_to_tokens(pred, index2tag)[0]
    # zip stops at the last real word, discarding predictions for padding.
    return [(w, t) for w, t in zip(words, values)]


pipe('the walk down the hill')
>>>
[('the', '-PAD-'),
('walk', '-PAD-'),
('down', '-PAD-'),
('the', '-PAD-'),
('hill', '-PAD-')]
模型拟合的准确率为0.00%。所以我只能得出结论:我在某种程度上错误地使用了词嵌入。问题是,我的模型架构有缺陷吗?还是我处理词嵌入本身的方式有缺陷?还是别的什么原因?
目前没有回答
相关问题 更多 >
编程相关推荐