Keras/TensorFlow：第一个 epoch 之后出现 “ValueError: output of generator should be a tuple…”（生成器的输出应该是元组）错误

"""Vanilla English->German sequence-to-sequence model (Keras 1.x API).

The script loads an aligned corpus, tokenizes both sides, trains an
encoder/decoder LSTM with ``fit_generator`` and prints a sample translation.
"""
import json
import math

import numpy as np
from keras.layers import Activation, Dense, Input, RepeatVector, merge
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# close the corpus file deterministically instead of leaking the handle
with open('../data/en_de_corpus.json', 'r') as corpus_file:
    data = json.load(corpus_file)

# to deal with memory issues, limit the dataset;
# the training batches are additionally generated on demand
# by `generate_batches` and fed to keras' `fit_generator`
max_len = 6
max_examples = 80000
max_vocab_size = 10000


def get_texts(source_texts, target_texts, max_len, max_examples):
    """Select aligned sentence pairs that are short on both sides.

    Training gets difficult with widely varying lengths (some sequences are
    mostly padding) and with long sequences, so only pairs in which both
    sentences have at most `max_len` whitespace-separated tokens are kept.
    The two inputs are assumed to be aligned index by index.

    Returns a `(sources, targets)` pair of lists, each truncated to at most
    `max_examples` entries.
    """
    sources, targets = [], []
    for i, source in enumerate(source_texts):
        # assume we split on whitespace
        if len(source.split(' ')) <= max_len:
            target = target_texts[i]
            if len(target.split(' ')) <= max_len:
                sources.append(source)
                targets.append(target)
    return sources[:max_examples], targets[:max_examples]


en_texts, de_texts = get_texts(data['en'], data['de'], max_len, max_examples)
n_examples = len(en_texts)

# add start and stop tokens
start_token = '^'
end_token = '$'
en_texts = [' '.join([start_token, text, end_token]) for text in en_texts]
de_texts = [' '.join([start_token, text, end_token]) for text in de_texts]

# characters for the tokenizers to filter out
# preserve start and stop tokens
filter_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_{|}~\t\n\'`“”–'.replace(start_token, '').replace(end_token, '')
source_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
source_tokenizer.fit_on_texts(en_texts)
target_tokenizer = Tokenizer(max_vocab_size, filters=filter_chars)
target_tokenizer.fit_on_texts(de_texts)

# vocab sizes
# idx 0 is reserved by keras (for padding)
# and not part of the word_index,
# so add 1 to account for it
source_vocab_size = len(source_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# find max length (in tokens) of input and output sentences
max_input_length = max(len(seq) for seq in source_tokenizer.texts_to_sequences_generator(en_texts))
max_output_length = max(len(seq) for seq in target_tokenizer.texts_to_sequences_generator(de_texts))

sequences = pad_sequences(source_tokenizer.texts_to_sequences(en_texts[:1]), maxlen=max_input_length)
print(en_texts[0])
# >>> ^ I took the bus back. $
print(sequences[0])
# >>> [ 0 0 0 2 4 223 3 461 114 1]


def _one_hot(sequences, length, vocab_size):
    """One-hot encode padded integer sequences.

    `sequences` must hold rows of exactly `length` token indices, each
    smaller than `vocab_size`. Returns a boolean array of shape
    `(len(sequences), length, vocab_size)`; an empty input yields an empty
    array of shape `(0, length, vocab_size)`.
    """
    seqs = np.asarray(sequences, dtype=np.int64).reshape(-1, length)
    # boolean to reduce memory footprint
    vecs = np.zeros((seqs.shape[0], length, vocab_size), dtype=bool)
    rows = np.arange(seqs.shape[0])[:, None]
    steps = np.arange(length)[None, :]
    # tuple (not list) index: marks vecs[row, step, token] for every token
    vecs[rows, steps, seqs] = True
    return vecs


def build_one_hot_vecs(sequences):
    """generate one-hot vectors from (padded) source token sequences"""
    return _one_hot(sequences, max_input_length, source_vocab_size)


def build_target_vecs(sequences=None):
    """encode words in the target sequences as one-hots

    With no argument, every target text is tokenized, padded and encoded
    (the original behaviour; this allocates the full
    n_examples x max_output_length x target_vocab_size array). Pass already
    padded target `sequences` to encode just that subset, e.g. one batch.
    """
    if sequences is None:
        sequences = pad_sequences(target_tokenizer.texts_to_sequences(de_texts),
                                  maxlen=max_output_length)
    return _one_hot(sequences, max_output_length, target_vocab_size)


hidden_dim = 128
embedding_dim = 128


def build_model(one_hot=False, bidirectional=False):
    """build a vanilla sequence-to-sequence model.

    specify `one_hot=True` to build it for one-hot encoded inputs,
    otherwise, pass in sequences directly and embeddings will be learned.
    specify `bidirectional=True` to use a bidirectional LSTM encoder.
    """
    if one_hot:
        inputs = Input(shape=(max_input_length, source_vocab_size))
        encoder_input = inputs
    else:
        inputs = Input(shape=(max_input_length,), dtype='int32')
        encoder_input = Embedding(source_vocab_size, embedding_dim,
                                  input_length=max_input_length)(inputs)

    # encoder; don't return sequences, just give us one representation vector
    if bidirectional:
        forwards = LSTM(hidden_dim, return_sequences=False)(encoder_input)
        backwards = LSTM(hidden_dim, return_sequences=False, go_backwards=True)(encoder_input)
        encoder = merge([forwards, backwards], mode='concat', concat_axis=-1)
    else:
        encoder = LSTM(hidden_dim, return_sequences=False)(encoder_input)

    # repeat encoder output for each desired output from the decoder
    encoder = RepeatVector(max_output_length)(encoder)

    # decoder; do return sequences (timesteps)
    decoder = LSTM(hidden_dim, return_sequences=True)(encoder)

    # apply the dense layer to each timestep
    # give output conforming to target vocab size
    decoder = TimeDistributed(Dense(target_vocab_size))(decoder)

    # convert to a proper distribution
    predictions = Activation('softmax')(decoder)
    return Model(input=inputs, output=predictions)


target_reverse_word_index = {v: k for k, v in target_tokenizer.word_index.items()}


def decode_outputs(predictions):
    """Turn per-timestep probability arrays into space-joined token strings.

    For every timestep the most probable index is looked up in the reverse
    target word index; indices without a word (e.g. the padding index 0)
    are dropped.
    """
    outputs = []
    for probs in predictions:
        preds = probs.argmax(axis=-1)
        tokens = [target_reverse_word_index.get(idx) for idx in preds]
        outputs.append(' '.join([t for t in tokens if t is not None]))
    return outputs


def build_seq_vecs(sequences):
    """Return the token sequences as a numpy array (for the embedding input)."""
    return np.array(sequences)


def generate_batches(batch_size, one_hot=False):
    """Yield `(X, y)` training batches forever, reshuffled every epoch.

    Only the padded integer sequences are kept in memory; the one-hot
    arrays are built per batch. Materializing the full one-hot target
    tensor on every epoch (and copying it again when shuffling) can exhaust
    memory, in which case the generator dies and keras receives no batch.

    Raises ValueError when `batch_size` is not positive or there are no
    examples, instead of silently producing nothing.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be positive')
    if n_examples == 0:
        raise ValueError('no training examples available')

    # int(...) and float(...) keep this correct under Python 2 as well
    n_batches = int(math.ceil(n_examples / float(batch_size)))

    # tokenize and pad once; these integer arrays are small
    source_seqs = pad_sequences(source_tokenizer.texts_to_sequences(en_texts),
                                maxlen=max_input_length)
    target_seqs = pad_sequences(target_tokenizer.texts_to_sequences(de_texts),
                                maxlen=max_output_length)

    while True:
        # each epoch: shuffle
        order = np.random.permutation(n_examples)
        for i in range(n_batches):
            start = batch_size * i
            batch_idx = order[start:start + batch_size]
            batch_sources = source_seqs[batch_idx]
            if one_hot:
                X = build_one_hot_vecs(batch_sources)
            else:
                X = build_seq_vecs(batch_sources)
            y = build_target_vecs(target_seqs[batch_idx])
            yield X, y


n_epochs = 100
batch_size = 128
# single switch so the model, the generator and `translate` always agree
use_one_hot = False

model = build_model(one_hot=use_one_hot, bidirectional=False)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit_generator(generator=generate_batches(batch_size, one_hot=use_one_hot),
                    samples_per_epoch=n_examples, nb_epoch=n_epochs, verbose=1)


def translate(model, sentences, one_hot=False):
    """Translate `sentences` with `model` and return the decoded strings.

    `one_hot` must match the setting the model was built with.
    """
    seqs = pad_sequences(source_tokenizer.texts_to_sequences(sentences),
                         maxlen=max_input_length)
    if one_hot:
        inputs = build_one_hot_vecs(seqs)
    else:
        inputs = build_seq_vecs(seqs)
    preds = model.predict(inputs, verbose=0)
    return decode_outputs(preds)


print(en_texts[0])
print(de_texts[0])
# the model above takes integer sequences, so the encoding must match it
print(translate(model, [en_texts[0]], one_hot=use_one_hot))
# >>> ^ I took the bus back. $
# >>> ^ Ich nahm den Bus zurück. $
# >>> ^ ich ich die die verloren $

1条回答

网友

1楼 · 发布于 2024-05-16 05:44:29

您可以用下面的代码单独测试生成器：

next(generate_batches(batch_size, one_hot=False))

如果这样调用能正常工作，你应该检查一下内存消耗情况。因为 seq2seq2.py 抛出了 MemoryError，这也可能是问题的根源：你的生成器可能正是因此没有返回任何数据。

顺便说一句，Keras 提供了层包装器（Bidirectional），用它包装 LSTM 即可实现您手动搭建的双向结构。

相关问题更多 >

编程相关推荐

热门问题

热门文章