TensorFlow bidirectional LSTM: low accuracy when feeding the training data back through a restored model


I am learning TensorFlow and bidirectional LSTM networks. I trained a model (code below) to learn the next character in a text.

I generate a dummy sequence composed of the characters a, b and c, with different transition probabilities between them.

Training the LSTM seems to work well and converges quickly (is that normal, or should it worry me? I assume it is normal, since my text is short and its structure is simple).

My problem arises when I reuse the trained network. I close the training session, reset the graph, and load the model back with TensorFlow's Saver.

When I feed the same data I used for training back into this model (but with a batch size of 1, since I am predicting the next character one step at a time), the accuracy over the whole text is very low compared to the training accuracy.

I know I am doing something wrong, but I can't put my finger on it. Can anyone tell me how to improve my model?

Code (using TensorFlow 1.2):

from __future__ import division, print_function

import tensorflow as tf
import numpy as np
import os, sys, argparse

def dummy_sequence(dummy_length):
    """ generate a round of dummy sequences
    """
    seq = ""
    p_abc_ori = np.asarray([0.7, 0.2, 0.1])
    p_abc_trans = np.asarray(
                    [[0.6, 0.3, 0.1],
                     [0.3, 0.5, 0.2],
                     [0.8, 0.1, 0.1]])
    chars = ["a", "b", "c"]
    positions = {"a": 0, "b": 1, "c": 2}
    c = np.random.choice(chars, p=p_abc_ori)
    seq += c
    for i in range(dummy_length):
        c = np.random.choice(chars, p = p_abc_trans[positions[c]])
        seq += c
    return seq


data_seqs = dummy_sequence(40000)
data_length = len(data_seqs) 

char_set = set()
for ch in data_seqs:
    char_set.add(ch)

char_list = sorted(list(char_set))

char2idx = dict(zip(char_list, range(len(char_list))))
idx2char = dict(zip(range(len(char_list)), char_list))


def sample_generator(data_seqs, char_dict, batch_size, sequence_length):
    data_length = len(data_seqs) 
    length = sequence_length + 1
    num_steps = (data_length // batch_size)

    for step in range(num_steps):
        # random window start positions (windows wrap around via the modulo below)
        start_idxs = np.random.randint(0, data_length, batch_size)
        input_batch = np.zeros((batch_size, sequence_length), dtype=np.int32)
        target_batch = np.zeros((batch_size, sequence_length), dtype=np.int32)
        for i, start_idx in enumerate(start_idxs):
            sample = [char_dict[data_seqs[j % data_length]] for j in range(start_idx, start_idx + length)]
            input_batch[i, :] = sample[0:sequence_length] 
            target_batch[i, :] = sample[1:sequence_length+1]
        yield input_batch, target_batch 

# parameters        
sequence_length = 150
batch_size = 200
number_of_characters = len(char_set)
hidden_size = 512
dropout = 0.8
learning_rate = 2e-3 

class Model:

    def __init__(self, batch_size, sequence_length, hidden_size, 
                number_of_characters, learning_rate, dropout, 
                is_training=False):
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.dropout = dropout
        self.number_of_characters = number_of_characters

        # placeholder for X and Y
        self._inputs = tf.placeholder(tf.int32, [self.batch_size, self.sequence_length], name="input")
        self._targets = tf.placeholder(tf.int32, [self.batch_size, self.sequence_length], name="target") 
        one_hot_inputs = tf.one_hot(self._inputs, depth=self.number_of_characters)  


        # Bi-LSTM
        cell_fw = tf.contrib.rnn.DropoutWrapper(
                            tf.contrib.rnn.LSTMCell(self.hidden_size, state_is_tuple=True),
                            output_keep_prob=self.dropout)
        cell_bw = tf.contrib.rnn.DropoutWrapper(
                            tf.contrib.rnn.LSTMCell(self.hidden_size, state_is_tuple=True),
                            output_keep_prob=self.dropout)

        self._initial_state_fw = cell_fw.zero_state(self.batch_size, tf.float32) 
        self._initial_state_bw = cell_bw.zero_state(self.batch_size, tf.float32)


        lstm_output, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw,
                                                                one_hot_inputs,
                                                                initial_state_fw=self.initial_state_fw,
                                                                initial_state_bw=self.initial_state_bw)

        lstm_output_fw, lstm_output_bw = lstm_output 
        final_state_fw, final_state_bw = final_state 

        # concatenate the forward and backward outputs
        lstm_output = tf.concat(lstm_output, axis=2)
        # project the concatenated outputs back down to hidden_size
        lstm_dense = tf.layers.dense(inputs=lstm_output, units=self.hidden_size, activation=tf.nn.tanh)
        # concatenate with the one-hot inputs (skip connection)
        lstm_output = tf.concat((lstm_dense, one_hot_inputs), axis=2)
        # project to vocabulary size; note the softmax activation here means
        # these outputs are already probabilities, not raw logits
        lstm_output = tf.layers.dense(lstm_output, units=self.number_of_characters, activation=tf.nn.softmax)

        # flatten; note this applies a second softmax on top of the one above
        self._logits_flat = tf.reshape(lstm_output, (-1, self.number_of_characters))
        probabilities_flat = tf.nn.softmax(self.logits_flat)
        self._probabilities = tf.reshape(probabilities_flat, (self.batch_size, -1, self.number_of_characters)) 

        targets_flat = tf.reshape(self.targets, (-1, ))
        correct_pred = tf.equal(tf.argmax(probabilities_flat, 1), tf.cast(targets_flat, tf.int64))

        # compute accuracy
        self._accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

        if not is_training:
            return

        # compute loss
        self._loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits_flat, labels=targets_flat)
        self._cost = tf.reduce_mean(self.loss)

        # optimizer
        trainable_variables = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        gradients = tf.gradients(self.loss, trainable_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 5)
        self._train_op = optimizer.apply_gradients(zip(gradients, trainable_variables))


    @property
    def inputs(self):
        return self._inputs

    @property
    def targets(self):
        return self._targets

    @property
    def initial_state_fw(self):
        return self._initial_state_fw

    @property
    def initial_state_bw(self):
        return self._initial_state_bw

    @property
    def logits_flat(self):
        return self._logits_flat

    @property
    def probabilities(self):
        return self._probabilities

    @property
    def accuracy(self):
        return self._accuracy

    @property
    def loss(self):
        return self._loss

    @property
    def cost(self):
        return self._cost

    @property
    def train_op(self):
        return self._train_op

########################
# Train 

outputdir = "./tmp"
if not os.path.isdir(outputdir):
    os.makedirs(outputdir)

model = Model(batch_size, sequence_length, hidden_size, number_of_characters, learning_rate, dropout, is_training=True)

save_path = os.path.join(outputdir, 'model') 
saver = tf.train.Saver(tf.trainable_variables()) 

init_op = tf.global_variables_initializer()

with tf.Session() as sess:  
    sess.run(init_op)
    state_fw = sess.run(model.initial_state_fw)
    state_bw = sess.run(model.initial_state_bw)
    for epoch in range(5):
        all_acc = list()
        all_loss = list()
        for input_batch, target_batch in sample_generator(data_seqs, char2idx, batch_size, sequence_length):
            feed_dict = {model.inputs: input_batch,
                        model.targets: target_batch}
            computed_cost, computed_accuracy, _ = \
                        sess.run([model.cost, 
                                model.accuracy, 
                                model.train_op],
                                feed_dict=feed_dict)
            all_loss.append(computed_cost)
            all_acc.append(computed_accuracy)
            #print(sum(all_loss), sum(all_acc))
        print('i: {}, loss: {}, accuracy: {}'.format(
            epoch, sum(all_loss) / len(all_loss), sum(all_acc) / len(all_acc)))
        # printed output:
        # i: 0, loss: 0.6021944493055343, accuracy: 0.9481146431714297
        # i: 1, loss: 0.554149004817009, accuracy: 0.9972950986027718
        # i: 2, loss: 0.5541560265421868, accuracy: 0.9972904279828072
        # i: 3, loss: 0.5541570243239403, accuracy: 0.9972894296050072
        # i: 4, loss: 0.5541545218229293, accuracy: 0.9972919332981109 
    saver.save(sess, save_path)


########################
# Predict on trained variable 


tf.reset_default_graph()
latest_checkpoint = tf.train.latest_checkpoint(outputdir)

# inference model with batch size 1 and variable sequence length
# (note: the DropoutWrapper keep_prob of 0.8 is applied unconditionally,
# so dropout is still active at prediction time)
model = Model(1, None, hidden_size, number_of_characters, learning_rate, dropout)

saver = tf.train.Saver(tf.trainable_variables())

init_op = tf.global_variables_initializer()  

with tf.Session() as sess: 
    sess.run(init_op)
    saver.restore(sess, latest_checkpoint) 
    all_acc = list()
    for idx, character in enumerate(data_seqs[:-1]): 
        idx_target = char2idx[data_seqs[idx+1]]
        idx_query = char2idx[character]
        feed_dict = {model.inputs: np.asarray([[idx_query]]),
                     model.targets: np.asarray([[idx_target]])}
        out, acc = sess.run([model.probabilities, 
                            model.accuracy],
                            feed_dict=feed_dict)
        all_acc.append(acc)
    print("Global accuracy: {}".format(sum(all_acc)/len(all_acc)))

    # printed output:
    # Global accuracy: 0.594775     

1 Answer

It looks like you're calling Model() twice without specifying variable reuse (e.g. with a tf.variable_scope that carries reuse=True). That's one option, although I'd recommend structuring the code with object-oriented layers, as in the official MNIST example; that way the variables live with the layer objects and reuse is straightforward. (Note that this requires a more recent version of TensorFlow, e.g. tf.keras.Model.)
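
For illustration, a minimal sketch of the variable-scope approach (the scope name, layer sizes, and 3-character vocabulary here are made up, not taken from your code):

import tensorflow as tf

def build_model(inputs, reuse):
    # all variables created via get_variable inside this scope are shared
    # across calls when reuse=True
    with tf.variable_scope("char_model", reuse=reuse):
        hidden = tf.layers.dense(inputs, units=64, activation=tf.nn.tanh)
        return tf.layers.dense(hidden, units=3)

train_logits = build_model(tf.placeholder(tf.float32, [None, 3]), reuse=False)   # creates the variables
predict_logits = build_model(tf.placeholder(tf.float32, [None, 3]), reuse=True)  # shares them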

There is also a "halfway" solution, tf.make_template, which lets you keep the functional layers (tf.layers.dense, as opposed to the object-oriented tf.layers.Dense) while reuse is managed for you automatically; the variables live with the template object.
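Again a minimal sketch, with the same made-up toy dimensions as above:

import tensorflow as tf

def char_model(inputs):
    hidden = tf.layers.dense(inputs, units=64, activation=tf.nn.tanh)
    return tf.layers.dense(hidden, units=3)

# make_template wraps the function so that every call shares one set of variables
model_template = tf.make_template("char_model", char_model)

train_logits = model_template(tf.placeholder(tf.float32, [None, 3]))    # first call creates the variables
predict_logits = model_template(tf.placeholder(tf.float32, [None, 3]))  # later calls reuse them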

相关问题 更多 >