为什么我的VQA网络表现如此糟糕?

2024-04-26 14:33:33 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试使用Clevr数据集(https://cs.stanford.edu/people/jcjohns/clevr/)为深度学习作业构建一个可视化的问答模型(不过,我不能使用函数式程序表示)。你知道吗

然而，我很挣扎，因为我的网络没有正确地学习：它在训练和验证集上的准确率一直在 0.2 附近振荡。而且，我第一次运行它时，用类似的架构准确率曾达到 0.4，但我怀疑当时内存中残留了之前嵌入矩阵构造和分词器实验的一些状态。

我已经尝试过更换嵌入（现在我用的是 GloVe）、改变各层尺寸、以多种方式修改网络（我还打算尝试注意力机制等更高级的结构，但首先希望看到它能正常工作）。我确信其中存在某个致命的错误（架构本身也很简陋），但我自己找不出来。你能帮我看看是哪里出了问题吗？

我会在下面贴出网络和数据输入管道的代码，欢迎指出任何不好的写法。抱歉代码有点多，但我实在不明白自己哪里做错了。

先谢谢大家。

这是网络的代码：

import tensorflow as tf

batch_size = 8
epochs = 100

# Image branch: DenseNet201 pretrained on ImageNet, classifier head removed.
arch = tf.keras.applications.densenet.DenseNet201(
    include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))

# Freeze the first `freeze_until` layers; only the top of the CNN is fine-tuned.
freeze_until = 650
for layer in arch.layers[:freeze_until]:
    layer.trainable = False

branch1 = arch.output
branch1 = tf.keras.layers.GlobalAveragePooling2D()(branch1)

# Text branch.
# BUG FIX: the generator pads every question to `max_len` (25) tokens, but the
# text input and the Embedding layer were declared with `max_words` (100),
# which is the tokenizer's vocabulary cap, not a sequence length.  Use
# `max_len` so the declared input shape matches what the generator yields.
text_inputs = tf.keras.Input(shape=[max_len])

emb = tf.keras.layers.Embedding(vocab_size, embedding_dim,
                                input_length=max_len,
                                weights=[embedding_matrix],
                                trainable=False)(text_inputs)
branch2 = tf.keras.layers.LSTM(128)(emb)

# Fuse the two modalities and classify over the closed answer set.
joint = tf.keras.layers.concatenate([branch1, branch2])
joint = tf.keras.layers.Dense(512, activation='relu')(joint)
joint = tf.keras.layers.Dropout(0.2)(joint)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)

model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])

model.summary()

loss = tf.keras.losses.CategoricalCrossentropy()
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

model.compile(loss=loss,
              optimizer=optimizer,
              metrics=['accuracy'])

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,
                                     restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                         patience=4, verbose=1, mode='auto',
                                         min_delta=0.0001, cooldown=0, min_lr=0),
]

# `fit_generator` is deprecated; `Model.fit` accepts Python generators directly.
model.fit(data_generator('train'),
          validation_data=data_generator('validation'),
          steps_per_epoch=240, validation_steps=120,
          epochs=epochs, callbacks=callbacks, verbose=1)

下面是生成器+嵌入的代码

import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Input image dimensions expected by the CNN branch.
img_h = 320
img_w = 480 
# Cap on the number of distinct word indices kept by the Keras tokenizer.
max_words = 100
# Length of each word embedding vector (GloVe vectors are truncated to this).
embedding_dim = 40
# Size of the closed answer set listed in `classes` below.
num_classes = 13
# Fraction of the training JSON used for training; the remainder is validation.
val_split = 0.8
# Questions are padded/truncated to this many tokens by the generators.
max_len = 25

# Closed answer vocabulary: the counts 0-10 plus yes/no.
classes = [ '0',
           '1',
           '10',
           '2',
           '3',
           '4',
           '5',
           '6',
           '7',
           '8',
           '9',
           'no',
           'yes'
         ]



# Fit once, up front: a label encoder (answer string -> integer) and a dense
# one-hot encoder (integer -> one-hot row), reused by the data generators.
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)
integer_encoded = integer_encoder_.transform(classes)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)


def data_generator(mode, batch_size = 8):
    """Endlessly yield ([image_batch, question_batch], one_hot_answer_batch).

    mode: 'train' and 'validation' sample from disjoint slices of the training
    JSON (split at `val_split`); any other value samples the whole set.
    batch_size: number of (image, question, answer) triples per yield.
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
        data_raw = json.load(f)

    n_questions = len(data_raw['questions'])
    split_point = int(n_questions * val_split)

    while True:
        # Pick random question indices for this batch.
        if mode == 'validation':
            batch_addresses = random.sample(range(split_point, n_questions), batch_size)
        elif mode == 'train':
            batch_addresses = random.sample(range(0, split_point), batch_size)
        else:
            batch_addresses = random.sample(range(0, n_questions), batch_size)

        batch_input_img = []
        batch_input_txt = []
        batch_output = []

        for i in batch_addresses:
            question = data_raw['questions'][i]

            img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/'
                             + question['image_filename']).convert('RGB')
            # Scale pixels to [0, 1]; np.array() below stacks into (batch, H, W, 3),
            # so no per-image expand_dims (and no [:, -1] undo) is needed.
            batch_input_img.append(np.true_divide(np.array(img), 255))

            batch_input_txt.append(question['question'])

            # BUG FIX: this append was originally OUTSIDE the for loop, so every
            # question in the batch was paired with the answer of the last
            # sampled question only -- the network could only learn to predict
            # the majority answer.
            batch_output.append(question['answer'])

        batch_x_img = np.array(batch_input_img)

        tokenized = tokenizer.texts_to_sequences(batch_input_txt)
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        # Integer-encode then one-hot-encode the answers with the module-level
        # encoders fitted on `classes`.
        y_int = integer_encoder_.transform(np.array(batch_output))
        batch_y = onehot_encoder_.transform(y_int.reshape(len(y_int), 1))

        yield ([batch_x_img, batch_x_txt], batch_y)

def test_generator():
    """Yield ([image, question], question_id), one test sample per step.

    Images get a leading batch axis of 1 so each yield is a batch of one;
    the "label" is the question_id (used to key the submitted predictions).
    """
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
        data_raw = json.load(f)

    # BUG FIX: the original `while i <= len(data_raw['questions'])` loop ran
    # one index past the end and raised IndexError on the final iteration.
    for question in data_raw['questions']:
        img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/'
                         + question['image_filename']).convert('RGB')
        # Scale to [0, 1] and add the batch axis directly (the original built a
        # (1, 1, H, W, 3) array and then stripped an axis with [:, -1]).
        batch_x_img = np.expand_dims(np.true_divide(np.array(img), 255), 0)

        tokenized = tokenizer.texts_to_sequences([question['question']])
        batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)

        yield ([batch_x_img, batch_x_txt], question['question_id'])


def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe-style text file.

    filepath: text file with one `word v1 v2 ...` entry per line.
    word_index: tokenizer word -> index map (index 0 is reserved for padding).
    embedding_dim: number of leading vector components kept per word.

    Returns a (len(word_index) + 1, embedding_dim) array; words not found in
    the file keep all-zero rows.
    """
    vocab_size = len(word_index) + 1  # +1 for the reserved 0 (padding) index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word not in word_index:
                continue
            # BUG FIX: the original stopped filling rows after
            # len(word_index) - 1 matches (the `count` cap), so some known
            # words were left with zero vectors depending on file order.
            try:
                embedding_matrix[word_index[word]] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
            except ValueError:
                # Some GloVe-840B entries have multi-token "words"
                # (e.g. "to name.domain", per the original author's note),
                # which makes the split yield non-numeric vector parts; skip.
                continue

    return embedding_matrix

def create_tokens(tokenizer):
    """Fit `tokenizer` on every question in the training JSON and return it."""
    with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as fp:
        data_raw = json.load(fp)

    all_questions = [entry['question'] for entry in data_raw['questions']]
    tokenizer.fit_on_texts(all_questions)
    return tokenizer

# Tokenizer capped at `max_words` distinct word indices; words outside the
# learned vocabulary map to the 'OOV' token.
tokenizer = Tokenizer(num_words=max_words,oov_token = 'OOV')
tokenizer = create_tokens(tokenizer)
#embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim)
import os
# Take the first file in the GloVe dataset directory -- assumes it holds
# exactly one embeddings file (TODO confirm).
filepath = "../input/glove840b300dtxt/" + os.listdir("../input/glove840b300dtxt/")[0]

embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim)
# +1 for the reserved padding index 0.
vocab_size = len(tokenizer.word_index) + 1

reader = data_generator('train')

PS我原以为将GlobalAveragePooling层改为平坦层可以解决这个问题,但事实并非如此


Tags: txtimginputdatarawlenlayerstf
1条回答
网友
1楼 · 发布于 2024-04-26 14:33:33

我认为这对很多人来说未必有趣，但我找到了错误所在。在生成器中，我把 batch_output += [ output ] 写在了遍历批次的 for 循环之外。由于 Python 的宽容（广播机制），这并没有报错，但结果是批次中的所有问题都被配上了同一个答案，这对几乎所有问题来说都是错的。这使得网络停留在一个局部最小值上，总是预测训练数据中出现最多的那个答案。修正之后，虽然性能仍然不算好，但模型能正常工作了，在 Clevr 数据集上达到了约 0.5 的准确率。

相关问题 更多 >