为什么我的VQA网络表现如此糟糕？

import tensorflow as tf batch_size = 8 epochs = 100 #arch = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3)) arch = tf.keras.applications.densenet.DenseNet201(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3)) freeze_until = 650 for layer in arch.layers[:freeze_until]: layer.trainable = False branch1 = arch.output branch1 = tf.keras.layers.GlobalAveragePooling2D(data_format=None) (branch1) text_inputs = tf.keras.Input(shape=[max_words]) emb = tf.keras.layers.Embedding(vocab_size,embedding_dim, input_length=max_words, weights=[embedding_matrix], trainable=False) (text_inputs) #branch2 = tf.keras.layers.GlobalMaxPool1D()(emb) #branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2) #branch2 = tf.keras.layers.Conv1D(128, 5, activation='relu')(emb) branch2 = tf.keras.layers.LSTM(128)(emb) #branch2 = tf.keras.layers.GlobalMaxPool1D()(branch2) #branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2) joint = tf.keras.layers.concatenate([branch1, branch2]) joint = tf.keras.layers.Dense(512, activation='relu')(joint) joint = tf.keras.layers.Dropout(0.2)(joint) predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint) model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions]) model.summary() loss = tf.keras.losses.CategoricalCrossentropy() lr = 5e-4 optimizer = tf.keras.optimizers.Adam(learning_rate=lr) model.compile(loss = loss, optimizer = optimizer, metrics = ['accuracy']) callbacks=[] callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True)) callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)) model.fit_generator(data_generator('train'),validation_data = data_generator('validation'), steps_per_epoch= 240, validation_steps = 120, epochs=epochs, callbacks=callbacks, verbose=1)

import json import random import numpy as np from PIL import Image import matplotlib.pyplot as plt from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence from tensorflow.keras.preprocessing.sequence import pad_sequences from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder img_h = 320 img_w = 480 max_words = 100 embedding_dim = 40 num_classes = 13 val_split = 0.8 max_len = 25 classes = [ '0', '1', '10', '2', '3', '4', '5', '6', '7', '8', '9', 'no', 'yes' ] label_encoder = LabelEncoder() integer_encoder_ = label_encoder.fit(classes) integer_encoded = integer_encoder_.transform(classes) onehot_encoder = OneHotEncoder(sparse=False) integer_encoded = integer_encoded.reshape(len(integer_encoded), 1) onehot_encoder_ = onehot_encoder.fit(integer_encoded) def data_generator(mode, batch_size = 8): with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f: data_raw = json.load(f) f.close() while True: # Select files (paths/indices) for the batch if mode == 'validation': batch_addresses = random.sample(range(int(len(data_raw['questions'])*val_split),len(data_raw['questions'])), batch_size) elif mode == 'train': batch_addresses = random.sample(range(0, int(len(data_raw['questions'])*val_split)), batch_size) else: batch_addresses = random.sample(range(0, len(data_raw['questions'])), batch_size) batch_input_img = [] batch_input_txt = [] batch_output = [] for i in batch_addresses: image_name = data_raw['questions'][i]['image_filename'] img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/' + image_name).convert('RGB') img_array = np.array(img) img_array = np.expand_dims(img_array, 0) input_img = np.true_divide(img_array,255) input_txt = data_raw['questions'][i]['question'] output = data_raw['questions'][i]['answer'] batch_input_img += [ input_img ] batch_input_txt += [ input_txt ] # Return a tuple of (input,output) to feed the network batch_x_img = np.array( batch_input_img ) batch_x_txt = np.array( batch_input_txt ) batch_x_img = batch_x_img[:,-1] tokenized = tokenizer.texts_to_sequences(batch_x_txt) batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len) batch_output += [ output ] batch_y = np.array( batch_output ) y_c = integer_encoder_.transform(batch_y) y_c = y_c.reshape(len(y_c), 1) onehot_encoded = onehot_encoder_.transform(y_c) batch_y = onehot_encoded yield ([batch_x_img,batch_x_txt], batch_y ) def test_generator(): with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f: data_raw = json.load(f) f.close() i = 0 while (i<=len(data_raw['questions'])): batch_input_img = [] batch_input_txt = [] batch_output = [] image_name = data_raw['questions'][i]['image_filename'] img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/' + image_name).convert('RGB') img_array = np.array(img) img_array = np.expand_dims(img_array, 0) input_img = np.true_divide(img_array,255) input_txt = data_raw['questions'][i]['question'] output = data_raw['questions'][i]['question_id'] batch_input_img += [ input_img ] batch_input_txt += [ input_txt ] # Return a tuple of (input,output) to feed the network batch_x_img = np.array( batch_input_img ) batch_x_txt = np.array( batch_input_txt ) batch_x_img = batch_x_img[:,-1] tokenized = tokenizer.texts_to_sequences(batch_x_txt) batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len) batch_y = output i+=1 yield ([batch_x_img,batch_x_txt], batch_y ) def create_embedding_matrix(filepath, word_index, embedding_dim): vocab_size = len(word_index) + 1 # Adding again 1 because of reserved 0 index embedding_matrix = np.zeros((vocab_size, embedding_dim)) with open(filepath) as f: count = 0 for line in f: word, *vector = line.split() if word in word_index and count<(len(word_index)-1): idx = word_index[word] embedding_matrix[idx] = np.array( vector, dtype=np.float32)[:embedding_dim] count = count + 1 # errore perché va a splittare e trova to name.domain return embedding_matrix def create_tokens(tokenizer): with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f: data_raw = json.load(f) f.close() tot_txt = [] for i in range(len(data_raw['questions'])): input_txt = data_raw['questions'][i]['question'] tot_txt += [input_txt] tokenizer.fit_on_texts(tot_txt) return tokenizer tokenizer = Tokenizer(num_words=max_words,oov_token = 'OOV') tokenizer = create_tokens(tokenizer) #embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim) import os filepath = "../input/glove840b300dtxt/" + os.listdir("../input/glove840b300dtxt/")[0] embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim) vocab_size = len(tokenizer.word_index) + 1 reader = data_generator('train')

1条回答

网友

1楼 · 发布于 2024-04-26 14:33:33

我认为这对很多人来说并不有趣，但我发现了错误。在生成器中，我将batch_output += [ output ]从for循环中去掉，for循环在批处理中循环，因此，由于Python的通融，我没有出错，但是批处理的所有问题都有相同的答案，这对于几乎任何问题都是错误的。这使得网络保持在与训练数据中最现有答案相对应的局部最小值。现在，虽然它的性能不好，但它完成了它的工作，在Clevr数据集上获得了~0.5的精确度。你知道吗

相关问题更多 >

编程相关推荐

热门问题

热门文章