我正在尝试使用Clevr数据集(https://cs.stanford.edu/people/jcjohns/clevr/)为深度学习作业构建一个可视化的问答模型(不过,我不能使用函数式程序表示)。你知道吗
然而,我很挣扎,因为我的网络没有正确地学习,因为它在训练和验证集上继续在0.2的精确度上振荡。而且,我第一次运行它时,它的精确度已经提高到了0.4,具有类似的体系结构,但是,我怀疑,一些不同的东西,从嵌入矩阵构造和标记器的试验中保留在内存中。你知道吗
我已经尝试过改变嵌入(现在我正在使用手套),改变尺寸,以多种方式改变网络(我还需要尝试注意力和更高级的东西,但首先我希望看到它正常工作)。 我确信存在某种致命的错误(架构也很幼稚),但我似乎无法发现它。你能帮我理解什么不起作用吗?你知道吗
我会留下我的网络和数据输入管道下面的代码,请评论,甚至指出我在哪里使用一些不好的做法。我很抱歉,如果我会留下很多代码,但我真的不明白我哪里做错了。你知道吗
先谢谢你。你知道吗
这是网络的密码
import tensorflow as tf
batch_size = 8
epochs = 100
#arch = tf.keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
arch = tf.keras.applications.densenet.DenseNet201(include_top=False, weights='imagenet', input_shape=(img_h, img_w, 3))
freeze_until = 650
for layer in arch.layers[:freeze_until]:
layer.trainable = False
branch1 = arch.output
branch1 = tf.keras.layers.GlobalAveragePooling2D(data_format=None) (branch1)
text_inputs = tf.keras.Input(shape=[max_words])
emb = tf.keras.layers.Embedding(vocab_size,embedding_dim,
input_length=max_words,
weights=[embedding_matrix],
trainable=False) (text_inputs)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(emb)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
#branch2 = tf.keras.layers.Conv1D(128, 5, activation='relu')(emb)
branch2 = tf.keras.layers.LSTM(128)(emb)
#branch2 = tf.keras.layers.GlobalMaxPool1D()(branch2)
#branch2 = tf.keras.layers.Dense(256, activation='relu')(branch2)
joint = tf.keras.layers.concatenate([branch1, branch2])
joint = tf.keras.layers.Dense(512, activation='relu')(joint)
joint = tf.keras.layers.Dropout(0.2)(joint)
predictions = tf.keras.layers.Dense(num_classes, activation='softmax')(joint)
model = tf.keras.models.Model(inputs=[arch.input, text_inputs], outputs=[predictions])
model.summary()
loss = tf.keras.losses.CategoricalCrossentropy()
lr = 5e-4
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(loss = loss,
optimizer = optimizer,
metrics = ['accuracy'])
callbacks=[]
callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True))
callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0))
model.fit_generator(data_generator('train'),validation_data = data_generator('validation'), steps_per_epoch= 240, validation_steps = 120, epochs=epochs, callbacks=callbacks, verbose=1)
下面是生成器+嵌入的代码
import json
import random
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
img_h = 320
img_w = 480
max_words = 100
embedding_dim = 40
num_classes = 13
val_split = 0.8
max_len = 25
classes = [ '0',
'1',
'10',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
'no',
'yes'
]
label_encoder = LabelEncoder()
integer_encoder_ = label_encoder.fit(classes)
integer_encoded = integer_encoder_.transform(classes)
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoder_ = onehot_encoder.fit(integer_encoded)
def data_generator(mode, batch_size = 8):
with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
data_raw = json.load(f)
f.close()
while True:
# Select files (paths/indices) for the batch
if mode == 'validation':
batch_addresses = random.sample(range(int(len(data_raw['questions'])*val_split),len(data_raw['questions'])), batch_size)
elif mode == 'train':
batch_addresses = random.sample(range(0, int(len(data_raw['questions'])*val_split)), batch_size)
else:
batch_addresses = random.sample(range(0, len(data_raw['questions'])), batch_size)
batch_input_img = []
batch_input_txt = []
batch_output = []
for i in batch_addresses:
image_name = data_raw['questions'][i]['image_filename']
img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train/' + image_name).convert('RGB')
img_array = np.array(img)
img_array = np.expand_dims(img_array, 0)
input_img = np.true_divide(img_array,255)
input_txt = data_raw['questions'][i]['question']
output = data_raw['questions'][i]['answer']
batch_input_img += [ input_img ]
batch_input_txt += [ input_txt ]
# Return a tuple of (input,output) to feed the network
batch_x_img = np.array( batch_input_img )
batch_x_txt = np.array( batch_input_txt )
batch_x_img = batch_x_img[:,-1]
tokenized = tokenizer.texts_to_sequences(batch_x_txt)
batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
batch_output += [ output ]
batch_y = np.array( batch_output )
y_c = integer_encoder_.transform(batch_y)
y_c = y_c.reshape(len(y_c), 1)
onehot_encoded = onehot_encoder_.transform(y_c)
batch_y = onehot_encoded
yield ([batch_x_img,batch_x_txt], batch_y )
def test_generator():
with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test_data.json', 'r') as f:
data_raw = json.load(f)
f.close()
i = 0
while (i<=len(data_raw['questions'])):
batch_input_img = []
batch_input_txt = []
batch_output = []
image_name = data_raw['questions'][i]['image_filename']
img = Image.open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/test/' + image_name).convert('RGB')
img_array = np.array(img)
img_array = np.expand_dims(img_array, 0)
input_img = np.true_divide(img_array,255)
input_txt = data_raw['questions'][i]['question']
output = data_raw['questions'][i]['question_id']
batch_input_img += [ input_img ]
batch_input_txt += [ input_txt ]
# Return a tuple of (input,output) to feed the network
batch_x_img = np.array( batch_input_img )
batch_x_txt = np.array( batch_input_txt )
batch_x_img = batch_x_img[:,-1]
tokenized = tokenizer.texts_to_sequences(batch_x_txt)
batch_x_txt = pad_sequences(tokenized, padding='post', maxlen=max_len)
batch_y = output
i+=1
yield ([batch_x_img,batch_x_txt], batch_y )
def create_embedding_matrix(filepath, word_index, embedding_dim):
vocab_size = len(word_index) + 1 # Adding again 1 because of reserved 0 index
embedding_matrix = np.zeros((vocab_size, embedding_dim))
with open(filepath) as f:
count = 0
for line in f:
word, *vector = line.split()
if word in word_index and count<(len(word_index)-1):
idx = word_index[word]
embedding_matrix[idx] = np.array(
vector, dtype=np.float32)[:embedding_dim]
count = count + 1
# errore perché va a splittare e trova to name.domain
return embedding_matrix
def create_tokens(tokenizer):
with open('/kaggle/input/ann-and-dl-vqa/dataset_vqa/train_data.json', 'r') as f:
data_raw = json.load(f)
f.close()
tot_txt = []
for i in range(len(data_raw['questions'])):
input_txt = data_raw['questions'][i]['question']
tot_txt += [input_txt]
tokenizer.fit_on_texts(tot_txt)
return tokenizer
tokenizer = Tokenizer(num_words=max_words,oov_token = 'OOV')
tokenizer = create_tokens(tokenizer)
#embedding_matrix = create_embedding_matrix('/kaggle/input/embedding/embedding.txt', tokenizer.word_index, embedding_dim)
import os
filepath = "../input/glove840b300dtxt/" + os.listdir("../input/glove840b300dtxt/")[0]
embedding_matrix = create_embedding_matrix(filepath, tokenizer.word_index, embedding_dim)
vocab_size = len(tokenizer.word_index) + 1
reader = data_generator('train')
PS我原以为将GlobalAveragePooling层改为平坦层可以解决这个问题,但事实并非如此
我认为这对很多人来说并不有趣,但我发现了错误。在生成器中,我将
batch_output += [ output ]
从for循环中去掉,for循环在批处理中循环,因此,由于Python的通融,我没有出错,但是批处理的所有问题都有相同的答案,这对于几乎任何问题都是错误的。这使得网络保持在与训练数据中最现有答案相对应的局部最小值。 现在,虽然它的性能不好,但它完成了它的工作,在Clevr数据集上获得了~0.5的精确度。你知道吗相关问题 更多 >
编程相关推荐