用CNTK实现CRNN的问题

# CRNN (CNN + bidirectional LSTM) text-recognition model built with CNTK and
# trained with a forward-backward (CTC-style) criterion.
# NOTE(review): `C` is presumably `import cntk as C`, and `custom_datareader`
# is defined elsewhere -- neither is shown in this snippet.

# Characters the model can emit.
alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
input_dim_model = (1, 32, 96) # images are 96 x 32 with 1 channel of color (gray)
# One class per alphabet character plus one extra class
# (presumably the blank token, see blankTokenId=0 below -- TODO confirm).
num_output_classes = len(alphabet) + 1
# Hidden size handed to every LSTM created by bidirectionalLTSM.
ltsm_hidden = 256


def bidirectionalLTSM(features, nHidden, nOut):
    """Apply a forward and a backward LSTM recurrence and project the result.

    Args:
        features: input the two recurrences are applied to.
        nHidden: size passed to each ``C.layers.LSTM``.
        nOut: output size of the final ``C.layers.Dense`` layer.

    Returns:
        The Dense projection of the spliced forward/backward outputs.
    """
    a = C.layers.Recurrence(C.layers.LSTM(nHidden))(features)
    b = C.layers.Recurrence(C.layers.LSTM(nHidden), go_backwards=True)(features)
    # Join both directions before the projection.
    c = C.splice(a, b)
    r = C.layers.Dense(nOut)(c)
    return r


def create_model_rnn(features):
    """Stack two bidirectional LSTM blocks on top of ``features``.

    The first block outputs ``ltsm_hidden`` values, the second outputs
    ``num_output_classes`` values.
    """
    h = features
    h = bidirectionalLTSM(h, ltsm_hidden, ltsm_hidden)
    h = bidirectionalLTSM(h, ltsm_hidden, num_output_classes)
    return h


def create_model_cnn(features):
    """Build the convolutional part of the model.

    Only the first convolution and the last batch normalization are shown;
    the layers in between were omitted by the author.
    """
    with C.layers.default_options(init=C.glorot_uniform(), activation=C.relu):
        h = features
        h = C.layers.Convolution2D(filter_shape=(3,3), num_filters=64, strides=(1,1), pad=True, name='conv_0')(h)
        #more layers...
        h = C.layers.BatchNormalization(name="batchnorm_6")(h)
        return h


# Network inputs: a fixed-size image and a sequence of label vectors.
x = C.input_variable(input_dim_model, name="x")
label = C.sequence.input((num_output_classes), name="y")


def create_model(features):
    """Chain the CNN, the conversion to a sequence and the RNN.

    Reads the module-level ``label`` input to turn the CNN output into a
    sequence via ``C.to_sequence_like``. The ``#Composite(...)`` comments
    below are the signatures the author reported for each step.
    """
    #Composite(x: Tensor[1,32,96]) -> Tensor[512,1,23]
    a = create_model_cnn(features)
    a = C.reshape(a, (512, 23))
    #Composite(x: Tensor[1,32,96]) -> Tensor[23,512]
    a = C.swapaxes(a, 0, 1)
    #is there a better way to convert to sequence and still be compatible with forward_backwards() ?
    #Composite(x: Tensor[1,32,96], y: Sequence[Tensor[37]]) -> Sequence[Tensor[512]]
    a = C.to_sequence_like(a, label)
    #Composite(x: Tensor[1,32,96], y: Sequence[Tensor[37]]) -> Sequence[Tensor[37]]
    a = create_model_rnn(a)
    return a


#Composite(x: Tensor[1,32,96], y: Sequence[Tensor[37]]) -> Sequence[Tensor[37]]
z = create_model(x)
#LabelsToGraph(y: Sequence[Tensor[37]]) -> Sequence[Tensor[37]]
# NOTE(review): `graph` is not used in the code shown; the criterion below
# calls C.labels_to_graph(label) again.
graph = C.labels_to_graph(label)
#Composite(y: Sequence[Tensor[37]], x: Tensor[1,32,96]) -> np.float32
criteria = C.forward_backward(C.labels_to_graph(label), z, blankTokenId=0)
# Class 0 is ignored by the error metric, matching blankTokenId=0 above.
err = C.edit_distance_error(z, label, squashInputs=True, tokensToIgnore=[0])
lr = C.learning_rate_schedule(0.01, C.UnitType.sample)
learner = C.adadelta(z.parameters, lr)
progress_printer = C.logging.progress_print.ProgressPrinter(50, first=10, tag='Training')
trainer = C.Trainer(z, (criteria, err), learner, progress_writers=[progress_printer])
#some more custom code ...
#below is how I'm feeding the data
# The loop has no exit condition in the code shown.
while True:
    x1, y1 = custom_datareader.next_minibatch()
    #x1 is a list of numpy arrays containing training images
    #y1 is a list of numpy arrays with one hot encoded labels
    trainer.train_minibatch({x: x1, label: y1})

2条回答

网友

1楼 · 编辑于 2024-05-16 09:31:05

在CNTK中，有很多因素使得训练CRNN模型变得困难（正确的标签格式很难掌握、需要把标签整体转换为标签图、没有现成的转录错误度量等）。下面是一个可以正常工作的模型实现：

https://github.com/BenjaminTrapani/SceneTextOCR/tree/master

它依靠CNTK的一个分支来修复图像读取器的错误，提供一个转录错误函数，并提高文本格式读取器的性能。它还提供了一个从mjsynth数据集生成文本格式标签的应用程序。以下是如何设置标签格式以供参考：

513528 |textLabel 7:2
513528 |textLabel 26:1
513528 |textLabel 0:2
513528 |textLabel 26:1
513528 |textLabel 20:2
513528 |textLabel 26:1
513528 |textLabel 11:2
513528 |textLabel 26:1
513528 |textLabel 8:2
513528 |textLabel 26:1
513528 |textLabel 4:2
513528 |textLabel 26:1
513528 |textLabel 17:2
513528 |textLabel 26:1
513528 |textLabel 18:2
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1
513528 |textLabel 26:1

513528是序列ID，应该与同一样本对应的图像数据序列ID匹配。textLabel用于为minibatch源创建流。在C++中创建如下流：

（原文此处的C++代码片段在页面抓取时丢失，未能保留。）

`26`是CTC解码所用空白字符的索引。“:”前面的其他值是标签的字符代码。“:”后面的1用于对序列中的每个向量进行one-hot编码。末尾有一串空白字符，用来把序列补齐到所支持的最大序列长度，因为在撰写本文时，CTC损失函数的实现还不支持可变长度序列。

网友

2楼 · 编辑于 2024-05-16 09:31:05

CNTK的学习器（learner）默认使用聚合梯度，以适应不同小批量规模下的分布式训练。然而，聚合梯度对于adadelta这类adagrad风格的学习器效果并不好。请尝试设置use_mean_gradient=True：

learner = C.adadelta(z.parameters, lr, use_mean_gradient=True)

相关问题更多 >

编程相关推荐

热门问题

热门文章