2

我正在嘗試創建一個端到端的可訓練離線英文手寫識別模型(無需分割個別字符)。我正在使用IAM手寫數據庫中的單詞數據集進行培訓。TensorFlow:訓練時CTC損失沒有減少BLSTM

我試圖降低學習率,增加批量大小等,但造成的損失不斷沒有/顯著整體下降波動 - TensorBoard visualization for cost at each step

我是新來TensorFlow所以會犯一些幼稚的錯誤。使用的代碼:

class CRNN(object): 

def __init__(self, config): 

    self.config = config 
    tf.reset_default_graph() 

def read_and_decode(self, filename_queue): 

    reader = tf.TFRecordReader() 

    _, serialized_example = reader.read(filename_queue) 

    # Define how to parse the example 
    context_features = { 
     'length': tf.FixedLenFeature([], dtype=tf.int64), 
     'out_length': tf.FixedLenFeature([], dtype=tf.int64) 
    } 
    sequence_features = { 
     'token': tf.FixedLenSequenceFeature([], dtype=tf.float32), 
     'labels': tf.FixedLenSequenceFeature([], dtype=tf.int64) 
    } 

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
     serialized=serialized_example, 
     context_features=context_features, 
     sequence_features=sequence_features) 

    image = sequence_parsed['token'] 
    label = tf.cast(sequence_parsed['labels'], tf.int32) 
    length = tf.cast(context_parsed['length'], tf.int32) 
    lab_length = tf.cast(context_parsed['out_length'], tf.int32) 

    image_shape = tf.cast(tf.stack([self.config.im_height, 
            length/self.config.im_height]), tf.int32) 
    image = tf.reshape(image, image_shape) 

    # Updating length to represent image width 
    length = tf.shape(image)[1] 

    # Batch the variable length tensor with dynamic padding 
    self.images, self.labels, self.lengths, self.lab_lengths = tf.train.batch(
     tensors=[image, label, length, lab_length], 
     batch_size=self.config.batch_size, dynamic_pad=True) 

def net(self): 


    batch_lab_length = tf.reduce_max(self.lab_lengths) 
    batch_im_length = tf.reduce_max(self.lengths) 

    # Reshape to time major 
    sequences = tf.reshape(self.images, [batch_im_length, self.config.batch_size, 
              self.config.im_height]) 

    # Feed sequences into RNN 
    with tf.name_scope('RNN'): 
     self.cell_fw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden, 
             state_is_tuple=True) 
     self.cell_bw = tf.nn.rnn_cell.LSTMCell(num_units=self.config.rnn_num_hidden, 
             state_is_tuple=True) 
     self.output, self.state = tf.nn.bidirectional_dynamic_rnn(
      cell_fw=self.cell_fw, 
      cell_bw=self.cell_bw, 
      inputs=sequences, 
      dtype=tf.float32, 
      sequence_length=self.lengths, 
      time_major=True, 
      scope='RNN' 
     ) 

     # Reshaping to apply the same weights over the timesteps 
     self.output = tf.reshape(self.output, [-1, self.config.rnn_num_hidden]) 

     self.out_W = tf.Variable(tf.truncated_normal([self.config.rnn_num_hidden, 
               self.config.num_classes], 
               stddev=0.1), name='out_W') 
     self.out_b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]), name='out_b') 

     # Doing the affine projection 
     logits = tf.matmul(self.output, self.out_W) + self.out_b 

    # Reshaping back to the original shape 
    logits = tf.reshape(logits, [self.config.batch_size, -1, self.config.num_classes]) 

    # Time major 
    logits = tf.transpose(logits, (1, 0, 2)) 

    # Training computation 

    # Prepare sparse tensor for CTC loss 
    labs = tf.reshape(self.labels, (self.config.batch_size, batch_lab_length)) 
    sparse_tensor_indices = tf.where(tf.less(tf.cast(0, tf.int32), labs)) 

    labels_vals = tf.reshape(self.labels, [batch_lab_length*self.config.batch_size]) 
    mask = tf.cast(tf.sign(labels_vals), dtype=tf.bool) 
    labels_vals = tf.boolean_mask(labels_vals,mask) 

    labels_sparse = tf.SparseTensor(indices=sparse_tensor_indices, values=labels_vals, 
            dense_shape=[self.config.batch_size, 
               tf.cast(batch_lab_length, tf.int64)]) 
    self.loss = tf.nn.ctc_loss(labels_sparse, logits, sequence_length=self.lab_lengths, 
          preprocess_collapse_repeated=False, ctc_merge_repeated=False, 
          time_major=True) 
    self.cost = tf.reduce_mean(self.loss) 

    # Optimizer 
    self.optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, 
              momentum=0.9, use_nesterov=True).minimize(self.cost) 

    # Predictions for the training, validation, and test data. 
    self.train_prediction = tf.nn.ctc_beam_search_decoder(logits, 
               sequence_length=self.lab_lengths) 


def train(self): 
    num_steps = int((self.config.num_epochs*self.config.sample_size)/self.config.batch_size) 
    tf.reset_default_graph() 

    filename_queue = tf.train.string_input_producer(
        [self.config.tfrecord_filename], num_epochs=self.config.num_epochs) 

    self.read_and_decode(filename_queue) 
    self.net() 

    # The op for initializing the variables. 
    init_op = tf.group(tf.global_variables_initializer(), 
         tf.local_variables_initializer()) 
    saver = tf.train.Saver() 

    with tf.Session() as sess: 

     training_summary = tf.summary.scalar("training_cost", self.cost) 
     writer = tf.summary.FileWriter("./TensorBoard/graph", sess.graph) 

     sess.run(init_op) 
     print('Initialized') 
     coord = tf.train.Coordinator() 
     threads = tf.train.start_queue_runners(coord=coord) 

     start = time.time() 
     steps_time = start 

     epoch = 1 
     for step in range(num_steps): 
      _, c, predictions, actual_labels, train_summ = sess.run([self.optimizer, self.cost, 
                    self.train_prediction, 
                    self.labels, training_summary]) 
      writer.add_summary(train_summ, step) 


      if (step % 10000 == 0): 
       preds = np.zeros((predictions[0][0].dense_shape)) 
       i = 0 
       for idx in predictions[0][0].indices: 
        preds[idx[0]][idx[1]] = predictions[0][0].values[i] 
        i+=1 
       print(time.time() - steps_time) 
       steps_time = time.time() 
       print('Minibatch cost at step %d: %f' % (step, c)) 
       print('Label =', [''.join([char_map_inv[j] for j in i]) for i in actual_labels], 
         'Prediction =', [''.join([char_map_inv[j] for j in i]) for i in preds]) 

      if (step!=0 and step % int(self.config.sample_size/self.config.batch_size) == 0): 
       print('Epoch', epoch, 'Completed') 
       epoch+=1 

      last_step = step 
     saver.save(sess, "model_BLSTM", global_step=last_step) 
     writer.close() 
     print(time.time() - start) 
+0

嘗試設置非常低的學習率(例如1e-6)和動量。您還可以繪製您獲得的漸變色,並且如果看到較大的大小,請嘗試「漸變剪裁」。如果這沒有幫助,您可以嘗試添加tf.add_check_numerics_ops以確保您沒有Inf/NaNs。最後,嘗試從模型中刪除節點,然後重新運行培訓以創建一個可重現問題的小例子(或找到罪魁禍首)。在任何情況下,請詳細說明您嘗試的內容(例如,learning_rate的值)。 – iga

回答

1

的問題是,你是在LSTM送原圖像,所以它是非常困難的它來提取任何有用的信息。 CRNN論文首先使用一系列卷積層從圖像中提取特徵,然後將這些特徵輸入到LSTM中。