TensorFlow multi-GPU tower error at loss = tower_loss(scope) - ValueError: Variable tower_1/loss/xentropy_mean/avg/ does not exist

When I train with multiple GPUs in TensorFlow, the following error comes out:

Traceback (most recent call last):
  File "multi_gpu_train.py", line 290, in <module>
    tf.app.run()
  File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
  File "multi_gpu_train.py", line 286, in main
    train()
  File "multi_gpu_train.py", line 187, in train
    loss = tower_loss(scope)
  File "multi_gpu_train.py", line 94, in tower_loss
    loss_averages_op = loss_averages.apply(losses + [total_loss])
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/moving_averages.py", line 375, in apply
    colocate_with_primary=(var.op.type in ["Variable", "VariableV2"]))
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 174, in create_zeros_slot
    colocate_with_primary=colocate_with_primary)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 149, in create_slot_with_initializer
    dtype)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
    validate_shape=validate_shape)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
    validate_shape=validate_shape, use_resource=use_resource)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
    use_resource=use_resource)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 682, in _get_single_variable
    "VarScope?" % name)
ValueError: Variable tower_1/loss/xentropy_mean/avg/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
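
For context, the exception comes from the semantics of tf.get_variable under variable-scope reuse: once a scope has reuse=True, get_variable only looks up existing variables and raises exactly this ValueError when asked for a name that was never created. A minimal sketch of that behavior (TF 1.x; the 'demo', 'w' and 'fresh' names are purely illustrative):

import tensorflow as tf

with tf.variable_scope('demo') as vs:
    # reuse is still False here, so get_variable creates the variable.
    w = tf.get_variable('w', shape=[1])

    # Flip the scope to reuse mode, as the tower loop does after tower_0.
    vs.reuse_variables()

    # Looking up an existing variable still works...
    w_again = tf.get_variable('w', shape=[1])

    # ...but creating a *new* one now raises:
    # ValueError: Variable demo/fresh does not exist, or was not created
    # with tf.get_variable(). Did you mean to set reuse=None in VarScope?
    fresh = tf.get_variable('fresh', shape=[1])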

The main function, which calls tower_loss, is shown below:

tower_grads = []
for i in xrange(FLAGS.num_gpus):
    with tf.device('/gpu:%d' % GPU[i]):
        with tf.name_scope('%s_%d' % (TOWER_NAME, GPU[i])) as scope:
            # Calculate the loss for one tower of the CIFAR model. This
            # function constructs the entire CIFAR model but shares the
            # variables across all towers.
            loss = tower_loss(scope)
            # reuse = True

            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Retain the summaries from the final tower.
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

            # Calculate the gradients for the batch of data on this CIFAR tower.
            grads = opt.compute_gradients(loss)

            # Keep track of the gradients across all towers.
            tower_grads.append(grads)

# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
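
To make the ordering in this loop concrete: tower_0 is built while reuse is still False, then reuse_variables() flips the enclosing scope to reuse=True before tower_1 is built, so anything inside tower_loss that tries to create a fresh variable on the second iteration fails. A stripped-down sketch of that timing (tower_loss_stub is a hypothetical stand-in for tower_loss):

import tensorflow as tf

def tower_loss_stub(scope):
    # Shared model weight: created on tower_0, looked up on tower_1. This
    # is fine, because name_scope does not affect variable names.
    w = tf.get_variable('w', shape=[1])

    # ExponentialMovingAverage.apply creates its slot variables through
    # tf.get_variable, named after the averaged tensor's op name, which
    # *does* include the tower name scope (e.g. 'tower_1/Sum/avg').
    # This succeeds for tower_0 but raises the
    # "Variable tower_1/... does not exist" ValueError for tower_1.
    ema = tf.train.ExponentialMovingAverage(0.9, name='avg')
    return ema.apply([tf.reduce_sum(w)])

for i in xrange(2):
    with tf.name_scope('tower_%d' % i) as scope:
        tower_loss_stub(scope)
        tf.get_variable_scope().reuse_variables()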

The tower_loss function is shown below. The error message says the failure happens in tower_1, while tower_0 is fine. That means the first iteration of for i in xrange(FLAGS.num_gpus): succeeded, and I don't know why.

def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.

    Args:
      scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

    Returns:
      Tensor of shape [] containing the total loss for a batch of data
    """
    # Get images and labels for CIFAR-10.
    images, labels = load_train_data.input_pipeline(
        FLAGS.img_path, FLAGS.label_path, FLAGS.csv_file,
        FLAGS.batch_size, trainning=True)

    # Build inference Graph.
    vgg_net = vgg16.FCN8VGG('./../lane_seg/vgg16.npy')
    vgg_net.build(images, train=True, debug=False,
                  num_classes=load_train_data.NUM_CLASSES)

    logits = vgg_net.upscore32

    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    labels = tf.squeeze(labels, squeeze_dims=[3])
    loss_weights = [0.00588551861547, 0.500363638561, 0.493750842824]
    _ = weighted_loss(logits=logits, labels=labels,
                      num_classes=load_train_data.NUM_CLASSES,
                      head=loss_weights)
    # _ = cifar10.loss(logits, labels)

    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)

    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')

    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])

    # Attach a scalar summary to all individual losses and the total loss; do
    # the same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU
        # training session. This helps the clarity of presentation on
        # tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
        # Name each loss as '(raw)' and name the moving average version of
        # the loss as the original loss name.
        tf.summary.scalar(loss_name + ' (raw)', l)
        tf.summary.scalar(loss_name, loss_averages.average(l))

    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)

    return total_loss
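
For comparison, the official cifar10_multi_gpu_train.py example keeps the loss-averaging EMA out of tower_loss entirely: its per-tower function only sums the 'losses' collection and attaches raw summaries, so nothing inside the reused scope creates variables. A hedged sketch of tower_loss reduced to that shape, keeping this question's own loaders and weighted_loss call as-is:

import re
import tensorflow as tf

def tower_loss(scope):
    """Total loss for a single tower, without per-tower moving averages.

    The variable-creating ExponentialMovingAverage from the version above
    is removed; everything left only reads tensors or reuses variables, so
    it is safe to call under reuse=True for tower_1 and beyond.
    """
    images, labels = load_train_data.input_pipeline(
        FLAGS.img_path, FLAGS.label_path, FLAGS.csv_file,
        FLAGS.batch_size, trainning=True)

    vgg_net = vgg16.FCN8VGG('./../lane_seg/vgg16.npy')
    vgg_net.build(images, train=True, debug=False,
                  num_classes=load_train_data.NUM_CLASSES)
    logits = vgg_net.upscore32

    labels = tf.squeeze(labels, squeeze_dims=[3])
    loss_weights = [0.00588551861547, 0.500363638561, 0.493750842824]
    _ = weighted_loss(logits=logits, labels=labels,
                      num_classes=load_train_data.NUM_CLASSES,
                      head=loss_weights)

    losses = tf.get_collection('losses', scope)
    total_loss = tf.add_n(losses, name='total_loss')

    # Summaries on the raw tensors only; no EMA slot variables are created.
    for l in losses + [total_loss]:
        loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
        tf.summary.scalar(loss_name, l)

    return total_loss

If the smoothed loss curves are still wanted, the moving average can be applied once, outside the tower loop, where variable creation is not constrained by reuse.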
