2017-07-18

TensorFlow multi-GPU performance is poor

We tried to implement the tower method from https://github.com/tensorflow/models/tree/master/inception, but found that performance got worse.

  • Device:

    • Intel

      1. Core i7

      2. GTX-1060 x 2

  • Source code:

    • Splitting = False: default version

    • Splitting = True: tower version

    from tensorflow.python.ops import tensor_array_ops
    from tensorflow.python.client import device_lib
    import tensorflow as tf
    import tflib as lib
    import numpy as np
    import time

    BATCH = 64
    DIM = 1000
    GPUs = 2

    Splitting = True

    def init_matrix(shape):
        return tf.random_normal(shape, stddev=0.1)

    # One fully connected DIM x DIM layer; weights are created (or reused) via tf.get_variable.
    def Block(param, x, name, reuse):
        W = tf.get_variable('%sweight' % name, [DIM, DIM])
        b = tf.get_variable('%sbias' % name, [DIM])
        if not reuse: param.extend([W, b])

        x_ = tf.reshape(x, [-1, DIM])
        output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
        return tf.reshape(output, [-1, DIM, DIM])

    # Forward pass for one tower: six Block layers, reduced to a scalar loss.
    def _tower_loss(param, inputs, reuse=None):
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            output = Block(param, inputs, 'Layer.0.', reuse)
            output = Block(param, output, 'Layer.1.', reuse)
            output = Block(param, output, 'Layer.2.', reuse)
            output = Block(param, output, 'Layer.3.', reuse)
            output = Block(param, output, 'Layer.4.', reuse)
            output = Block(param, output, 'Layer.5.', reuse)
            output = tf.reshape(output, [-1, DIM*DIM])
            return tf.reduce_mean(output)

    # Sum the gradients from all towers, variable by variable.
    def _all_gradients(tower_grads):
        all_grads = []
        for i in range(len(tower_grads[0])):
            grads = []
            for grad in tower_grads:
                expanded_g = tf.expand_dims(grad[i], 0)
                grads.append(expanded_g)
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_sum(grad, 0)
            all_grads.append(grad)
        return all_grads

    if not Splitting:
        # Default version: a single graph with no explicit device placement.
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])

        param = []
        loss = _tower_loss(param, inputs, None)
        grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
        apply_gradient_op = opt.apply_gradients(zip(grad, param))
        merged = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)

            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))

    else:
        # Tower version: build the graph on the CPU and split the batch across the GPUs.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)

            inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
            inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)

            param = []
            tower_grads = []
            reuse = None
            for i in range(GPUs):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('Tower_%d' % i) as scope:
                        with tf.device('/cpu:0'):
                            loss = _tower_loss(param, inputs_splits[i], reuse)
                            reuse = True
                            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                            tower_grads.append(grad)
            grads = _all_gradients(tower_grads)
            apply_gradient_op = opt.apply_gradients(zip(grads, param))
            merged = tf.summary.merge_all()

            with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
                session.run(tf.global_variables_initializer())
                writer = tf.summary.FileWriter(".", session.graph)
                for i in range(100):
                    start = time.time()
                    session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                    print('Iter' + str(i) + ': time=' + str(time.time() - start))

  • Performance:

    • Default version - uses only GPU:0

      time = 0.867873907089

    • Tower version - tries to use multiple GPUs

      time = 4.88468384743

  • Our questions are:

    1. The tower method is about 5x slower. Is there anything wrong with our implementation?

    2. Following the tutorial, we keep the model on the CPU and dispatch the work to the different GPUs. However, our GPUs are connected over PCIe rather than NVLink, so frequent data transfers are expensive. Is there any other approach that can help multi-GPU training over PCIe?

    Thank you.

    Answer
    for i in range(GPUs): 
        with tf.device('/gpu:%d' % i): 
            with tf.name_scope('Tower_%d' % i) as scope: 
                with tf.device('/cpu:0'):  ### this line may cause all ops to be allocated on the CPU; try removing it
                    loss = _tower_loss(param, inputs_splits[i], reuse) 
                    reuse = True 
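    For reference, a minimal sketch (not from the original thread) of the tower loop with the inner device pin removed, as the answer suggests; the rest of the surrounding graph-construction code is assumed unchanged:

    # Sketch: same loop as above, but without the inner tf.device('/cpu:0'),
    # so the ops built for Tower_i are no longer forced onto the CPU.
    for i in range(GPUs):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('Tower_%d' % i) as scope:
                loss = _tower_loss(param, inputs_splits[i], reuse)
                reuse = True
                grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                tower_grads.append(grad)

    With log_device_placement=True, the Tower_%d ops should then show up on /gpu:0 and /gpu:1 in the placement log.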

    Removing `with tf.device('/cpu:0')` indeed improves things! Thank you. But each iteration still takes about 0.908358812332 seconds, similar to the single-GPU result. Do you have any idea why? –


    Try using tfprof to profile your program. –
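    As a rough illustration of the tfprof suggestion (not from the original thread; it assumes a TF 1.x release that ships tf.profiler, roughly 1.3 or later), tracing a single training step and printing per-op timings might look like the sketch below, reusing session, apply_gradient_op, inputs, BATCH and DIM from the code above:

    # Hedged sketch: trace one step and dump per-op time/memory with tf.profiler.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    session.run(apply_gradient_op,
                options=run_options,
                run_metadata=run_metadata,
                feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
    # cmd='op' groups the statistics by op type; the report also helps confirm
    # which device each op actually ran on.
    tf.profiler.profile(session.graph,
                        run_meta=run_metadata,
                        cmd='op',
                        options=tf.profiler.ProfileOptionBuilder.time_and_memory())

    The profile output makes it easier to see whether the two towers really run in parallel, or whether the step time is dominated by gradient aggregation and CPU-GPU transfers over PCIe.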