2017-07-18

TensorFlow multi-GPU performance is poor

We tried to implement the tower method from https://github.com/tensorflow/models/tree/master/inception, but found that performance got worse.

  • Device:

    • Intel

      1. Core i7

      2. GTX-1060 x 2

  • Source code:

    • Splitting = False: default version

    • Splitting = True: tower version

    from tensorflow.python.ops import tensor_array_ops
    from tensorflow.python.client import device_lib
    import tensorflow as tf
    import tflib as lib
    import numpy as np
    import time

    BATCH = 64
    DIM = 1000
    GPUs = 2

    Splitting = True

    def init_matrix(shape):
        return tf.random_normal(shape, stddev=0.1)

    # One fully connected DIM x DIM layer; weights are created (or reused) via tf.get_variable.
    def Block(param, x, name, reuse):
        W = tf.get_variable('%sweight' % name, [DIM, DIM])
        b = tf.get_variable('%sbias' % name, [DIM])
        if not reuse: param.extend([W, b])

        x_ = tf.reshape(x, [-1, DIM])
        output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
        return tf.reshape(output, [-1, DIM, DIM])

    # Forward pass for one tower: six Block layers, reduced to a scalar loss.
    def _tower_loss(param, inputs, reuse=None):
        with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
            output = Block(param, inputs, 'Layer.0.', reuse)
            output = Block(param, output, 'Layer.1.', reuse)
            output = Block(param, output, 'Layer.2.', reuse)
            output = Block(param, output, 'Layer.3.', reuse)
            output = Block(param, output, 'Layer.4.', reuse)
            output = Block(param, output, 'Layer.5.', reuse)
            output = tf.reshape(output, [-1, DIM*DIM])
            return tf.reduce_mean(output)

    # Sum the gradients from all towers, variable by variable.
    def _all_gradients(tower_grads):
        all_grads = []
        for i in range(len(tower_grads[0])):
            grads = []
            for grad in tower_grads:
                expanded_g = tf.expand_dims(grad[i], 0)
                grads.append(expanded_g)
            grad = tf.concat(axis=0, values=grads)
            grad = tf.reduce_sum(grad, 0)
            all_grads.append(grad)
        return all_grads

    if not Splitting:
        # Default version: a single graph with no explicit device placement.
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])

        param = []
        loss = _tower_loss(param, inputs, None)
        grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
        apply_gradient_op = opt.apply_gradients(zip(grad, param))
        merged = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)

            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))

    else:
        # Tower version: build the graph on the CPU and split the batch across the GPUs.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)

            inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
            inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)

            param = []
            tower_grads = []
            reuse = None
            for i in range(GPUs):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('Tower_%d' % i) as scope:
                        with tf.device('/cpu:0'):
                            loss = _tower_loss(param, inputs_splits[i], reuse)
                            reuse = True
                            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                            tower_grads.append(grad)
            grads = _all_gradients(tower_grads)
            apply_gradient_op = opt.apply_gradients(zip(grads, param))
            merged = tf.summary.merge_all()

            with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
                session.run(tf.global_variables_initializer())
                writer = tf.summary.FileWriter(".", session.graph)
                for i in range(100):
                    start = time.time()
                    session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                    print('Iter' + str(i) + ': time=' + str(time.time() - start))

  • Performance:

    • Default version - uses only GPU:0

      time = 0.867873907089

    • Tower version - tries to use multiple GPUs

      time = 4.88468384743

  • Our questions are:

    1. The tower method is about 5x slower. Is there anything wrong with our implementation?

    2. Following the tutorial, we keep the model on the CPU and dispatch the work to the different GPUs. However, our GPUs are connected over PCIe rather than NVLink, so frequent data transfers are expensive. Is there any other approach that can help multi-GPU training over PCIe?

    Thank you.

    Answer
    for i in range(GPUs): 
        with tf.device('/gpu:%d' % i): 
            with tf.name_scope('Tower_%d' % i) as scope: 
                with tf.device('/cpu:0'):  ### this line may cause all ops to be allocated on the CPU; try removing it
                    loss = _tower_loss(param, inputs_splits[i], reuse) 
                    reuse = True 
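    For reference, a minimal sketch (not from the original thread) of the tower loop with the inner device pin removed, as the answer suggests; the rest of the surrounding graph-construction code is assumed unchanged:

    # Sketch: same loop as above, but without the inner tf.device('/cpu:0'),
    # so the ops built for Tower_i are no longer forced onto the CPU.
    for i in range(GPUs):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('Tower_%d' % i) as scope:
                loss = _tower_loss(param, inputs_splits[i], reuse)
                reuse = True
                grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                tower_grads.append(grad)

    With log_device_placement=True, the Tower_%d ops should then show up on /gpu:0 and /gpu:1 in the placement log.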

    Removing `with tf.device('/cpu:0')` indeed improves things! Thank you. But each iteration still takes about 0.908358812332 seconds, similar to the single-GPU result. Do you have any idea why? –


    Try using tfprof to profile your program. –
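    As a rough illustration of the tfprof suggestion (not from the original thread; it assumes a TF 1.x release that ships tf.profiler, roughly 1.3 or later), tracing a single training step and printing per-op timings might look like the sketch below, reusing session, apply_gradient_op, inputs, BATCH and DIM from the code above:

    # Hedged sketch: trace one step and dump per-op time/memory with tf.profiler.
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    session.run(apply_gradient_op,
                options=run_options,
                run_metadata=run_metadata,
                feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
    # cmd='op' groups the statistics by op type; the report also helps confirm
    # which device each op actually ran on.
    tf.profiler.profile(session.graph,
                        run_meta=run_metadata,
                        cmd='op',
                        options=tf.profiler.ProfileOptionBuilder.time_and_memory())

    The profile output makes it easier to see whether the two towers really run in parallel, or whether the step time is dominated by gradient aggregation and CPU-GPU transfers over PCIe.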