0
我們試圖實現塔方法卻發現性能變差: https://github.com/tensorflow/models/tree/master/inceptionTensorflow多GPU性能不好
設備:
- 英特爾
從修改Core i7
- GTX-1060 x 2
的源代碼:
拆分=無:默認版本
拆分= TRUE:塔式
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.client import device_lib
import tensorflow as tf
import tflib as lib
import numpy as np
import time
BATCH = 64
DIM = 1000
GPUs = 2
Splitting = True
def init_matrix(shape):
return tf.random_normal(shape, stddev=0.1)
def Block(param, x, name, reuse):
W = tf.get_variable('%sweight'%name, [DIM, DIM])
b = tf.get_variable('%sbias'%name, [DIM])
if not reuse: param.extend([W, b])
x_ = tf.reshape(x, [-1,DIM])
output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
return tf.reshape(output,[-1,DIM,DIM])
def _tower_loss(param, inputs, reuse=None):
with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
output = Block(param, inputs, 'Layer.0.', reuse)
output = Block(param, output, 'Layer.1.', reuse)
output = Block(param, output, 'Layer.2.', reuse)
output = Block(param, output, 'Layer.3.', reuse)
output = Block(param, output, 'Layer.4.', reuse)
output = Block(param, output, 'Layer.5.', reuse)
output = tf.reshape(output, [-1, DIM*DIM])
return tf.reduce_mean(output)
def _all_gradients(tower_grads):
all_grads = []
for i in range(len(tower_grads[0])):
for grad in tower_grads:
grads = []
expanded_g = tf.expand_dims(grad[i], 0)
grads.append(expanded_g)
grad = tf.concat(axis=0, values=grads)
grad = tf.reduce_sum(grad,0)
all_grads.append(grad)
return all_grads
if not Splitting:
opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
inputs = tf.placeholder(tf.float32, shape=[BATCH,DIM,DIM])
param = []
loss = _tower_loss(param, inputs, None)
grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
apply_gradient_op = opt.apply_gradients(zip(grad, param))
merged = tf.summary.merge_all()
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
session.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(".", session.graph)
for i in range(100):
start = time.time()
session.run(apply_gradient_op,feed_dict={inputs:np.zeros([BATCH,DIM,DIM])})
print 'Iter'+str(i)+': time='+str(time.time()-start)
else:
with tf.Graph().as_default(), tf.device('/cpu:0'):
opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
inputs = tf.placeholder(tf.float32, shape=[BATCH,DIM,DIM])
inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)
param = []
tower_grads = []
reuse = None
for i in range(GPUs):
with tf.device('/gpu:%d'%i):
with tf.name_scope('Tower_%d'%i) as scope:
with tf.device('/cpu:0'):
loss = _tower_loss(param, inputs_splits[i], reuse)
reuse = True
grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
tower_grads.append(grad)
grads = _all_gradients(tower_grads)
apply_gradient_op = opt.apply_gradients(zip(grads, param))
merged = tf.summary.merge_all()
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
session.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(".", session.graph)
for i in range(100):
start = time.time()
session.run(apply_gradient_op,feed_dict={inputs:np.zeros([BATCH,DIM,DIM])})
print 'Iter'+str(i)+': time='+str(time.time()-start)
性能:
默認版本 - 只使用GPU:0
時間= 0.867873907089
塔版本 - 嘗試使用multi-GPU
time = 4.88468384743
它顯示了5時間慢與塔方法。我們的實施有什麼問題嗎?
基於本教程,我們將模型保存在CPU中,並將任務分解到不同的GPU。但是我們的GPU通過PCIe相互連接,而不是NVLink。頻繁的數據傳輸成本很高。有沒有其他辦法可以幫助基於PCIe的多GPU?
我們的問題是:
謝謝。
刪除'with tf.device('/ cpu:0')'確實改善了!謝謝。但是它仍然需要花費大約0.908358812332的時間執行,與單GPU結果類似。你有這個想法嗎? –
嘗試使用tfprof來分析您的程序。 –