
I am training my 3-layer (3x3, 3x3 and 5x5 kernels) deep learning model on a machine with two Titan Black GPUs. How can I use multiple GPUs effectively when training a deep network?

Training runs perfectly well, but when I watch nvidia-smi (refreshed every second) I realized that my program uses only one GPU for the computation: the first one reaches 100% while the second one is always at 0%.

I tried using tf.device to assign specific tasks to each of them, but then the tasks run one after another rather than in parallel, and the total time even increased instead of decreasing (I guess because the two GPUs have to exchange values with each other).

Below is my program. It is quite messy; maybe it is enough to just look at the places in the graph where I use tf.device...

Thank you very much!

import tensorflow as tf 
import numpy as np 
from six.moves import cPickle as pickle 
import matplotlib.pyplot as plt 

from os import listdir, sys 
from os.path import isfile, join 

from time import gmtime, strftime 
import time 

def validatePath(path): 
    path = path.replace("\\","/") 
    if (path[len(path)-1] != "/"): 
     path = path + "/" 
    return path 

hidden_size_default = np.array([16, 32, 64, 32]) 
cnn1_default = 3 
cnn2_default = 3 
cnn3_default = 5 

SIZE_BATCH_VALID = 200 

input_path = 'ARCHIVES-sub-dataset' 

output_path = 'ARCHIVES-model' 

log_address = "trainlog.txt" 

tf.app.flags.DEFINE_integer('h0', hidden_size_default[0], 'Size of the 0th hidden layer') 
tf.app.flags.DEFINE_integer('h1', hidden_size_default[1], 'Size of the 1st hidden layer') 
tf.app.flags.DEFINE_integer('h2', hidden_size_default[2], 'Size of the 2nd hidden layer') 
tf.app.flags.DEFINE_integer('h3', hidden_size_default[3], 'Size of the 3rd hidden layer') 

tf.app.flags.DEFINE_integer('k1', cnn1_default , 'Size of the 1st kernel') 
tf.app.flags.DEFINE_integer('k2', cnn2_default , 'Size of the 2nd kernel') 
tf.app.flags.DEFINE_integer('k3', cnn3_default , 'Size of the 3rd kernel') 

tf.app.flags.DEFINE_string('input_path', input_path, 'The parent directory which contains 2 directories: dataset and label') 
tf.app.flags.DEFINE_string('output_path', output_path, 'The directory which will store models (you have to create)') 
tf.app.flags.DEFINE_string('log_address', log_address, 'The file name which will store the log') 

FLAGS = tf.app.flags.FLAGS 

load_path = FLAGS.input_path 
save_model_path = FLAGS.output_path 

log_addr = FLAGS.log_address 

load_path = validatePath(load_path) 
save_model_path = validatePath(save_model_path) 

cnn1 = FLAGS.k1 
cnn2 = FLAGS.k2 
cnn3 = FLAGS.k3 

hidden_size = np.array([FLAGS.h0, FLAGS.h1, FLAGS.h2, FLAGS.h3]) 

# Shuffle the dataset and its label 
def randomize(dataset, labels): 
    permutation = np.random.permutation(labels.shape[0]) 
    shuffled_dataset = dataset[permutation,:] 
    shuffled_labels = labels[permutation] 
    return shuffled_dataset, shuffled_labels 


def writemyfile(mystring): 
    with open(log_addr, "a") as myfile: 
     myfile.write(str(mystring + "\n")) 

num_labels = 5 

def accuracy(predictions, labels): 
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))/ predictions.shape[0]) 

def weight_variable(shape): 
    initial = tf.truncated_normal(shape, stddev=0.1) 
    return tf.Variable(initial) 
def bias_variable(shape): 
    initial = tf.constant(0.1, shape=shape) 
    return tf.Variable(initial) 
def conv2d(x, W): 
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 
def max_pool_2x2(x): 
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') 

def DivideSets(input_set): 
    length_set = input_set.shape[0] 
    index_70 = int(length_set*0.7) 
    index_90 = int(length_set*0.9) 

    set_train = input_set[0:index_70] 
    set_valid = input_set[index_70:index_90] 
    set_test = input_set[index_90:length_set] 
    return np.float32(set_train), np.float32(set_valid), np.float32(set_test) 

# from 1-value labels to 5 values of (0 and 1) 
def LabelReconstruct(label_set): 
    label_set = label_set.astype(int) 
    new_label_set = np.zeros(shape=(len(label_set),num_labels)) 
    for i in range(len(label_set)): 
     new_label_set[i][label_set[i]] = 1 
    return new_label_set.astype(int) 

def LoadDataSet(load_path): 
    list_data = [f for f in listdir(load_path + "dataset/") if isfile(join(load_path + "dataset/", f))] 
    list_label = [f for f in listdir(load_path + "label/") if isfile(join(load_path + "label/", f))] 
    if sorted(list_data) == sorted(list_label): 
     return list_data 
    else: 
     print("data and label files do not match") 
     return 0 

# load, randomize, normalize images and reconstruct labels 
def PrepareData(*arg): 
    filename = arg[0] 
    loaded_dataset = pickle.load(open(load_path + "dataset/" + filename, "rb")) 
    loaded_labels = pickle.load(open(load_path + "label/" + filename, "rb")) 
    if len(arg) == 1: 
     datasize = len(loaded_labels) 
    elif len(arg) == 2: 
     datasize = int(arg[1]) 
    else: 
     print("not more than 2 arguments please!") 
    dataset_full,labels_full = randomize(loaded_dataset[0:datasize], loaded_labels[0:datasize]) 
    return NormalizeData(dataset_full), LabelReconstruct(labels_full) 

def NormalizeData(dataset): 
    dataset = dataset - (dataset.mean()) 
    dataset = dataset/(dataset.std()) 
    return dataset 

### LOAD DATA 
listfiles = LoadDataSet(load_path) 

# divide 
listfiles_train = listfiles[0:15] 
listfiles_valid = listfiles[15:25] 
listfiles_test = listfiles[25:len(listfiles)] 


graphCNN = tf.Graph() 

with graphCNN.as_default(): 
    with tf.device('/gpu:0'): 

     x = tf.placeholder(tf.float32, shape=(None, 224,224,3)) # X 
     y_ = tf.placeholder(tf.float32, shape=(None, num_labels)) # Y_ 

     dropout = tf.placeholder(tf.float32) 
     # dropout is a tensor, so a Python-level `if dropout == 1.0` would pick one 
     # branch once at graph-construction time; tf.cond chooses keep_prob at run time 
     keep_prob = tf.cond(tf.equal(dropout, 1.0), 
      lambda: tf.constant([0.2, 0.3, 0.5], dtype=tf.float32), 
      lambda: tf.constant([1.0, 1.0, 1.0], dtype=tf.float32)) 

     weights_1 = weight_variable([cnn1,cnn1,3, hidden_size[0]]) 
     biases_1 = bias_variable([hidden_size[0]]) 
     weights_2 = weight_variable([cnn2,cnn2,hidden_size[0], hidden_size[1]]) 
     biases_2 = bias_variable([hidden_size[1]]) 
     weights_3 = weight_variable([cnn3,cnn3,hidden_size[1], hidden_size[2]]) 
     biases_3 = bias_variable([hidden_size[2]]) 
     weights_4 = weight_variable([56 * 56 * hidden_size[2], hidden_size[3]]) 
     biases_4 = bias_variable([hidden_size[3]]) 
     weights_5 = weight_variable([hidden_size[3], num_labels]) 
     biases_5 = bias_variable([num_labels]) 

     def model(data): 
      with tf.device('/gpu:1'): 
       train_hidden_1 = tf.nn.relu(conv2d(data, weights_1) + biases_1) 
       train_hidden_2 = max_pool_2x2(tf.nn.relu(conv2d(train_hidden_1, weights_2) + biases_2)) 
       train_hidden_2_drop = tf.nn.dropout(train_hidden_2, keep_prob[0]) 

       train_hidden_3 = max_pool_2x2(tf.nn.relu(conv2d(train_hidden_2_drop, weights_3) + biases_3)) 
       train_hidden_3_drop = tf.nn.dropout(train_hidden_3, keep_prob[1]) 
       train_hidden_3_drop = tf.reshape(train_hidden_3_drop,[-1, 56 * 56 * hidden_size[2]]) 

       train_hidden_4 = tf.nn.relu(tf.matmul(train_hidden_3_drop, weights_4) + biases_4) 
       train_hidden_4_drop = tf.nn.dropout(train_hidden_4, keep_prob[2]) 

       logits = tf.matmul(train_hidden_4_drop, weights_5) + biases_5 
      return logits 

     logits = model(x) # build the network once and reuse it for both loss and predictions 
     t_train_labels = tf.argmax(y_, 1) # from one-hot vectors to class indices 
     loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=t_train_labels)) 

     optimizer = tf.train.AdamOptimizer(0.01).minimize(loss) 

     y = tf.nn.softmax(logits) 

### RUNNING 

print("log address: %s" % (log_addr)) 

#num_steps = 10001 
times_repeat = 20 # number of epochs 
batch_size = 100 

with tf.Session(graph=graphCNN,config=tf.ConfigProto(log_device_placement=True)) as session: 
    tf.global_variables_initializer().run() 
    saver = tf.train.Saver(max_to_keep=0) 

    writemyfile("---ARCHIVES_M1----") 
    mytime = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
    writemyfile(str("\nTime: %s \nLayers: %d,%d,%d \epochs: %d" % (mytime,cnn1,cnn2,cnn3,times_repeat))) 

    writemyfile("Train files:" + str(listfiles_train)) 
    writemyfile("Valid files:" + str(listfiles_valid)) 
    writemyfile("Test files:" + str(listfiles_test)) 

    print("Model will be saved in file: %s" % save_model_path) 
    writemyfile(str("Model will be saved in file: %s" % save_model_path)) 

    ### TRAINING & VALIDATION 
    valid_accuracies_epochs = np.array([]) 
    for time_repeat in range(times_repeat): 
     print("- time_repeat:",time_repeat) 
     writemyfile("- time_repeat:"+str(time_repeat)) 

     for file_train in listfiles_train: 
      file_train_id = int(file_train[0:len(file_train)-4]) 

      time_start_this_file = time.time() 

      #LOAD DATA 
      print("- - file:",file_train_id, end=' ') 
      writemyfile("- - file:" + str(file_train_id)) 

      Data_train, Label_train= PrepareData(file_train) 

      for step in range(0,len(Data_train)-batch_size,batch_size): 
       batch_data = Data_train[step:step+batch_size] 
       batch_labels = Label_train[step:step+batch_size] 
       feed_dict = {x : batch_data, y_ : batch_labels, dropout: 1.0} 
       opti, l, predictions = session.run([optimizer, loss, y], feed_dict=feed_dict) 

      train_accuracies = np.array([]) 
      for index_tr_accu in range(0,len(Data_train)-SIZE_BATCH_VALID,SIZE_BATCH_VALID): 
       current_predictions = y.eval(feed_dict={x: Data_train[index_tr_accu:index_tr_accu+SIZE_BATCH_VALID],dropout: 0.0}) 
       current_accuracy = accuracy(current_predictions, Label_train[index_tr_accu:index_tr_accu+SIZE_BATCH_VALID]) 
       train_accuracies = np.r_[train_accuracies,current_accuracy] 

      train_accuracy = train_accuracies.mean()      
      print("batch accu: %.2f%%" %(train_accuracy),end=" | ") 
      writemyfile("batch accu: %.2f%%" %(train_accuracy)) 

      time_done_this_file = time.time() - time_start_this_file 
      print("time: %.2fs" % (time_done_this_file)) 
      writemyfile("time: %.2fs" % (time_done_this_file)) 

     # save model 
     model_addr = save_model_path + "model335" + "-epoch-" + str(time_repeat) + ".ckpt" 
     save_path = saver.save(session, model_addr,) # max_to_keep default was 5 

     mytime = strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 
     print("epoch finished at %s \n model address: %s" % (mytime,model_addr)) 
     writemyfile("epoch finished at %s \n model address: %s" % (mytime,model_addr)) 

     # validation 
     valid_accuracies = np.array([]) 
     for file_valid in listfiles_valid: 
      file_valid_id = int(file_valid[0:len(file_valid)-4]) 
      Data_valid, Label_valid = PrepareData(file_valid) 
      for index_vl_accu in range(0,len(Data_valid)-SIZE_BATCH_VALID,SIZE_BATCH_VALID): 
       current_predictions = y.eval(feed_dict={x: Data_valid[index_vl_accu:index_vl_accu+SIZE_BATCH_VALID],dropout: 0.0}) 
       current_accuracy = accuracy(current_predictions, Label_valid[index_vl_accu:index_vl_accu+SIZE_BATCH_VALID]) 
       valid_accuracies = np.r_[valid_accuracies,current_accuracy] 

     valid_accuracy = valid_accuracies.mean() 
     print("epoch %d - valid accu: %.2f%%" %(time_repeat,valid_accuracy)) 
     writemyfile("epoch %d - valid accu: %.2f%%" %(time_repeat,valid_accuracy)) 

     valid_accuracies_epochs = np.hstack([valid_accuracies_epochs,valid_accuracy]) 

print('Done!!') 
writemyfile(str('Done!!')) 
session.close() 

UPDATE: I found cifar10_multi_gpu_train.py, which seems to be a good example of multi-GPU training, but honestly I do not know how to apply it to my case.
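
For reference, here is a minimal, untested sketch of the pattern that script uses (in-graph data parallelism), written against the names in the code above. It assumes the inner with tf.device('/gpu:1'): has been removed from model(), that the weights are created once (ideally on the CPU) so both towers share them, and that batch_size is divisible by the number of GPUs; the gradient-averaging train_op would take the place of the single optimizer = ...minimize(loss) line:

NUM_GPUS = 2 

with graphCNN.as_default(): 
    with tf.device('/cpu:0'): 
     opt = tf.train.AdamOptimizer(0.01) 
     # give every GPU its own slice of the fed batch 
     x_splits = tf.split(x, NUM_GPUS, axis=0) 
     label_splits = tf.split(tf.argmax(y_, 1), NUM_GPUS, axis=0) 

    tower_grads = [] 
    for i in range(NUM_GPUS): 
     with tf.device('/gpu:%d' % i): 
      # one forward/backward pass ("tower") per GPU, same shared weights 
      tower_logits = model(x_splits[i]) 
      tower_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( 
       logits=tower_logits, labels=label_splits[i])) 
      tower_grads.append(opt.compute_gradients(tower_loss)) 

    with tf.device('/cpu:0'): 
     # average each variable's gradient over the towers, apply one update 
     avg_grads = [(tf.reduce_mean(tf.stack([g for g, _ in gv]), 0), gv[0][1]) 
      for gv in zip(*tower_grads)] 
     train_op = opt.apply_gradients(avg_grads) 

Running this may also need allow_soft_placement=True in the ConfigProto, since some of the ops involved may lack GPU kernels.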


Are both of your GPUs used when you run cifar10_multi_gpu_train.py? – Anton


I tried to run it, but got an error when importing the model: ModuleNotFoundError: No module named 'tensorflow.models' –

Answer


I think you need to change

def model(data): 
    with tf.device('/gpu:1'): 

to:

def model(data): 
    for d in ['/gpu:0', '/gpu:1']: 
     with tf.device(d): 

In the first with tf.device... you are only doing variable initialization, so ditch the line with tf.device('/gpu:0'): as well, since you are re-setting the device with the next with tf.device anyway.

Let me know if this works, because I cannot test it.
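
One way to make such a loop actually divide the work (again just a sketch, untested) is to give each device its own slice of the batch, reusing the weights_* and biases_* variables defined in the question:

def model(data): 
    # split the batch and send one half through each GPU in parallel 
    slices = tf.split(data, 2, axis=0) 
    outputs = [] 
    for i, d in enumerate(['/gpu:0', '/gpu:1']): 
     with tf.device(d): 
      h1 = tf.nn.relu(conv2d(slices[i], weights_1) + biases_1) 
      h2 = max_pool_2x2(tf.nn.relu(conv2d(h1, weights_2) + biases_2)) 
      h2 = tf.nn.dropout(h2, keep_prob[0]) 
      h3 = max_pool_2x2(tf.nn.relu(conv2d(h2, weights_3) + biases_3)) 
      h3 = tf.nn.dropout(h3, keep_prob[1]) 
      h3 = tf.reshape(h3, [-1, 56 * 56 * hidden_size[2]]) 
      h4 = tf.nn.dropout(tf.nn.relu(tf.matmul(h3, weights_4) + biases_4), keep_prob[2]) 
      outputs.append(tf.matmul(h4, weights_5) + biases_5) 
    return tf.concat(outputs, axis=0) 

The concat keeps the output rows aligned with the input rows, so the loss and accuracy code stays unchanged; the batch size just has to be even.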


Thanks @Anton, I tried that when I read [this](https://www.tensorflow.org/tutorials/using_gpu#using_multiple_gpus), but only the first GPU was working. –


Sorry, actually both GPUs are running. I had not noticed it, but the time taken for batch training is the same as with 1 GPU. –


You could try Keras, an abstraction that sits on top of tensorflow. It is what I prefer to use, and the code is more concise than plain tensorflow. Using multiple GPUs in Keras is very easy: just use the script jonilaserson mentioned in [https://github.com/fchollet/keras/issues/2436] – Anton
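
The idea behind the make_parallel script in that issue is roughly the following (paraphrased from memory for a single-input, single-output model, so treat it as a sketch rather than the actual script): slice every batch across the GPUs with Lambda layers, run the model on each slice, and concatenate the predictions on the CPU.

import tensorflow as tf 
from keras.layers import Lambda, concatenate 
from keras.models import Model 

def make_parallel(model, gpu_count): 
    def get_slice(data, i, parts): 
     # take the i-th of `parts` contiguous slices along the batch axis 
     shape = tf.shape(data) 
     size = tf.concat([shape[:1] // parts, shape[1:]], axis=0) 
     start = tf.concat([shape[:1] // parts * i, shape[1:] * 0], axis=0) 
     return tf.slice(data, start, size) 

    outputs = [] 
    for i in range(gpu_count): 
     with tf.device('/gpu:%d' % i): 
      x_slice = Lambda(get_slice, arguments={'i': i, 'parts': gpu_count})(model.input) 
      outputs.append(model(x_slice)) 

    with tf.device('/cpu:0'): 
     return Model(inputs=model.input, outputs=concatenate(outputs, axis=0)) 

Usage would be parallel_model = make_parallel(model, 2), then compile and fit as usual, with a batch size divisible by the GPU count.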
