使用 CUDA Thrust 的多 GPU 編程

我有一個使用 Thrust 的 CUDA C++ 代碼，目前在單 GPU 上工作正常。我現在想把它修改爲支持多 GPU。我有一個主機函數，其中包含若干 Thrust 調用，用於對設備數組進行排序、複製和計算差值等。我想讓每個 GPU 同時在自己的（獨立的）一組數組上運行這一系列 Thrust 調用。我讀到有返回值的 Thrust 函數是同步的，那麼我能否使用 OpenMP，讓每個主機線程各自調用一個在獨立 GPU 上運行的函數（其中包含 Thrust 調用）？

例如（在瀏覽器編碼）：

// Sketch from the question: distribute the per-GPU work across OpenMP threads,
// one loop iteration per device index.
// NOTE(review): Ndev is not declared in this snippet - presumably the number of GPUs.
#pragma omp parallel for 
for (int dev=0; dev<Ndev; dev++){ 
    cudaSetDevice(dev); // select GPU `dev` before issuing the Thrust calls for it
    runthrustfunctions(dev); 
} 

void runthrustfunctions(int dev){ 
    /*lots of Thrust functions running on device arrays stored on corresponding GPU*/ 
//for example this is just a few of the lines" 

thrust::device_ptr<double> pos_ptr = thrust::device_pointer_cast(particle[dev].pos); 
thrust::device_ptr<int> list_ptr = thrust::device_pointer_cast(particle[dev].list); 
thrust::sequence(list_ptr,list_ptr+length); 
thrust::sort_by_key(pos_ptr, pos_ptr+length,list_ptr); 
thrust::device_vector<double> temp(length); 
thrust::gather(list_ptr,list_ptr+length,pos_ptr,temp.begin()); 
thrust::copy(temp.begin(), temp.end(), pos_ptr);

我認爲還需要把結構體「particle[0]」存儲在 GPU 0 上，「particle[1]」存儲在 GPU 1 上，依此類推，而我猜這是不可能的。一個可選方案可能是使用「switch」，爲每個 GPU 的情況分別編寫代碼。

我想知道這是否是一個正確的做法，或者有更好的辦法嗎？感謝

來源

2014-02-06 user3281744

爲什麼你需要用不同的主機線程來設置設備？你能否使用像這個問題中那樣的簡單做法：[使用 CUDA Thrust 的多 GPU 用法](http://stackoverflow.com/questions/16885971/multi-gpu-usage-with-cuda-thrust)？ – JackOLantern

推測這是爲了繞開某些 Thrust 操作會阻塞主機線程的問題。 –

是的，你可以把 Thrust 和 OpenMP 結合起來使用。

下面是一個完整的可運行示例及其運行結果：

$ cat t340.cu 
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>

#define DSIZE 200000000 

using namespace std; 

int main(int argc, char *argv[]) 
{ 
    timeval t1, t2; 
    int num_gpus = 0; // number of CUDA GPUs 

    printf("%s Starting...\n\n", argv[0]); 

    // determine the number of CUDA capable GPUs 
    cudaGetDeviceCount(&num_gpus); 

    if (num_gpus < 1) 
    { 
     printf("no CUDA capable devices were detected\n"); 
     return 1; 
    } 

    // display CPU and GPU configuration 
    printf("number of host CPUs:\t%d\n", omp_get_num_procs()); 
    printf("number of CUDA devices:\t%d\n", num_gpus); 

    for (int i = 0; i < num_gpus; i++) 
    { 
     cudaDeviceProp dprop; 
     cudaGetDeviceProperties(&dprop, i); 
     printf(" %d: %s\n", i, dprop.name); 
    } 

    printf("initialize data\n"); 


    // initialize data 
    typedef thrust::device_vector<int> dvec; 
    typedef dvec *p_dvec; 
    std::vector<p_dvec> dvecs; 

    for(unsigned int i = 0; i < num_gpus; i++) { 
     cudaSetDevice(i); 
     p_dvec temp = new dvec(DSIZE); 
     dvecs.push_back(temp); 
     } 

    thrust::host_vector<int> data(DSIZE); 
    thrust::generate(data.begin(), data.end(), rand); 

    // copy data 
    for (unsigned int i = 0; i < num_gpus; i++) { 
     cudaSetDevice(i); 
     thrust::copy(data.begin(), data.end(), (*(dvecs[i])).begin()); 
     } 

    printf("start sort\n"); 
    gettimeofday(&t1,NULL); 

    // run as many CPU threads as there are CUDA devices 
    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices 
    #pragma omp parallel 
    { 
     unsigned int cpu_thread_id = omp_get_thread_num(); 
     cudaSetDevice(cpu_thread_id); 
     thrust::sort((*(dvecs[cpu_thread_id])).begin(), (*(dvecs[cpu_thread_id])).end()); 
     cudaDeviceSynchronize(); 
    } 
    gettimeofday(&t2,NULL); 
    printf("finished\n"); 
    unsigned long et = ((t2.tv_sec * 1000000)+t2.tv_usec) - ((t1.tv_sec * 1000000) + t1.tv_usec); 
    if (cudaSuccess != cudaGetLastError()) 
     printf("%s\n", cudaGetErrorString(cudaGetLastError())); 
    printf("sort time = %fs\n", (float)et/(float)(1000000)); 
    // check results 
    thrust::host_vector<int> result(DSIZE); 
    thrust::sort(data.begin(), data.end()); 
    for (int i = 0; i < num_gpus; i++) 
    { 
     cudaSetDevice(i); 
     thrust::copy((*(dvecs[i])).begin(), (*(dvecs[i])).end(), result.begin()); 
     for (int j = 0; j < DSIZE; j++) 
      if (data[j] != result[j]) { printf("mismatch on device %d at index %d, host: %d, device: %d\n", i, j, data[j], result[j]); return 1;} 
    } 
    printf("Success\n"); 
    return 0; 

} 
$ nvcc -Xcompiler -fopenmp -O3 -arch=sm_20 -o t340 t340.cu -lgomp 
$ CUDA_VISIBLE_DEVICES="0" ./t340 
./t340 Starting... 

number of host CPUs: 12 
number of CUDA devices: 1 
    0: Tesla M2050 
initialize data 
start sort 
finished 
sort time = 0.398922s 
Success 
$ ./t340 
./t340 Starting... 

number of host CPUs: 12 
number of CUDA devices: 4 
    0: Tesla M2050 
    1: Tesla M2070 
    2: Tesla M2050 
    3: Tesla M2070 
initialize data 
start sort 
finished 
sort time = 0.460058s 
Success 
$

我們可以看到，當我限制程序使用一個單一的設備，排序操作需要約0.4秒。然後，當我允許它使用全部4個設備（在所有4個設備上重複相同的排序）時，整個操作只需要0.46秒，即使我們做了4倍的工作。

在這個例子中，我使用的是 CUDA 5.0、Thrust v1.7 和 gcc 4.4.6（RHEL 6.2）。

來源

2014-02-08 03:46:22

回答

相關問題