在多個GPU上運行OpenCL內核？

現在我編寫了幾個在一個GPU上並行運行的算法，但是當我嘗試在幾個GPU上執行它們時（例如3），它們都有同樣的問題。問題是，在一個GPU上執行的代碼在3個GPU上執行完全相同的時間量（不會更快）。我試圖用更多的數據執行，嘗試執行不同的任務，沒有任何幫助。最後，我最終嘗試運行像元素總和這樣的最簡單的任務，並且仍然有這個可怕的錯誤。這就是爲什麼我不相信這是一個特定算法的問題，我覺得我的代碼存在一個錯誤（或者甚至在我的幾種GPU上並行化代碼的方法）。在多個GPU上運行OpenCL內核？

這裏是我的Parallel.cpp類的頭文件：

#ifndef PARALLEL_H 
#define PARALLEL_H 

#define __NO_STD_VECTOR // Use cl::vector and cl::string and 
#define __NO_STD_STRING // not STL versions, more on this later 
#include <CL/cl.h> 

class Parallel 
{ 
    public: 
     Parallel(); 
     int executeAttachVectorsKernel(int*, int*, int*, int); 
     static void getMaxWorkGroupSize(int*, int*, int*); 
     virtual ~Parallel(); 
    protected: 
    private: 
     char* file_contents(const char*, int*); 
     void getShortInfo(cl_device_id); 
     int init(void); 
     cl_platform_id platform; 
     cl_device_id* devices; 
     cl_uint num_devices; 
     cl_command_queue* queues; 
     int* WGSizes; 
     int* WGNumbers; 
     cl_context context; 
     cl_program program; 
     cl_kernel kernel; 
     cl_mem input1; 
     cl_mem input2; 
     cl_mem output; 
}; 

#endif // PARALLEL_H

下面是初始化方法的init：

int Parallel::init() { 
cl_int err; 

//Connect to the first platfrom 
err = clGetPlatformIDs(1, &platform, NULL); 
if (err != CL_SUCCESS) { 
    cerr << "Error occured while executing clGetPlatformIDs" << endl; 
    return EXIT_FAILURE; 
} 

//Get devices number 
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); 
if (err != CL_SUCCESS) { 
    cerr << "Error: Failed to create a device group:" << endl; 
    return EXIT_FAILURE; 
} 

cout << "NUM DEVICES =" << num_devices << endl; 

devices = new cl_device_id[num_devices]; 
//Get all the GPU devices 
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL); 

//Create one context for all the devices 
context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err); 
if (!context) { 
    cerr << "Error: Failed to create a compute context!" << endl; 
    return EXIT_FAILURE; 
} 

queues = new cl_command_queue[num_devices]; 
WGNumbers = new int[num_devices]; 
WGSizes = new int[num_devices]; 


for(int i = 0; i < num_devices; i++) { 
    //Create a command queue for every device 
    queues[i] = clCreateCommandQueue(context, devices[i], 0, &err); 
    if (!queues[i]) { 
     cerr << "Error: Failed to create a command commands!" << endl; 
     return EXIT_FAILURE; 
    } 

    cl_ulong temp; 
    clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(temp), &temp, NULL); 
    WGSizes[i] = (int)temp; 

    clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(temp), &temp, NULL); 
    WGNumbers[i] = (int)temp; 
} 

//Translate kernel code into chars 
int pl; 
size_t program_length; 
string path = "./kernel/kernel_av.cl"; 

char* cSourceCL = file_contents(path.c_str(), &pl); 
program_length = (size_t)pl; 

//Create a program 
program = clCreateProgramWithSource(context, 1, 
        (const char **) &cSourceCL, &program_length, &err); 

if (!program) { 
    cerr << "Error: Failed to create compute program!" << endl; 
    return EXIT_FAILURE; 
} 

//Create an executable 
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
if (err != CL_SUCCESS) 
{ 
    size_t len; 
    char buffer[2048]; 

    cerr << "Error: Failed to build program executable!" << endl; 
    exit(1); 
} 

// Create the compute kernel in the program 
kernel = clCreateKernel(program, "calculate2dim", &err); 
if (err != CL_SUCCESS) 
{ 
    cerr << "Error: Failed to create compute kernel!" << endl; 
    exit(1); 
} 
}

其執行內核的方法是在這裏：

int Parallel::executeAttachVectorsKernel(int* data1, int* data2, int* results, int vectors_num) { 

cl_int err; 
size_t global; // global domain size for our calculation 
size_t local; // local domain size for our calculation 

int partition = vectors_num/num_devices; 
unsigned int count = partition; 
input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL); 
input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL); 
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL); 
if (!input1 || !input2 || !output) { 
    cerr << "Error: Failed to allocate device memory!" << endl; 
    exit(1); 
} 

int** data1_apart = new int*[num_devices]; 
int** data2_apart = new int*[num_devices]; 
int** results_apart = new int*[num_devices]; 

for(int i = 0; i < num_devices; i++) { 
    cout << "Executing parallel part on GPU " << i + 1 << endl; 
    cout << "Partition size = " << partition << endl; 
    data1_apart[i] = new int[partition]; 
    data2_apart[i] = new int[partition]; 
    results_apart[i] = new int[partition]; 

    for(int j = i*partition, k = 0; k < partition; j++, k++) { 
     data1_apart[i][k] = data1[j]; 
     data2_apart[i][k] = data2[j]; 
    } 

    //Transfer the input vector into device memory 
    err = clEnqueueWriteBuffer(queues[i], input1, 
           CL_TRUE, 0, sizeof(int) * count, 
           data1_apart[i], 0, NULL, NULL); 

    err = clEnqueueWriteBuffer(queues[i], input2, 
           CL_TRUE, 0, sizeof(int) * count, 
           data2_apart[i], 0, NULL, NULL); 

    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to write to source array!" << endl; 
     exit(1); 
    } 

    int parameter4 = count/WGNumbers[i]; 

    //Set the arguments to the compute kernel 
    err = 0; 
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1); 
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2); 
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); 
    err |= clSetKernelArg(kernel, 3, sizeof(int), &parameter4); 
    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to set kernel arguments! " << err << endl; 
     exit(1); 
    } 

    global = WGNumbers[i]; 
    local = WGSizes[i]; 

    if(local > global) { 
     local = global; 
    } 
    cout << "global = " << global << " local = " << local << endl; 

    err = clEnqueueNDRangeKernel(queues[i], kernel, 
           1, NULL, &global, &local, 
           0, NULL, NULL); 
    if (err) 
    { 
     cerr << "Error: Failed to execute kernel!" << endl; 
     return EXIT_FAILURE; 
    } 
} 

for(int i = 0; i < num_devices; i++) { 
    //Wait for all commands to complete 
    clFinish(queues[i]); 

    //Read back the results from the device to verify the output 

    err = clEnqueueReadBuffer(queues[i], output, 
           CL_TRUE, 0, sizeof(int) * count, 
           results_apart[i], 0, NULL, NULL); 
    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to read output array! " << err << endl; 
     exit(1); 
    } 

    for(int j = 0; j < partition; j++) { 
     results[i*partition + j] = results_apart[i][j]; 
    } 

    delete [] data1_apart[i]; 
    delete [] data2_apart[i]; 
    delete [] results_apart[i]; 
} 

clReleaseMemObject(input1); 
clReleaseMemObject(input2); 
clReleaseMemObject(output); 
delete [] data1_apart; 
delete [] data2_apart; 
}

在將此問題發佈到stackoverflow之前，我一直在爭取2-3周這個問題，現在我真的編輯某人的幫助，所以我會高度讚賞任何想法和答案！

來源

2013-05-10 Vladimir

您的所有設備都在相同的緩衝區上運行。內核執行時，數據將在設備之間移動。沒有適當的同步，結果將是不確定的。

如果可能，請考慮爲每個設備分配一組不同的緩衝區。

來源

2013-05-10 18:02:25

這就是我現在正在做的和近乎實例（〜1毫秒的差異）執行內核爲重的工作負載（〜130ms工作） – 2013-05-24 16:51:33

這是我認爲正在發生的事情。您爲每個參與的opencl設備調用一次clEnqueueNDRangeKernel。此時，沒有任何內核開始執行，因爲clFlush尚未被調用。接下來，你爲每個隊列建立一個完整的隊列。第一次clFinish調用會導致第一個排隊的工作組運行。它也等待它完成。第一個工作組完成後，clFinish將控制權返回給您的應用程序。然後，您的應用程序會調用clFinish來獲取下一個隊列。這會觸發第二個工作泥漿運行，並等待它完成。所以這項工作按順序進行。解決方案可能很簡單，只要在每次調用clEnqueueNDRangeKernel之後立即調用clFush即可。這就是我的AMD系統的行爲。我將很快發佈一個工作示例。

來源

2013-05-13 03:33:37 ScottD

謝謝，我會試試這個！如果可能的話，請對這種方法提供更多評論！ – Vladimir 2013-05-13 12:12:22

這裏是工作示例[link] http://notabs.org/blcutil/wip/blcutil_devel-018.7z 下面是如何在一臺設備上運行它，然後是另一臺設備，然後再運行： [link] http：/ /notabs.org/blcutil/wip/sample-output.htm 命令行選項-opencl將使程序列表opencl設備。命令行選項-opencl =允許您選擇一個或多個列出的設備以供使用。 – ScottD 2013-05-13 16:51:53

是的，我認爲這是正確的。OpenCL規範對此非常具體：「請注意，當回調（或其他代碼）將命令排入命令隊列時，在隊列被刷新之前，不需要執行命令。」 – doug65536 2013-05-30 01:50:53

您正在使用哪種GPU？我有一個顯示在兩個GPU設備上的GTX590。當我試圖在兩臺設備上運行時，似乎都要等到每臺設備完成之後才轉移到下一臺設備上（即使它不是這樣）。我不知道Nvidia是否解決了這個問題。

閱讀我在Nvidia網站上看到的一些消息，當時我閱讀了一些關於Nvidia的建議，爲每個設備創建單獨的上下文，並在不同的線程中運行它們。這就是我所做的，它很棒。我爲此使用了pthreads（或SDL_threads）。這很容易設置。

來源

2013-05-18 15:14:32

在多個GPU上運行OpenCL內核？

回答

相關問題