在OpenCL上讀取錯誤的數據

我面臨內核在錯誤位置寫入數據或主機有時讀取數據錯誤的問題。我將相同的數據（我寫數據的索引）寫入兩個不同類型的全局數組。爲了確保索引是正確的，使用通過atom_inc遞增的全局計數器。從主機上的第二個數組讀取數據時發生此問題。例如：在OpenCL上讀取錯誤的數據

..... 
output array index: 442: (output1 value:442.0000  output2 value:442) 
output array index: 443: (output1 value:443.0000  output2 value:443) 
output array index: 444: (output1 value:444.0000  output2 value:444) 
output array index: 445: (output1 value:445.0000  output2 value:445) 
output array index: 446: (output1 value:446.0000  output2 value:1152892928) 
output array index: 447: (output1 value:447.0000  output2 value:447) 
output array index: 448: (output1 value:448.0000  output2 value:1152909312) 
output array index: 449: (output1 value:449.0000  output2 value:1152917504) 
output array index: 450: (output1 value:450.0000  output2 value:1152925696) 
......

正如你可以在indicies 446，見448，449和450+輸出2包含錯誤的價值觀。這可能是什麼原因？

設備：ATI的Radeon HD5750

代碼示例：

#include <stdio.h> 
#include <math.h> 
#include <OpenCL/OpenCL.h> 

// wtf example 
const char *programSource = 
"__kernel void kernel1(__global uint *counter,\n" \ 
"__global float *weights,\n" \ 
"__global uint *weights_pos)\n" \ 
"{\n"\ 
"const uint global_size = get_global_size(0);\n" \ 
"const uint global_id = get_global_id(0);\n" \ 
"uint local_id = get_local_id(0);\n" \ 

"if(global_id == 0) {\n" \ 
"counter[5] = 0; // set index of pos in weights to zero\n" \ 
"}\n" \ 

"uint insert_index = atom_inc(&counter[5]);\n" \ 
"weights[insert_index] = insert_index;\n" \ 
"weights_pos[insert_index] = insert_index;\n" \ 
"}"; 

void art_process_sinogram(const char* tiff_filename, 
          const float *angles2, 
          const unsigned int n_angles2, 
          const unsigned int n_ray2s, 
          const float distanc2e) 
{ 
    /****************************** 
    * OPENCL ENVIRONMENT 
    */ 
    cl_int status; 
    cl_uint numPlatforms = 0; 
    cl_platform_id *platforms = NULL; 
    cl_device_id device_id; 

    //discover platforms 
    status = clGetPlatformIDs(0, NULL, &numPlatforms); 
    platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id)); 
    status = clGetPlatformIDs(numPlatforms, platforms, NULL); 

    //discover devices 
    cl_uint numDevices = 0; 
    cl_device_id *devices = NULL; 

    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); 
    devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); 
    status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); 
    device_id = devices[1]; 
    //create context 
    cl_context context = NULL; 
    context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); 

    cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status); 
    clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
    cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status); 

    //create queue 
    cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status); 

    /****************************** 
    * HARDWARE PARAMETERS 
    */ 
    cl_uint wavefronts_per_SIMD = 7; 
    size_t global_work_size; 
    size_t local_work_size = 64; 

    cl_uint max_compute_units; 

    clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); 

    size_t wg_count = max_compute_units * wavefronts_per_SIMD; 
    global_work_size = wg_count * local_work_size; 

    /**************************** DATA PART *************************************/ 

    size_t w_portion_size = 768 * sizeof(cl_float); 
    size_t w_pos_portion_size = 768 * sizeof(cl_uint); 

    size_t counters_data_size = 6 * sizeof(cl_uint); 
    cl_uint counters_data[6]; 
    counters_data[0] = 1; 
    counters_data[1] = 2; // max number of the cells intersected by the ray 
    counters_data[2] = 3; 
    counters_data[3] = 4; 
    counters_data[4] = 5; // same to the number of rays 
    counters_data[5] = 0; // counter inside kernel 

    /***************** 
    * Main buffers 
    */ 
    cl_mem weights1_buffer = clCreateBuffer(context, 
              CL_MEM_READ_WRITE, 
              w_portion_size, 
              NULL, 
              NULL); 

    cl_mem weights_pos1_buffer = clCreateBuffer(context, 
               CL_MEM_READ_WRITE, 
               w_pos_portion_size, 
               NULL, 
               NULL); 
    /***************** 
    * Supplement buffers (constant) 
    */ 
    cl_mem counters_data_buffer = clCreateBuffer(context, 
               CL_MEM_READ_ONLY, 
               counters_data_size, 
               NULL, 
               &status); 


    cl_event supplement_buffer_ready[1]; 

    status = clEnqueueWriteBuffer(command_queue1, 
           counters_data_buffer, 
           CL_FALSE, 
           0, 
           counters_data_size, 
           counters_data, 
           0, 
           NULL, 
           &supplement_buffer_ready[0]); 

    status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer); 
    status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer); 
    status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer); 

    status = clEnqueueNDRangeKernel(command_queue1, 
            kernel_weights, 
            1, // work dimensional 1D, 2D, 3D 
            NULL, // offset 
            &global_work_size, // total number of WI 
            &local_work_size, // nomber of WI in WG 
            1, // num events in wait list 
            supplement_buffer_ready, // event wait list 
            NULL); // event 

    clFinish(command_queue1); 
    cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1, 
                 weights1_buffer,//*pmain_weights_buffer, 
                 CL_TRUE, 
                 CL_MAP_READ, 
                 0, 
                 w_portion_size, 
                 0, NULL, NULL, NULL); 
    cl_uint *output2 = malloc(w_portion_size); 
    status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer, 
           CL_TRUE, 0, w_pos_portion_size, output2, 
           0, NULL, NULL); 

    clFinish(command_queue1); 
    for(int i = 0; i < 790; ++i) { 
    printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]); 
    } 
}

SOLUTION：

內核應該是的樣子（需要檢查指標）：

__kernel void k_1(__global uint *counter, 
        __global uint *weights, 
        __global uint2 *weights_pos) 
{ 
    const uint global_size = get_global_size(0); 
    const uint global_id = get_global_id(0); 
    uint local_id = get_local_id(0); 

    uint insert_index = atom_inc(&counter[5]); 
    if(insert_index < 768) { 
     weights[insert_index]= insert_index; 
     weights_pos[insert_index].x = insert_index; 
     weights_pos[insert_index].y = insert_index; 
    } 
}

來源

2012-05-19 Roman Shkarin

發表一些實際的代碼，而不是隻是其目的地，一旦你得到了你的問題回答你無論如何都會刪除鏈接。 –

@Christian Rau，是的，你是對的 –

我在英特爾實施上測試了你的代碼，並且我得到了每一個單一值的損壞。你的代碼對我來說似乎很好。 – sbabbi

內核應該是看起來像（需要檢查指標）：

__kernel void k_1(__global uint *counter, 
       __global uint *weights, 
       __global uint2 *weights_pos) 
{ 
    const uint global_size = get_global_size(0); 
    const uint global_id = get_global_id(0); 
    uint local_id = get_local_id(0); 

    uint insert_index = atom_inc(&counter[5]); 

    if(insert_index < 768) { 
    weights[insert_index]= insert_index; 
    weights_pos[insert_index].x = insert_index; 
    weights_pos[insert_index].y = insert_index; 
    } 
}

來源

2012-05-20 03:52:20

你正在搞亂緩衝區尺寸。

1）你的緩衝區包含768元每個（見我的機器上的w_portion_size初始化和w_pos_portion_size）

2）工作組大小爲896（見wg_count初始化）

3）你打印出790個值。

除此之外，一個概念上的錯誤是在這裏：

if(global_id == 0) { 
    counter[5] = 0; // set index of pos in weights to zero 
} 
//atomic increments on counter[5]

你不能假設第一虛擬處理器將別人之前執行這條線。由於您在主機端初始化了counter[5]，因此應完全刪除此行。（我相信這是你的問題的原因，但我不能重現）。

修復這些問題後，您的代碼似乎運行良好（英特爾實施）。

來源

2012-05-19 16:23:21 sbabbi

謝謝你的努力。我同意你的發言，但無論如何修復它並沒有帶來任何積極的結果。 –

在OpenCL上讀取錯誤的數據

回答

相關問題