運行時OpenCL錯誤計算矩陣乘法

過去幾天我一直在調試，但始終無法讓OpenCL矩陣乘法內核正常運行。每次運行該程序時，GPU的輸出都是類似-198746573.0000這樣的大負數。我想知道是否有具備HPC經驗的人能指出我代碼中的錯誤，或者這是否與驅動程序有關。

#include <stdio.h> 
#include <stdlib.h> 
#include <time.h> 
#include <string.h> 

/*
 * Matrix dimensions. Note the convention used by main(): "width" is the
 * bound of the outer (row) index and "height" is the bound of the inner
 * (column) index, with elements addressed as i * height + j.
 */
#define widthA 2 
#define heightA 2 

/* B's outer dimension is tied to A's inner dimension so A * B is defined. */
#define widthB heightA 
#define heightB 2 

/* The product C takes its outer dimension from A and its inner one from B. */
#define widthC widthA 
#define heightC heightB 

#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else 
#include <opencl.h> 
#endif 

/* Not referenced by any live code in this file. */
#define MEM_SIZE (128) 
/* Capacity in bytes (0x100000 = 1 MiB) of the buffer main() allocates for the
 * kernel source text; at most this many bytes are read from the .cl file. */
#define MAX_SOURCE_SIZE (0x100000) 

int main() 
{ 
    float * A = (float *)malloc(sizeof(float)*widthA*heightA); 
    float * B = (float *)malloc(sizeof(float)*widthB*heightB); 
    float * C = (float *)malloc(sizeof(float)*widthC*heightC); 
    float * Res = (float *)malloc(sizeof(float)*widthC*heightC); 
    float * D= (float *)malloc(sizeof(float)*widthC*heightC); 

    float ref[widthC][heightC]; 

    int i, j, k; 

    FILE * fp1 = fopen("matAdata.txt", "w"); 
    if (!fp1) { 
    fprintf(stderr, "Failed to open matAdata.\n"); 
    exit(1); 
    } 

    for(i = 0;i < widthA; i++) 
    { 
     for(j=0;j < heightA; j++)  { 
      float p=(rand()%100)/7.0; 
      //*(A+i*heightA+j)=rand()%100 + p; 
      *(A+i*heightA+j)=4.0; 
      fprintf(fp1, "%f ",*(A+i*heightA+j)); 
     } 
     fprintf(fp1, "\n"); 
    } 
    fclose(fp1); 

    fp1 = fopen("matBdata.txt", "w"); 
    if (!fp1) { 
    fprintf(stderr, "Failed to open matAdata.\n"); 
    exit(1); 
    } 


    for(i = 0;i < widthB; i++) 
    { 
     for(j=0; j < heightB; j++)  { 
      float p=(rand()%100)/7.0; 
      //*((B+i*heightB+j))=rand()%100 + p; 
      *((B+i*heightB+j))=4.0; 
      fprintf(fp1, "%f ",*(B+i*heightA+j)); 
     } 
     fprintf(fp1, "\n"); 
    } 
    fclose(fp1); 

    cl_device_id device_id = NULL; 
    cl_context context = NULL; 
    cl_command_queue command_queue = NULL; 
    cl_mem memobjA = NULL; 
    cl_mem memobjB = NULL; 
    cl_mem memobjC = NULL; 
    cl_mem rowA = NULL; 
    cl_mem colC = NULL; 
    cl_program program = NULL; 
    cl_kernel kernel = NULL; 
    cl_platform_id platform_id[10]; 
    cl_platform_id platform = NULL; 
    cl_uint ret_num_devices; 
    cl_uint ret_num_platforms; 
    cl_int ret; 
    cl_event GPUDone[0]; 
    //char string[MEM_SIZE]; 

    FILE *fp; 
    char fileName[] = "matrixMultiplication.cl"; 
    char *source_str; 
    size_t source_size; 
    int row = widthA; 
    int col = heightC; 
    /* Load the source code containing the kernel*/ 
    fp = fopen(fileName, "r"); 
    if (!fp) { 
    fprintf(stderr, "Failed to load kernel.\n"); 
    exit(1); 
    } 
    source_str = (char*)malloc(MAX_SOURCE_SIZE); 
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); 
    fclose(fp); 

    /* Get Platform and Device Info */ 
    ret = clGetPlatformIDs(10, platform_id, &ret_num_platforms); 

    char cBuffer[1024]; 
    cl_uint c; 

    for(c = 0; c < ret_num_platforms; c++) 
    { 
    clGetPlatformInfo(platform_id[c], CL_PLATFORM_NAME, 1024, &cBuffer, NULL); 
    if (strstr(cBuffer, "NVIDIA") != NULL) 
    { 
     platform = platform_id[c]; 
     break; 
    } 

    } 

    printf("Found Platform %s\n", cBuffer); 

    ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); 

    printf("Found %d devices.\n", ret_num_devices); 

    /* Create OpenCL context */ 
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); 

    /* Create Command Queue */ 
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret); 

    /* Create Memory Buffer */ 
    memobjA = clCreateBuffer(context, CL_MEM_READ_ONLY, widthA * heightA * sizeof(float), NULL, &ret); 
    memobjB = clCreateBuffer(context, CL_MEM_READ_ONLY, widthB * heightB * sizeof(float), NULL, &ret); 
    memobjC = clCreateBuffer(context, CL_MEM_READ_WRITE, widthC * heightC * sizeof(float), NULL, &ret); 
    rowA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); 
    colC = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, &ret); 

    // Copy the lists A and B to their respective memory buffers 
    ret = clEnqueueWriteBuffer(command_queue,memobjA, CL_TRUE, 0, 
      widthA * heightA * sizeof(float), A, 0, NULL, NULL); 
    ret = clEnqueueWriteBuffer(command_queue, memobjB, CL_TRUE, 0, 
      widthB * heightB * sizeof(float), B, 0, NULL, NULL); 
    ret = clEnqueueWriteBuffer(command_queue, rowA, CL_TRUE, 0, sizeof(int), &row, 0, NULL, NULL); 
    ret = clEnqueueWriteBuffer(command_queue, colC, CL_TRUE, 0, sizeof(int), &col, 0, NULL, NULL); 

    /* Create Kernel Program from the source */ 
    program = clCreateProgramWithSource(context, 1, (const char **)&source_str, 
             (const size_t *)&source_size, &ret); 

    /* Build Kernel Program */ 
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 

    /* Create OpenCL Kernel */ 
    kernel = clCreateKernel(program, "matrixMultiplication", &ret); 

    /* Set OpenCL Kernel Arguments */ 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobjA); 
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memobjB); 
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&memobjC); 
    ret = clSetKernelArg(kernel, 3, sizeof(int), (void *)&row); 
    ret = clSetKernelArg(kernel, 4, sizeof(int), (void *)&col); 
    /* Execute OpenCL Kernel */ 

    //ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL); 
    size_t globalThreads[2] = {widthA, heightB}; 
    size_t localThreads[2] = {16,16}; 

    clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL); 
    //clFlush(command_queue); 
    //clFinish(command_queue); 

    /* Copy results from the memory buffer */ 
    ret = clEnqueueReadBuffer(command_queue, memobjC, CL_TRUE, 0, 
          widthA * heightC * sizeof(float), Res, 0, NULL, &GPUDone[0]); 

    printf("Buffer Read ended with %d.\n", ret); 
    clWaitForEvents(1, GPUDone); 

    fp1 = fopen("matGPURes.txt", "w"); 
    if (!fp1) { 
    fprintf(stderr, "Failed to open matAdata.\n"); 
    exit(1); 
    } 

    printf("\nResult\n"); 
    for(i = 0;i < widthA; i++) 
    { 
     for(j=0;j < heightC; j++) 
     { 

      fprintf(fp1, "%f ",*(Res+i*heightC+j)); 
      ref[i][j] = *(Res+i*heightC+j); 
      printf("GPU Output: %f\n", *(Res+i*heightC+j)); 
     } 
     fprintf(fp1, "\n"); 
    } 
    fclose(fp1); 

    ret = clFlush(command_queue); 
    ret = clFinish(command_queue); 
    ret = clReleaseKernel(kernel); 
    ret = clReleaseProgram(program); 
    ret = clReleaseMemObject(memobjA); 
    ret = clReleaseMemObject(memobjB); 
    ret = clReleaseMemObject(memobjC); 
    ret = clReleaseCommandQueue(command_queue); 
    ret = clReleaseContext(context); 
    ret = clReleaseEvent(GPUDone[0]); 

    free(source_str); 

    float sum=0.0; 

    for(i = 0;i < widthA; i++) 
    { 
     for(j = 0; j < heightC; j++) 
     { 
      sum = 0; 
      for(k = 0; k < widthB; k++) 
      { 
       sum += A[i*col+k] * B[k*row+j]; 
       printf("Multiplying A: %f, B: %f\n", A[i*col+k], B[k*row+j]); 
      } 
     D[i*heightC+j] = sum; 
     } 

    } 

    fp1 = fopen("matNormalMultiplicationRes.txt", "w"); 

    if (!fp1) { 
    fprintf(stderr, "Failed to open matNormalMultiplicationRes.txt\n"); 
    exit(1); 
    } 

    for(i = 0; i<widthA; i++) 
    { 
     for(j = 0; j<heightA; j++) 
     { 
      if (ref[i][j] != D[i*heightA+j]) 
      { 
       printf("Calculation error[ CPU: %f, GPU: %f ]\n", D[i*heightA+j], ref[i][j]); 
      } 
     } 
    } 

    printf("\nResult\n"); 
    for(i = 0;i < widthA; i++) 
    { 
     for(j=0;j < heightC; j++) 
     { 
      fprintf(fp1, "%f ",*(D+i*heightC+j)); 

     } 
     fprintf(fp1, "\n"); 
    } 
    free(A); 
    free(B); 
    free(C); 
    free(D); 
    free(Res); 
    return 0; 
}

這裏是內核

/* Tile edge: the inner dimension is consumed BLOCK_SIZE elements at a time,
 * and a work-group may be at most BLOCK_SIZE work-items wide and high. */
#define BLOCK_SIZE 16 

/*
 * Tiled matrix product C = A * B on row-major float matrices.
 *
 *   A   input,  wA columns per row
 *   B   input,  wA rows, wB columns per row
 *   C   output, wB columns per row
 *   wA  inner dimension (columns of A == rows of B)
 *   wB  columns of B and of C
 *
 * Launch contract: a 2-D NDRange with one work-item per element of C
 * (dimension 0 = column, dimension 1 = row), no global offset, and a
 * work-group of at most BLOCK_SIZE x BLOCK_SIZE work-items. The work-group
 * does not have to be square or exactly BLOCK_SIZE wide, and wA does not
 * have to be a multiple of BLOCK_SIZE: missing tile entries are zero-filled.
 */
__kernel 
void matrixMultiplication(__global float* A, __global float* B, __global float* C, int wA, int wB) 
{ 
    /* OpenCL C requires __local variables to be declared at kernel function
     * scope, not inside a nested block such as a loop body. */
    __local float As[BLOCK_SIZE][BLOCK_SIZE]; 
    __local float Bs[BLOCK_SIZE][BLOCK_SIZE]; 

    const int tx = get_local_id(0); 
    const int ty = get_local_id(1); 
    const int lx = get_local_size(0); 
    const int ly = get_local_size(1); 

    /* Element of C owned by this work-item. */
    const int col = get_group_id(0) * lx + tx; 
    const int row = get_group_id(1) * ly + ty; 

    float Csub = 0.0f; 

    /* Walk the inner dimension one tile at a time. The trip count depends
     * only on wA, so every work-item in the group reaches both barriers. */
    for (int k0 = 0; k0 < wA; k0 += BLOCK_SIZE) 
    { 
        /* Stage this group's rows of A: each work-item fills the entries of
         * its own row ty that fall to it when striding by the group width. */
        for (int k = tx; k < BLOCK_SIZE; k += lx) 
            As[ty][k] = (k0 + k < wA) ? A[row * wA + k0 + k] : 0.0f; 

        /* Stage this group's columns of B in the same way. */
        for (int k = ty; k < BLOCK_SIZE; k += ly) 
            Bs[k][tx] = (k0 + k < wA) ? B[(k0 + k) * wB + col] : 0.0f; 

        /* All tile entries must be written before anyone reads them. */
        barrier(CLK_LOCAL_MEM_FENCE); 

        for (int k = 0; k < BLOCK_SIZE; ++k) 
            Csub += As[ty][k] * Bs[k][tx]; 

        /* Nobody may overwrite the tiles while others are still reading. */
        barrier(CLK_LOCAL_MEM_FENCE); 
    } 

    C[row * wB + col] = Csub; 
}

我已經反覆檢查了很多遍，但仍然找不到任何錯誤。在認定這是驅動程序的問題之前，我想先確認代碼本身沒有錯誤。

謝謝！

來源

2012-08-03 maknelly

我沒有看到Csub在內核的任何地方被定義和初始化。 – 2012-08-04 13:45:37

感謝您的注意，我在調試OpenCL內核時遇到了這個錯誤，並且我上傳了一個過時的版本。 – maknelly 2012-08-06 16:40:07

我嘗試在AMD硬件上運行這段代碼，對我來說是可行的。將localThreads更改爲NULL，像這樣：clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, globalThreads, NULL, 0, NULL, NULL); 並在內核中把塊大小修改爲4，即 #define BLOCK_SIZE 4，現在我從GPU – kiranputtur 2012-08-11 09:19:04

你真的需要這樣一個複雜的內核嗎？如果你真的想做簡單的矩陣乘法運算，你可以編寫一個簡單的內核，這很容易調試。

/*
 * Straightforward matrix product: each work-item computes one element of C
 * as the dot product of a row of A and a column of B, reading both directly
 * from global memory.
 *
 * widthA is the row stride of A and the number of terms in each dot product;
 * widthB is the row stride of B and of C.
 */
__kernel void matrixMultiplication(__global float* A,
                                   __global float* B,
                                   __global float* C,
                                   int widthA, int widthB)
{
    /* Dimension 0 selects the column, dimension 1 the row. */
    int y = get_global_id(1);
    int x = get_global_id(0);

    /* First element of the row of A this work-item reads. */
    __global float* a_row = A + y * widthA;

    float acc = 0.0f;
    int k = 0;
    while (k < widthA)
    {
        acc += a_row[k] * B[k * widthB + x];
        ++k;
    }

    C[y * widthB + x] = acc;
}

來源

2012-08-04 12:36:41 kiranputtur

得到正確的答案。這可能很簡單，但速度也很慢。我不知道他的內核是否正確，但使用本地內存的想法是正確的。 – 2012-08-04 17:15:45

我同意這是緩慢的，想法是在他進一步開始調整它。 – kiranputtur 2012-08-05 08:47:45

我同意你的看法，實際上我已經試過上面的做法，但GPU返回的仍然是大負數。我還嘗試用英特爾SDK在我的英特爾處理器上運行它，得到的結果全是0。檢查每個操作的返回值時，得到的都是CL_SUCCESS。有人能在自己的平臺上運行上面的代碼，看看是否得到相同的結果嗎？ – maknelly 2012-08-06 16:37:11

檢查您的主機的功能。這裏有幾件事讓你開始...

1）對於row和col這樣的標量int常量，不需要創建緩衝區並把它們寫入命令隊列，直接將它們設置爲內核參數即可。

2）爲clEnqueueNDRangeKernel關聯一個事件並等待該事件，以確保計算已經完成。

3）在內核中添加一個printf語句來打印選定的值以查看輸入和輸出值是否符合您的期望值。

嘗試

if (get_local_id(0) % 8 == 0) {

printf some useful value of a,b,c

}

4）嘗試讓主機代碼配合一個只把輸入數組複製到輸出數組的簡單內核運行。這可以確認緩衝區創建和讀/寫代碼是正確的！

來源

2012-08-09 23:24:57

此案例可能已關閉，但爲了google-comers：不應該在主機上顯式聲明共享內存並將其作爲內核參數傳遞給源代碼？在這種情況下，__local關鍵字不是您正在尋找的關鍵字。

有關詳細說明，請參閱How to declare local memory in OpenCL?的文章。

來源

2012-12-27 21:29:48 szz

運行時OpenCL錯誤計算矩陣乘法

回答

相關問題