2010-09-16 46 views
3

我參考了網絡上的幾個不同資源,寫了一個自己的小型 OpenCL 示例。內核本身可以正常工作,我也得到了想要的輸出,但是我從其中一個示例裡找到的清理函數會導致段錯誤。我做錯了什麼?(標題:OpenCL 清理導致段錯誤)

#include <stdio.h> 
#include <stdlib.h> 
#include <errno.h> 
#include <CL/cl.h> //opencl 

/*
 * CL_CHECK(expr): evaluates an expression that yields a cl_int status.
 * Any status other than CL_SUCCESS is reported on stderr together with the
 * stringified expression, after which the process aborts.
 * Wrapped in do/while(0) so it behaves as a single statement.
 */
#define CL_CHECK(_expr)                                                              \
    do {                                                                             \
        cl_int _err = _expr;                                                         \
        if (_err != CL_SUCCESS) {                                                    \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
            abort();                                                                 \
        }                                                                            \
    } while (0)

/*
 * CL_CHECK_ERR(expr): evaluates `expr`, yields its value, and aborts when the
 * call reported an error.
 *
 * The macro declares a local `cl_int _err` preset to CL_INVALID_VALUE, and
 * `expr` is expected to store its status through `&_err`, e.g.
 *     cl_mem buf = CL_CHECK_ERR(clCreateBuffer(ctx, flags, size, NULL, &_err));
 * An expression that never writes `_err` therefore always aborts.
 *
 * Relies on the GNU statement-expression extension. `__typeof__` is used
 * rather than `typeof` because the latter is not a keyword in ISO modes
 * before C23 (e.g. -std=c99 / -std=c11), where the macro would not compile.
 */
#define CL_CHECK_ERR(_expr)                                                          \
    ({                                                                               \
        cl_int _err = CL_INVALID_VALUE;                                              \
        __typeof__(_expr) _ret = (_expr);                                            \
        if (_err != CL_SUCCESS) {                                                    \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
            abort();                                                                 \
        }                                                                            \
        _ret;                                                                        \
    })

/*
 * OpenCL C source of the VectorAdd kernel, split into string fragments.
 * The fragments are handed to clCreateProgramWithSource as an array; the
 * two fragments that hold a `//` comment end in '\n' so the comment does not
 * swallow the fragment that follows it.
 */
const char *OpenCLSource[] = {
    /* 0 */ "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
    /* 1 */ "{",
    /* 2 */ "  // Index of the elements to add \n",
    /* 3 */ "  unsigned int n = get_global_id(0);",
    /* 4 */ "  // Sum the n’th element of vectors a and b and store in c \n",
    /* 5 */ "  c[n] = a[n] + b[n];",
    /* 6 */ "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){ 

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU 
    cl_int _err; 
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ; 
    printf("\n1-%i\n",_err); 
    // Get the list of GPU devices associated with this context 
    size_t ParmDataBytes; 
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes)); 
    cl_device_id* GPUDevices; 
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes); 
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL)); 
    // Create a command-queue on the first GPU device 
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err); 
    printf("\n2-%i\n",_err); 
    // Create OpenCL program with source code 
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err); 
    printf("\n3-%i\n",_err); 

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
       NULL, NULL, NULL, NULL)); 


    cl_int errcode; 
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
       "VectorAdd", &errcode); 
       printf("\n7-%i\n",errcode); 

    return GPUDevices; 
} 


/*
 * Host driver: obtains the OpenCL objects from init_opencl(), uploads two
 * 5-element int vectors, launches the VectorAdd kernel over n work-items,
 * reads the result back, prints it and then releases the OpenCL objects.
 *
 * NOTE(review): this is the listing as posted in the question. The code is
 * deliberately left untouched; the problems it contains are only annotated.
 */
int main(int argc, char** argv) 
{ 
    cl_context GPUContext; 
    cl_command_queue GPUCommandQueue; 
    cl_program OpenCLProgram; 
    cl_kernel OpenCLVectorAdd; 
    cl_device_id* GPUDevices; 

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram); 

    // Two integer source vectors in Host memory 
    int n=5 ; 
    int x[5]={1,2,4,6,8}; 
    int y[5]={1,2,4,6,8}; 
    int output[n]; 
    // NOTE(review): sizeof(x), sizeof(y) and sizeof(output) are already the
    // byte size of the WHOLE array (5 ints), so multiplying by n makes each
    // of the three sizes n times too large.
    int size_x = n*sizeof(x); 
    int size_y = n*sizeof(y); 

    int size_output = n*sizeof(output); // this changes for the second forward1 
    cl_int _err; 
    // Allocate GPU memory for source vectors AND initialize from CPU memory 
    // NOTE(review): CL_MEM_COPY_HOST_PTR copies size_x / size_y bytes from
    // x / y, i.e. it reads past the end of both host arrays.
    // The status is printed but never acted on.
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | 
        CL_MEM_COPY_HOST_PTR, size_x, x, &_err); 
        printf("\n4-%i\n",_err); 
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | 
        CL_MEM_COPY_HOST_PTR, size_y, y, &_err); 
        printf("\n5-%i\n",_err); 


    // Allocate output memory on GPU 
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, 
              size_output, NULL, &_err); 
              printf("\n6-%i\n",_err); 

    // In the next step we associate the GPU memory with the Kernel arguments 
    // (argument order matches the kernel signature: c, a, b)
    // NOTE(review): the return values of clSetKernelArg are ignored.
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl); 
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl); 
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl); 


    // 7. Launch OpenCL kernel 
    // NOTE(review): localWorkSize is declared but never used; NULL is passed
    // as the local work size below.
    size_t localWorkSize[1], globalWorkSize[1]; 
    //localWorkSize = ; 
    globalWorkSize[0] = n; 

    // Launch the Kernel on the GPU 
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL)); 
    // Copy the output in GPU memory back to CPU memory 
    // (blocking read: CL_TRUE)
    // NOTE(review): size_output bytes are written into `output`, which holds
    // only n ints - this overruns the stack array and is most likely what
    // corrupts the handles that the cleanup calls below then crash on.

    //float* h_C = (float*) malloc(size_output); 
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
       total_cl, CL_TRUE, 0, size_output, 
       output, 0, NULL, NULL)); 
    for (int i=0; i<n;i++){ 
     printf("\n%i",output[i]); 
    } 

    // Cleanup (each of the following lines causes a seg fault 
    // ****************************** 
    // NOTE(review): free() returns void, so CL_CHECK(free(...)) - which
    // expands to `cl_int _err = free(GPUDevices);` - is not valid C.
    // NOTE(review): the comment opened after the release calls is never
    // closed inside this listing, so the final `return 0;` and the closing
    // brace are commented out.
    CL_CHECK(free(GPUDevices)); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd)); 
    CL_CHECK(clReleaseProgram(OpenCLProgram)); 
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue)); 
    CL_CHECK(clReleaseContext(GPUContext)); 
    CL_CHECK(clReleaseMemObject(total_cl)); 
    CL_CHECK(clReleaseMemObject(x_cl)); 
    CL_CHECK(clReleaseMemObject(y_cl)); 
    /* **************** 

    return 0; 
} 

謝謝!

+0

究竟是哪一行崩潰了? – 2010-09-16 14:27:54

+0

您是否應該在讀取輸出數組之前等待排隊的讀命令完成? – 2010-09-16 17:11:06

+0

說到這個,在把內核調用排入隊列之後,是不是也應該等待它完成呢? – 2010-09-16 17:12:11

回答

-2

我修正並更動了幾處小地方,所以這段代碼現在應該可以正常工作了:

#include <stdio.h> 
#include <stdlib.h> 
#include <errno.h> 
#include <CL/cl.h> //opencl 

/*
 * CL_CHECK(expr): evaluates an expression that yields a cl_int status.
 * Any status other than CL_SUCCESS is reported on stderr together with the
 * stringified expression, after which the process aborts.
 * Wrapped in do/while(0) so it behaves as a single statement.
 */
#define CL_CHECK(_expr)                                                              \
    do {                                                                             \
        cl_int _err = _expr;                                                         \
        if (_err != CL_SUCCESS) {                                                    \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
            abort();                                                                 \
        }                                                                            \
    } while (0)

/*
 * CL_CHECK_ERR(expr): evaluates `expr`, yields its value, and aborts when the
 * call reported an error.
 *
 * The macro declares a local `cl_int _err` preset to CL_INVALID_VALUE, and
 * `expr` is expected to store its status through `&_err`, e.g.
 *     cl_mem buf = CL_CHECK_ERR(clCreateBuffer(ctx, flags, size, NULL, &_err));
 * An expression that never writes `_err` therefore always aborts.
 *
 * Relies on the GNU statement-expression extension. `__typeof__` is used
 * rather than `typeof` because the latter is not a keyword in ISO modes
 * before C23 (e.g. -std=c99 / -std=c11), where the macro would not compile.
 */
#define CL_CHECK_ERR(_expr)                                                          \
    ({                                                                               \
        cl_int _err = CL_INVALID_VALUE;                                              \
        __typeof__(_expr) _ret = (_expr);                                            \
        if (_err != CL_SUCCESS) {                                                    \
            fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
            abort();                                                                 \
        }                                                                            \
        _ret;                                                                        \
    })

/*
 * OpenCL C source of the VectorAdd kernel, split into string fragments.
 * The fragments are handed to clCreateProgramWithSource as an array; the
 * two fragments that hold a `//` comment end in '\n' so the comment does not
 * swallow the fragment that follows it.
 */
const char *OpenCLSource[] = {
    /* 0 */ "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
    /* 1 */ "{",
    /* 2 */ "  // Index of the elements to add \n",
    /* 3 */ "  unsigned int n = get_global_id(0);",
    /* 4 */ "  // Sum the n’th element of vectors a and b and store in c \n",
    /* 5 */ "  c[n] = a[n] + b[n];",
    /* 6 */ "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){ 

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU 
    cl_int _err; 
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ; 
    printf("\nclCreateContextFromType:%i\n",_err); 
    // Get the list of GPU devices associated with this context 
    size_t ParmDataBytes; 
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes)); 
    cl_device_id* GPUDevices; 
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes); 
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL)); 
    // Create a command-queue on the first GPU device 
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err); 
    printf("\nclCreateCommandQueue:%i\n",_err); 
    // Create OpenCL program with source code 
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err); 
    printf("\nclCreateProgramWithSource:%i\n",_err); 

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
       NULL, NULL, NULL, NULL)); 


    cl_int errcode; 
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
       "VectorAdd", &errcode); 
       printf("\nclCreateKernel:%i\n",errcode); 

    return GPUDevices; 
} 


int main(int argc, char** argv) 
{ 
    cl_context GPUContext; 
    cl_command_queue GPUCommandQueue; 
    cl_program OpenCLProgram; 
    cl_kernel OpenCLVectorAdd; 
    cl_device_id* GPUDevices; 

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram); 

    int n=5 ; 
    int x[5]={1,2,4,6,8}; 
    int y[5]={1,2,4,6,8}; 
    int output[n]; 
    int size_x = n*sizeof(x); 
    int size_y = n*sizeof(y); 
    int size_output = n*sizeof(output); 

    cl_int _err; 

    // Allocate GPU memory for source vectors AND initialize from CPU memory 
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | 
        CL_MEM_COPY_HOST_PTR, size_x, x, &_err); 
        printf("\nclCreateBuffer:%i\n",_err); 
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | 
        CL_MEM_COPY_HOST_PTR, size_y, y, &_err); 
        printf("\nclCreateBuffer:%i\n",_err); 


    // Allocate output memory on GPU 
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, 
              size_output, NULL, &_err); 
              printf("\nclCreateBuffer:%i\n",_err); 

    // In the next step we associate the GPU memory with the Kernel arguments 
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl); 
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl); 
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl); 


    size_t globalWorkSize[1]; 
    globalWorkSize[0] = n; 

    // Launch the Kernel on the GPU 
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL)); 
    clFinish(GPUCommandQueue); 
    // Copy the output in GPU memory back to CPU memory 

    int* h_c = (int*) malloc(size_output); 
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
       total_cl, CL_TRUE, 0, size_output, 
       h_c, 0, NULL, NULL)); 
    clFinish(GPUCommandQueue); 
    for (int i=0; i<n;i++){ 
     printf("\noutput[%i]=%i",i,h_c[i]); 
    } 

    // Cleanup 
    free(GPUDevices); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd)); 
    CL_CHECK(clReleaseProgram(OpenCLProgram)); 
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue)); 
    CL_CHECK(clReleaseContext(GPUContext)); 
    CL_CHECK(clReleaseMemObject(x_cl)); 
    CL_CHECK(clReleaseMemObject(total_cl)); 
    CL_CHECK(clReleaseMemObject(y_cl)); 

    return 0; 
} 
+4

這沒有什麼幫助:你既沒有解釋問題出在哪裡,也沒有說明你修改的代碼中是哪一處修復了它。 – o0rebelious0o 2014-02-27 19:12:56

0

給以後來到這裡的人:

正如 Brafford 所建議的,在 clEnqueueNDRangeKernel 以及 clEnqueueReadBuffer 之後加入 clFinish(GPUCommandQueue),這個問題就解決了。

顯然,嘗試清理仍在執行中的對象(例如釋放命令隊列)會導致段錯誤。