2011-04-15 94 views
0

OpenCL 映射內存不起作用

我嘗試在我的 OpenCL 程序中實現內存映射技術,但它不起作用!這是我的內核代碼:

/*
 * update - kernel from the question, kept exactly as posted.
 *
 * in:  input characters; this kernel body never reads it.
 * out: output characters.
 *
 * NOTE(review): the ';' that terminates the for line makes the loop body
 * empty. The loop only counts i up to 10; the assignment below then runs
 * once, with i == 10, so none of out[0..9] is written. The host code shown
 * with this kernel creates 10-byte buffers, so index 10 is one past the end.
 */
__kernel void update(__global char *in, __global char *out) 
{ 
    size_t i; 
    /* Empty loop: note the trailing ';'. */
    for (i = 0; i < 10; i++); 
     /* Executes a single time, after the loop has finished (i == 10). */
     out[i] += 'A' - 'a'; 
} 

這是主機代碼:

/*
 * Host-side fragment from the question: reads ./update.cl, builds the
 * "update" kernel, runs it on two 10-byte CL_MEM_ALLOC_HOST_PTR buffers and
 * prints the bytes read back from the mapped output buffer.
 *
 * NOTE(review): GLOBAL_ITEM_SIZE, LOCAL_ITEM_SIZE, MAX_SOURCE_SIZE, quitErr,
 * error and preformanceMeas are not defined in this fragment; they have to
 * come from the surrounding program.
 */
cl_platform_id platformId = NULL; 
cl_device_id deviceId = NULL; 

cl_context context = NULL; 
cl_command_queue commandQueue = NULL; 

/* Buffer objects and the host pointers obtained by mapping them. */
cl_mem cmPinnedBufIn = NULL; 
cl_mem cmPinnedBufOut = NULL; 
/* NOTE(review): cmDevBufIn / cmDevBufOut are never assigned again below. */
cl_mem cmDevBufIn = NULL; 
cl_mem cmDevBufOut = NULL; 
unsigned char *cDataIn = NULL; 
unsigned char *cDataOut = NULL; 

cl_program program = NULL; 
cl_kernel kernel = NULL; 
cl_uint retNumDevices; 
cl_uint retNumPlatforms; 
cl_int ret; 

/* Profiling event and the start/end timestamps read from it. */
cl_event event; 
cl_ulong start; 
cl_ulong end; 

/* Number of work-groups; used only as the repeat count when printing. */
size_t group_size = GLOBAL_ITEM_SIZE/LOCAL_ITEM_SIZE; 

FILE *fp; 
const char fileName[] = "./update.cl"; 
size_t sourceSize; 
char *sourceStr; 

/* Host-side copy of the 10 result bytes. */
unsigned char tt[10]; 

/* Load kernel source file */ 
if (!(fp = fopen(fileName, "r"))) 
    quitErr("Failed to load kernel.", EXIT_FAILURE); 

/* NOTE(review): the malloc result is used without a NULL check. */
sourceStr = (char *)malloc(MAX_SOURCE_SIZE); 
sourceSize = fread(sourceStr, 1, MAX_SOURCE_SIZE, fp); 
fclose(fp); 

/* Get Platform/Device Information */ 
ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); 
assert(ret == CL_SUCCESS); 
ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_GPU, 1, &deviceId, &retNumDevices); 
assert(ret == CL_SUCCESS); 

/* Create OpenCL Context */ 
/* NOTE(review): ret is not checked after this call or the next one. */
context = clCreateContext(NULL, retNumDevices, &deviceId, NULL, NULL, &ret); 

/* Create command queue with profiling enabled */ 
commandQueue = clCreateCommandQueue(context, deviceId, CL_QUEUE_PROFILING_ENABLE, &ret); 

/* Create buffer objects */ 
size_t memSize = 10 * sizeof(unsigned char); 
cmPinnedBufIn = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, memSize, NULL, &ret); 
assert(ret == CL_SUCCESS); 
cmPinnedBufOut = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, memSize, NULL, &ret); 
assert(ret == CL_SUCCESS); 

/* Map the input buffer for host writes (blocking map: CL_TRUE) */ 
cDataIn = (unsigned char *)clEnqueueMapBuffer(commandQueue, cmPinnedBufIn, CL_TRUE, CL_MAP_WRITE, 0, memSize, 0, NULL, NULL, &ret); 
assert(ret == CL_SUCCESS); 

/* Initialize data: fills the mapped region with 'a', 'b', ... */ 
for (size_t w = 0; w < memSize; w++) 
    cDataIn[w] = 'a' + w; 

/*
 * NOTE(review): no clEnqueueUnmapMemObject call for cDataIn appears anywhere
 * in this fragment, so the input buffer is still mapped when the kernel is
 * enqueued below.
 */

/* Create kernel program from source file */ 
program = clCreateProgramWithSource(context, 1, (const char **)&sourceStr, (const size_t *)&sourceSize, &ret); 
assert(ret == CL_SUCCESS); 
ret = clBuildProgram(program, 1, &deviceId, NULL, NULL, NULL); 
if (ret != CL_SUCCESS) { 
    /* On build failure, fetch the build log and pass it to quitErr. */
    error("\nFail to build the program\n"); 
    char buffer[10240]; 
    clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); 
    quitErr(buffer, EXIT_FAILURE); 
} 

/* Create data parallel OpenCL kernel */ 
kernel = clCreateKernel(program, "update", &ret); 
assert(ret == CL_SUCCESS); 

/* Set OpenCL kernel arguments: arg 0 = input buffer, arg 1 = output buffer */ 
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cmPinnedBufIn); 
assert(ret == CL_SUCCESS); 

ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&cmPinnedBufOut); 
assert(ret == CL_SUCCESS); 

size_t global_item_size = GLOBAL_ITEM_SIZE; 
size_t local_item_size = LOCAL_ITEM_SIZE; 

/* Execute OpenCL kernel as data parallel; `event` tracks this launch */ 
ret = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &event); 
if (ret == CL_INVALID_WORK_GROUP_SIZE) 
    quitErr("Invalid work group size: error when compute group size.", EXIT_FAILURE); 
assert(ret == CL_SUCCESS); 

/* Optional kernel timing: wait for the launch, then read its start/end */ 
if (preformanceMeas) { 
    clWaitForEvents(1, &event); 
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); 
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); 
    printf("Kernels execution time: %10.6f [ms]\n", (end - start) * 1.0e-6f); 
} 

/* Map the output buffer for host reads (blocking map: CL_TRUE) */
cDataOut = (unsigned char *)clEnqueueMapBuffer(commandQueue, cmPinnedBufOut, CL_TRUE, CL_MAP_READ, 0, memSize, 0, NULL, NULL, &ret); 
assert(ret == CL_SUCCESS); 

/* Transfer result to host */ 
memcpy(tt, cDataOut, memSize); 

/*
 * Optional transfer timing.
 * NOTE(review): `event` is still the event returned by the kernel launch
 * above (the map call was given NULL for its event argument), so these
 * queries read the kernel's timestamps again rather than the map's.
 */ 
if (preformanceMeas) { 
    //clWaitForEvents(1, &event); 
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); 
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); 
    printf("Memory x buffer read: %10.6f [ms]\n", (end - start) * 1.0e-6f); 
} 

/* Display Results: the same memSize bytes are printed group_size times */ 
int i; 
for (i = 0; i < group_size; i++) 
    for (size_t x = 0; x < memSize; x++) 
     printf("%c", tt[x]); 
printf("\n"); 

/*
 * Finalization
 * NOTE(review): cDataOut is not unmapped before the buffers are released,
 * and cmDevBufIn / cmDevBufOut still hold their initial NULL when they are
 * passed to clReleaseMemObject.
 */ 
clFlush(commandQueue); 
clFinish(commandQueue); 
clReleaseKernel(kernel); 
clReleaseProgram(program); 
clReleaseMemObject(cmPinnedBufIn); 
clReleaseMemObject(cmPinnedBufOut); 
clReleaseMemObject(cmDevBufIn); 
clReleaseMemObject(cmDevBufOut); 
clReleaseCommandQueue(commandQueue); 
clReleaseContext(context); 

free(sourceStr); 

return EXIT_SUCCESS; 

內核應該把小寫字母轉換爲大寫,但我的輸出是空的。當我像這樣在內核中直接(靜態)給字符賦值時:

/*
 * update - write the letters 'A'..'J' into the first ten bytes of out.
 *
 * in:  input buffer; not read by this version of the kernel.
 * out: output buffer; must have room for at least 10 chars.
 *
 * The values written do not depend on the work-item index, so every
 * work-item that runs this kernel stores the same ten bytes.
 */
__kernel void update(__global char *in, __global char *out)
{
    for (size_t pos = 0; pos != 10; ++pos) {
        out[pos] = 'A' + pos;
    }
}

結果就是正確的。因此,我得出的結論是輸入數據沒有被正確地傳輸到內存,但這是爲什麼呢?有人可以幫助我嗎?

回答

1

在寫入映射緩衝區中的輸入後,您必須調用clEnqueueUnmapMemObject。請參閱OpenCL 1.1規範,5.4.2.1。

您的內核沒有訪問 in,也沒有用到線程索引 get_global_id(0)。你想要的可能是這樣的:

size_t i = get_global_id(0) 
char c = in[i]; 
out[i] = (c>='a' && c<='z')?(c + 'A' - 'a'):c; 

要在 OpenCL 1.0 中寫入字符數組,您需要啓用 cl_khr_byte_addressable_store 擴展。

+0

感謝您的內核,但現在我只執行一個內核實例,用於測試問題。但是我試圖在數據初始化之後使用unmap函數,如下所示: ** clEnqueueUnmapMemObject(commandQueue,cmPinnedBufIn,cDataIn,0,NULL,NULL); ** 但是代碼仍然不起作用! – 2011-04-15 11:29:35

+0

取消映射(unmap)和內核執行必須按順序進行。在它們之間入隊一個屏障(barrier),或使用事件來指定依賴關係。 – 2011-04-15 12:08:00

+0

好吧,我是這樣做的: ** ret = clEnqueueUnmapMemObject(commandQueue,cmPinnedBufIn,cDataIn,0,NULL,&event); assert(ret == CL_SUCCESS); clWaitForEvents(1,&event); ** 但仍不起作用。 – 2011-04-15 13:38:45

4

內核中的 for 循環末尾多了一個「;」,所以它是一個空循環。

/* The ';' right after the closing ')' is the whole loop body, so nothing runs per iteration. */
for (i = 0; i < 10; i++);