在OpenCL中實現並行Floyd-Warshall算法時得到錯誤的輸出

#include <stdio.h> 
#include <stdlib.h> 
#include <iostream> 

/*#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else*/ 
#include <CL/cl.h> 
//#endif 

#define DATA_SIZE 16 

using namespace std; 

/*
 * OpenCL C source for one relaxation pass of the Floyd-Warshall
 * all-pairs shortest-path algorithm.
 *
 * Kernel arguments:
 *   pathDistanceBuffer - row-major numNodes x numNodes distance matrix,
 *                        read and updated in place.
 *   numNodes           - number of nodes; also used as the row stride.
 *   result             - receives every improved distance at the same
 *                        index; entries that are not improved are left
 *                        untouched, so the caller must pre-fill it.
 *   pass               - index k of the intermediate node for this pass.
 *
 * The kernel reads get_global_id(0) as the column and get_global_id(1) as
 * the row, so it has to be launched with a 2-D NDRange. Work-items whose
 * coordinates fall outside the matrix return without touching memory.
 *
 * Weights are unsigned and the sum of two weights is not checked for
 * wrap-around, so callers must keep weights small enough to avoid it.
 *
 * The string pieces are joined by ordinary adjacent-literal concatenation;
 * no line-continuation backslashes are needed.
 */
const char *ProgramSource =
"__kernel void floydWarshallPass(__global uint * pathDistanceBuffer,const unsigned int numNodes, __global uint * result, const unsigned int pass)\n"
"{\n"
"    const uint xValue = (uint)get_global_id(0);\n"
"    const uint yValue = (uint)get_global_id(1);\n"
"    const uint k = pass;\n"
"    if (xValue >= numNodes || yValue >= numNodes || k >= numNodes)\n"
"    {\n"
"        return;\n"
"    }\n"
"    const uint oldWeight = pathDistanceBuffer[yValue * numNodes + xValue];\n"
"    const uint tempWeight = pathDistanceBuffer[yValue * numNodes + k] + pathDistanceBuffer[k * numNodes + xValue];\n"
"    if (tempWeight < oldWeight)\n"
"    {\n"
"        pathDistanceBuffer[yValue * numNodes + xValue] = tempWeight;\n"
"        result[yValue * numNodes + xValue] = tempWeight;\n"
"    }\n"
"}\n"
"\n";


int main(void) 
{ 
cl_context context; 
cl_context_properties properties[3]; 
cl_kernel kernel; 
cl_command_queue command_queue; 
cl_program program; 
cl_int err; 
cl_uint num_of_platforms=0; 
cl_platform_id platform_id; 
cl_device_id device_id; 
cl_uint num_of_devices=0; 
cl_mem inputA, inputB, output; 
cl_int numNodes; 
size_t global; 

float inputDataA[16] = {0,2,3,4,5,0,7,8,9,10,0,12,13,14,15,0}; 
float results[16]={0}; 

int i,j; 
numNodes = 16; 



if(clGetPlatformIDs(1, &platform_id, &num_of_platforms) != CL_SUCCESS) 
{ 
    printf("Unable to get platform id\n"); 
    return 1; 
} 


// try to get a supported GPU device 
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &num_of_devices) != CL_SUCCESS) 
{ 
printf("Unable to get device_id\n"); 
return 1; 
} 

// context properties list - must be terminated with 0 
properties[0]= CL_CONTEXT_PLATFORM; 
properties[1]= (cl_context_properties) platform_id; 
properties[2]= 0; 

// create a context with the GPU device 
context = clCreateContext(properties,1,&device_id,NULL,NULL,&err); 

// create command queue using the context and device 
command_queue = clCreateCommandQueue(context, device_id, 0, &err); 

// create a program from the kernel source code 
program = clCreateProgramWithSource(context,1,(const char **) &ProgramSource, NULL, &err); 

// compile the program 
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) 
{ 
printf("Error building program\n"); 
return 1; 
} 

// specify which kernel from the program to execute 
kernel = clCreateKernel(program, "floydWarshallPass", &err); 

// create buffers for the input and ouput 

inputA = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL); 
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * DATA_SIZE, NULL, NULL); 

// load data into the input buffer 
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL); 
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL); 

// set the argument list for the kernel command 
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); 
clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes); 
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); 

global=DATA_SIZE; 

// enqueue the kernel command for execution 
for(cl_uint sh=0; sh<16; sh++) 
{ 
clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *)&sh); 
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); 
//clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float)*DATA_SIZE, results, 0, NULL, NULL); 

//clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL); 
//clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL); 
//clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); 
//clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&numNodes); 
//clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); 
clFinish(command_queue); 

} 
clFinish(command_queue); 
// copy the results from out of the output buffer 
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) *DATA_SIZE, results, 0, NULL, NULL); 

// print the results 
printf("output: "); 

for(i=0;i<16; i++) 
{ 
printf("%f ",results[i]); 
} 

// cleanup - release OpenCL resources 
clReleaseMemObject(inputA); 
//clReleaseMemObject(inputB); 
clReleaseMemObject(output); 
clReleaseProgram(program); 
clReleaseKernel(kernel); 
clReleaseCommandQueue(command_queue); 
clReleaseContext(context); 

return 0; 

}

每個節點的輸出都是 -0.00000。（問題：在OpenCL中實現並行Floyd-Warshall算法時得到錯誤的輸出）

P.S. 我目前使用CL_DEVICE_TYPE_CPU運行代碼，因爲使用GPU時會報「Unable to get device_id」（無法獲取設備ID）的錯誤。

請給出一些關於如何獲得正確輸出的指導。

來源

2014-12-04 Shubham Gupta

我認爲你的問題有點過於寬泛，你應該縮小代碼的範圍。我會盡力幫你指出在代碼中發現的一些錯誤，但我沒有調試或編譯它，所以這裏描述的問題只能作爲你着手排查的起點。

你爲什麼要在內核中以參數1調用get_global_id？在clEnqueueNDRangeKernel中你指定的工作項維度只有一維，所以get_global_id(1)查詢的是一個不存在的維度。

注意：如果你想把一維座標轉換成二維座標，應該使用類似下面的轉換：

int id = get_global_id(0); 
int x = id % size->width; 
int y = id/size->height;

（對於按行存儲的數據，行號通常應該用寬度來除，即 id / width。）

當你使用sizeof(float)來計算數據類型的大小時要注意：主機端類型與OpenCL實現中的類型大小可能不一樣。改爲使用sizeof(cl_float)。
也許你沒有獲得任何GPU，因爲你的計算機上沒有安裝正確的驅動程序。轉到GPU供應商網站，查找OpenCL的運行時驅動程序。

請參閱OpenCL規範中的以下頁面：

來源

2014-12-04 22:32:45

在OpenCL中實現並行Floyd-Warshall算法時得到錯誤的輸出

回答

相關問題