0
我正試圖優化這個內核。該內核的CPU版本比GPU版本快4倍。我預計GPU版本會更快。 這可能是因爲我們有很多內存訪問,這就是爲什麼我們的性能較差。我正在使用Intel HD 2500和OpenCL 1.2。優化opencl內核
的GPU內核是:
__kernel void mykernel(__global unsigned char *inp1,
__global unsigned char *inp2,
__global unsigned char *inp3,
__global unsigned char *inp4,
__global unsigned char *outp1,
__global unsigned char *outp2,
__global unsigned char *outp3,
__global unsigned char *outp4,
__global unsigned char *lut,
uint size
)
{
unsigned char x1, x2, x3, x4;
unsigned char y1, y2, y3, y4;
const int x = get_global_id(0);
const int y = get_global_id(1);
const int width = get_global_size(0);
const uint id = y * width + x;
x1 = inp1[id];
x2 = inp2[id];
x3 = inp3[id];
x4 = inp4[id];
y1 = (x1 & 0xff) | (x2>>2 & 0xaa) | (x3>>4 & 0x0d) | (x4>>6 & 0x02);
y2 = (x1<<2 & 0xff) | (x2 & 0xaa) | (x3>>2 & 0x0d) | (x4>>4 & 0x02);
y3 = (x1<<4 & 0xff) | (x2<<2 & 0xaa) | (x3 & 0x0d) | (x4>>2 & 0x02);
y4 = (x1<<6 & 0xff) | (x2<<4 & 0xaa) | (x3<<2 & 0x0d) | (x4 & 0x02);
// lookup table
y1 = lut[y1];
y2 = lut[y2];
y3 = lut[y3];
y4 = lut[y4];
outp1[id] = (y1 & 0xc0)
| ((y2 & 0xc0) >> 2)
| ((y3 & 0xc0) >> 4)
| ((y4 & 0xc0) >> 6);
outp2[id] = ((y1 & 0x30) << 2)
| (y2 & 0x30)
| ((y3 & 0x30) >> 2)
| ((y4 & 0x30) >> 4);
outp3[id] = ((y1 & 0x0c) << 4)
| ((y2 & 0x0c) << 2)
| (y3 & 0x0c)
| ((y4 & 0x0c) >> 2);
outp4[id] = ((y1 & 0x03) << 6)
| ((y2 & 0x03) << 4)
| ((y3 & 0x03) << 2)
| (y4 & 0x03);
}
我用:
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 1;
globalWorkSize[0] = X*Y; // X,Y define a data space of 15 - 20 MB
LocalWorkSize可以改變1之間 - 這是很奇怪的
for LocalWorkSize = 1 I have
CPU = 0.067Sec
GPU = 0.20Sec
for LocalWorkSize = 256 I have
CPU = 0.067Sec
GPU = 0.34Sec
。你能給我一些想法,爲什麼我得到這些奇怪的數字?你有沒有關於如何優化這個內核的提示?
我主要如下所示:
int main(int argc, char** argv)
{
int err,err1,j,i; // error code returned from api calls and other
clock_t start, end; // measuring performance variables
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program_ms_naive; // compute program
cl_kernel kernel_ms_naive; // compute kernel
// ... dynamically allocate arrays
// ... initialize arrays
cl_uint dev_cnt = 0;
clGetPlatformIDs(0, 0, &dev_cnt);
cl_platform_id platform_ids[100];
clGetPlatformIDs(dev_cnt, platform_ids, NULL);
// Connect to a compute device
err = clGetDeviceIDs(platform_ids[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a compute context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
commands = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute programs from the source file
program_ms_naive = clCreateProgramWithSource(context, 1, (const char **) &kernelSource_ms, NULL, &err);
// Build the programs executable
err = clBuildProgram(program_ms_naive, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel_ms_naive = clCreateKernel(program_ms_naive, "ms_naive", &err);
d_A1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A1, &err);
d_A2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A2, &err);
d_A3 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A3, &err);
d_A4 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_cpy/4, h_A4, &err);
d_lut = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 256, h_ltable, &err);
d_B1 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B2 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
d_B4 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_cpy/4, NULL, &err);
int size = YCOLUMNS*XROWS/4;
int size_b = size * 4;
err = clSetKernelArg(kernel_ms_naive, 0, sizeof(cl_mem), (void *)&(d_A1));
err |= clSetKernelArg(kernel_ms_naive, 1, sizeof(cl_mem), (void *)&(d_A2));
err |= clSetKernelArg(kernel_ms_naive, 2, sizeof(cl_mem), (void *)&(d_A3));
err |= clSetKernelArg(kernel_ms_naive, 3, sizeof(cl_mem), (void *)&(d_A4));
err |= clSetKernelArg(kernel_ms_naive, 4, sizeof(cl_mem), (void *)&d_B1);
err |= clSetKernelArg(kernel_ms_naive, 5, sizeof(cl_mem), (void *)&(d_B2));
err |= clSetKernelArg(kernel_ms_naive, 6, sizeof(cl_mem), (void *)&(d_B3));
err |= clSetKernelArg(kernel_ms_naive, 7, sizeof(cl_mem), (void *)&(d_B4));
err |= clSetKernelArg(kernel_ms_naive, 8, sizeof(cl_mem), (void *)&d_lut); //__global
err |= clSetKernelArg(kernel_ms_naive, 9, sizeof(cl_uint), (void *)&size_b);
size_t localWorkSize[1], globalWorkSize[1];
localWorkSize[0] = 256;
globalWorkSize[0] = XROWS*YCOLUMNS;
start = clock();
for (i=0;i< EXECUTION_TIMES;i++)
{
err1 = clEnqueueNDRangeKernel(commands, kernel_ms_naive, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
err = clFinish(commands);
}
end = clock();
return 0;
}
請發佈[MCVE]。 –
本地工作大小應該是256而不是1. 1 =最低的硬件佔用率和最低的性能。也許它需要8最低或8的倍數。 –
我知道我應該使用256.正如你可以在我的初始職位看到的本地工作組大小256我有一個更大的執行時間比本地工作組大小1 – Nick