1
我有一個算法,在GPU上執行兩階段並行歸約(parallel reduction),以找到字符串中最小的元素。我知道有某種技巧可以讓它跑得更快,但我不知道具體是什麼。關於如何調整這個內核來加速我的程序,有什麼想法嗎?實際上不一定要改變算法,可能還有其他的技巧。歡迎任何想法。(標題:在OpenCL中加速並行歸約)
謝謝!
/*
 * Per-work-group minimum reduction over a float buffer.
 *
 * Each work-item first folds a strided slice of `buffer` into a private
 * minimum, then the work-group combines those private minima in local
 * memory. Work-item 0 of every group writes the group's minimum to
 * `result[get_group_id(0)]`.
 *
 * Parameters:
 *   buffer  - input values; indices [0, length) are read.
 *   scratch - local-memory workspace; indexed by get_local_id(0), so it
 *             must hold at least get_local_size(0) floats.
 *   length  - number of elements of `buffer` to consider. If it is <= 0,
 *             every group writes INFINITY.
 *   result  - output; indexed by get_group_id(0), so it must hold one
 *             float per work-group.
 *
 * NOTE(review): the per-group results still have to be reduced to a single
 * value by the caller (second stage) - that step is not part of this kernel.
 */
__kernel
void reduce(__global float* buffer,
            __local float* scratch,
            const int length,
            __global float* result) {
    /* Sequential phase: walk the input with a stride of the total number of
     * work-items, keeping the smallest value seen so far. The stride is
     * loop-invariant, so query it once. */
    const int stride = (int)get_global_size(0);
    float accumulator = INFINITY;
    for (int global_index = (int)get_global_id(0);
         global_index < length;
         global_index += stride) {
        float element = buffer[global_index];
        accumulator = (accumulator < element) ? accumulator : element;
    }

    /* Publish the private minimum so the rest of the group can see it. */
    const int local_index = (int)get_local_id(0);
    scratch[local_index] = accumulator;
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Tree phase: repeatedly fold the upper part of the active range onto
     * the lower part. Rounding the fold distance UP keeps the middle element
     * of an odd-sized range alive, so the result is correct for any
     * work-group size, not only powers of two. For power-of-two sizes the
     * access pattern is the same as a plain halving loop. */
    for (int active = (int)get_local_size(0); active > 1; ) {
        const int half = (active + 1) / 2;
        if (local_index + half < active) {
            float other = scratch[local_index + half];
            float mine = scratch[local_index];
            scratch[local_index] = (mine < other) ? mine : other;
        }
        /* `active` depends only on the work-group size, so every work-item
         * reaches this barrier the same number of times. */
        barrier(CLK_LOCAL_MEM_FENCE);
        active = half;
    }

    /* One work-item per group emits the group's minimum. */
    if (local_index == 0) {
        result[get_group_id(0)] = scratch[0];
    }
}