0
我創建了一個無分支中位數3x3過濾器,並且在高分辨率圖像(約4K下3K)時獲得了每通過200毫米左右的數據,我認爲我可以獲得如果我創建了內核來利用工作組,那就更好了。不幸的是,該工具告訴我我做錯了什麼,我想知道。OpenCL使用本地工作組的圖像過濾器
#define wgs 16
//Work group size
#define cas3(a, b) \
do { \
float4 x = a; \
int4 c = a> b; \
a.s012 = select(b, a, c).s012; \
b.s012 = select(x, b, c).s012; \
} while (0)
__kernel void median3x3_rgb(read_only image2d_t src, write_only image2d_t dst) {
int gx = get_global_id(0), gy = get_global_id(1);
int lx = get_local_id(0), ly = get_local_id(1);
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
if ((gx >= get_image_width(dst)) | (gy >= get_image_height(dst)))
return;
__local float4 la[wgs+2][wgs+2];
la[lx+1][ly+1] = read_imagef(src,smp,(int2)(gx,gy));
if(lx == 0){
la[lx][ly+1] = read_imagef(src,smp,(int2)(gx-1,gy));
if(ly == 0)
la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy));
if(ly == wgs)
la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy));
}
else if(lx == wgs){
la[lx+2][ly+1] = read_imagef(src,smp,(int2)(gx+1,gy));
if(ly == 0)
la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy));
if(ly == wgs)
la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy));
}
else if(ly == 0)
la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy));
else if(ly == wgs)
la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy));
barrier(CLK_LOCAL_MEM_FENCE); //----------------------- mem barrier
float4 s0 = la[ lx-1][ly-1 ];
float4 s1 = la[ lx ][ly-1 ];
float4 s2 = la[ lx+1][ly-1 ];
float4 s3 = la[ lx-1][ly ];
float4 s4 = la[ lx ][ly ];
float4 s5 = la[ lx+1][ly ];
float4 s6 = la[ lx-1][ly+1 ];
float4 s7 = la[ lx ][ly+1 ];
float4 s8 = la[ lx+1][ly+1 ];
啓動分揀 這方面的工作得很好不用擔心它
// stage0
cas3(s1, s2);
cas3(s4, s5);
cas3(s7, s8);
// 1
cas3(s0, s1);
cas3(s3, s4);
cas3(s6, s7);
// 2
cas3(s1, s2);
cas3(s4, s5);
cas3(s7, s8);
// 3/4
cas3(s3, s6);
cas3(s4, s7);
cas3(s5, s8);
cas3(s0, s3);
cas3(s1, s4);
cas3(s2, s5);
cas3(s3, s6);
cas3(s4, s7);
cas3(s1, s3);
cas3(s2, s6);
cas3(s2, s3);
cas3(s4, s6);
cas3(s3, s4);
結束分揀
write_imagef(dst, (int2) (gx, gy), s4);
}
我在i7和660m上運行它 –