OpenCL使用本地工作組的圖像過濾器

我創建了一個無分支中位數3x3過濾器，並且在高分辨率圖像（約4K下3K）時獲得了每通過200毫米左右的數據，我認爲我可以獲得如果我創建了內核來利用工作組，那就更好了。不幸的是，該工具告訴我我做錯了什麼，我想知道。OpenCL使用本地工作組的圖像過濾器

#define wgs 16 
//Work group size 
#define cas3(a, b)        \ 
    do {           \ 
      float4 x = a;       \ 
      int4 c = a> b;       \ 
      a.s012 = select(b, a, c).s012;   \ 
      b.s012 = select(x, b, c).s012;   \ 
    } while (0) 

__kernel void median3x3_rgb(read_only image2d_t src, write_only image2d_t dst) { 
    int gx = get_global_id(0), gy = get_global_id(1); 
    int lx = get_local_id(0), ly = get_local_id(1); 
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; 

    if ((gx >= get_image_width(dst)) | (gy >= get_image_height(dst))) 
      return; 

__local float4 la[wgs+2][wgs+2]; 

la[lx+1][ly+1] = read_imagef(src,smp,(int2)(gx,gy));  

if(lx == 0){ 
    la[lx][ly+1] = read_imagef(src,smp,(int2)(gx-1,gy)); 
    if(ly == 0) 
     la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy)); 
    if(ly == wgs) 
     la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy)); 
} 
else if(lx == wgs){ 
    la[lx+2][ly+1] = read_imagef(src,smp,(int2)(gx+1,gy)); 
    if(ly == 0) 
     la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy)); 
    if(ly == wgs) 
     la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy)); 
} 
else if(ly == 0) 
    la[lx+1][ly] = read_imagef(src,smp,(int2)(gx-1,gy)); 
else if(ly == wgs) 
    la[lx+1][ly+2] = read_imagef(src,smp,(int2)(gx+1,gy)); 


barrier(CLK_LOCAL_MEM_FENCE); //----------------------- mem barrier 

    float4 s0 = la[ lx-1][ly-1 ]; 
    float4 s1 = la[ lx ][ly-1 ]; 
    float4 s2 = la[ lx+1][ly-1 ]; 
    float4 s3 = la[ lx-1][ly ]; 
    float4 s4 = la[ lx ][ly ]; 
    float4 s5 = la[ lx+1][ly ]; 
    float4 s6 = la[ lx-1][ly+1 ]; 
    float4 s7 = la[ lx ][ly+1 ]; 
    float4 s8 = la[ lx+1][ly+1 ];

啓動分揀這方面的工作得很好不用擔心它

// stage0 
    cas3(s1, s2); 
    cas3(s4, s5); 
    cas3(s7, s8); 

    // 1 
    cas3(s0, s1); 
    cas3(s3, s4); 
    cas3(s6, s7); 

    // 2 
    cas3(s1, s2); 
    cas3(s4, s5); 
    cas3(s7, s8); 

    // 3/4 
    cas3(s3, s6); 
    cas3(s4, s7); 
    cas3(s5, s8); 
    cas3(s0, s3); 

    cas3(s1, s4); 
    cas3(s2, s5); 
    cas3(s3, s6); 

    cas3(s4, s7); 
    cas3(s1, s3); 

    cas3(s2, s6); 
    cas3(s2, s3); 
    cas3(s4, s6); 

    cas3(s3, s4);

結束分揀

write_imagef(dst, (int2) (gx, gy), s4); 
}

來源

2013-06-04 Mercutio Calviary

一對夫婦的建議，隨機試驗：

如果以後出現障礙，應儘早使用提前退貨：全部工作組中的工作項應該執行屏障，並且如果某些工作項已返回，則內核將掛起。
紋理緩存在這種情況下通常至少與本地內存一樣快。嘗試用簡單的read_imagef調用替換s0，...，s8初始化。然後嘗試不同的工作組維度。

您在運行什麼硬件？

來源

2013-06-05 05:40:36

我在i7和660m上運行它 –

OpenCL使用本地工作組的圖像過濾器

回答

相關問題