2013-06-01 70 views
2

我正在嘗試使用窗口x*y做一箇中值過濾器,其中xy是奇數並且程序的參數。CUDA中的2D中值過濾:如何有效地將全局內存複製到共享內存

我的想法是先看看有多少線程可以在一個單一的塊執行和多少共享內存我有avaliable,像這樣:

void cudaInit(int imgX, int imgY, int kx, int ky, int* cudaVars){ 
     int device; 
     int deviceCount; 
     cudaDeviceProp deviceProp; 

      cudaGetDevice(&device); 
      cudaGetDeviceProperties(&deviceProp, device); 
     int kxMed = kx/2; 
     int kyMed = ky/2; 
     int n = deviceProp.maxThreadsPerBlock; 
     while(f(n,kxMed,kyMed)>deviceProp.sharedMemPerBlock){ 
      n = n/2; 
     } 

     cudaVars[0] = n; 
     cudaVars[1] = imgX/cudaVars[0]; 
     cudaVars[2] = imgY/cudaVars[0]; 
    } 
    } 



void mediaFilter_gpuB(uchar4* h_img,int width, int height, int kx, int ky){ 

    assert(h_img!=NULL && width!=0 && height!=0); 
     int dev=0; 
    cudaDeviceProp deviceProp; 
    //DEVICE 
    uchar4* d_img; 
    uchar4* d_buf; 

    int cudaVars[3]={0}; 
    cudaInit(width,height,kx,ky,cudaVars); 
checkCudaErrors(cudaMalloc((void**) &(d_img), width*height*sizeof(unsigned char)*4)); 
    checkCudaErrors(cudaMalloc((void**) &(d_buf), width*height*sizeof(unsigned char)*4)); 

    cudaGetDevice(&dev); 
    cudaGetDeviceProperties(&deviceProp,dev); 
    checkCudaErrors(cudaMemcpy(d_img, h_img, width*height*sizeof(uchar4), cudaMemcpyHostToDevice)); 

    dim3 dimGrid(cudaVars[1],cudaVars[2],1); 
    dim3 threads(cudaVars[0],1,1); 
    mediaFilterB<<<dimGrid,threads,f(cudaVars[0],kx/2,ky/2)>>>(d_buf,d_img,width,height, kx,ky,cudaVars[0]); 

    checkCudaErrors(cudaMemcpy(h_img, d_buf, width*height*sizeof(uchar4), cudaMemcpyDeviceToHost)); 
    checkCudaErrors(cudaFree(d_img)); 
    checkCudaErrors(cudaFree(d_buf)); 

} 
__device__ void fillSmem(int* sMem, uchar4* buf, int width, int height, int kx, int ky){ 
    int kyMed=ky/2; 
    int kxMed=kx/2; 
    int sWidth = 2*kxMed+gridDim.x; 
    int sHeight =2*kyMed+gridDim.x; 
    int X = blockIdx.x*gridDim.x+threadIdx.x; 
    int Y = blockIdx.y*gridDim.y; 
    int j=0; 
    while(threadIdx.x+j < sHeight){ 
     for(int i=0;i<sWidth;i++){ 
      sMem[threadIdx.x*gridDim.x+gridDim.x*j+i] = buf[X + i + (threadIdx.x + Y)*width + j*width].x; 
     } 
     j++; 
    } 
} 

就目前而言,在功能mediaFilterB,我只拷貝全局內存共享內存,但這需要很長時間,即在8000*8000像素的圖像中大約需要5秒。另一方面,沒有CUDA的順序算法需要23秒來計算圖像的中值濾波器。

我知道我在做過程中將全局內存複製到共享內存時出錯,並且我的算法效率很低,但我不知道如何才能更正它。

+0

kx和ky是x和y窗口的大小 – user2443862

+3

您忽略了對fillSmem()的調用。共享內存在塊中的所有線程之間共享。將全局複製到共享內存的最有效方法是,通過複製一小段全局到共享內存來啓動所有線程,從而使內存訪問得以合併。也就是說,塊中的線程協同工作,將數據塊處理的數據寫入共享內存,然後執行_syncthreads(),塊中的所有線程開始處理共享內存中的數據。 –

回答

3

我對這個問題提供了一個答案,將其從無人答覆的列表中刪除。

關於如何使用共享存儲器,以改善與CUDA中值濾波的經典例子是由Accelereyes開發的代碼和可供下載從下面的柱:

Median Filtering: CUDA tips and tricks

的想法是分配(BLOCK_WIDTH+2)x(BLOCK_HEIGHT+2)大小的共享內存。第一步,外部元件歸零。只有這些元素對應於真實圖像元素時,這些元素纔會被全局內存值填充,否則它們將保持爲零填充。

爲了方便起見,我在下面提供了一個完整的工作代碼。

#include <iostream> 
#include <fstream> 

using namespace std; 

#define BLOCK_WIDTH 16 
#define BLOCK_HEIGHT 16 

/*******************/ 
/* iDivUp FUNCTION */ 
/*******************/ 
int iDivUp(int a, int b){ return ((a % b) != 0) ? (a/b + 1) : (a/b); } 

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) exit(code); 
    } 
} 

/**********************************************/ 
/* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */ 
/**********************************************/ 
__global__ void Optimized_Kernel_Function_shared(unsigned short *Input_Image, unsigned short *Output_Image, int Image_Width, int Image_Height) 
{ 
    const int tx_l = threadIdx.x;       // --- Local thread x index 
    const int ty_l = threadIdx.y;       // --- Local thread y index 

    const int tx_g = blockIdx.x * blockDim.x + tx_l;  // --- Global thread x index 
    const int ty_g = blockIdx.y * blockDim.y + ty_l;  // --- Global thread y index 

    __shared__ unsigned short smem[BLOCK_WIDTH+2][BLOCK_HEIGHT+2]; 

    // --- Fill the shared memory border with zeros 
    if (tx_l == 0)      smem[tx_l] [ty_l+1] = 0; // --- left border 
    else if (tx_l == BLOCK_WIDTH-1)  smem[tx_l+2][ty_l+1] = 0; // --- right border 
    if (ty_l == 0) {     smem[tx_l+1][ty_l]  = 0; // --- upper border 
     if (tx_l == 0)     smem[tx_l] [ty_l]  = 0; // --- top-left corner 
     else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l]  = 0; // --- top-right corner 
     } else if (ty_l == BLOCK_HEIGHT-1) {smem[tx_l+1][ty_l+2] = 0; // --- bottom border 
     if (tx_l == 0)     smem[tx_l] [ty_l+2] = 0; // --- bottom-left corder 
     else if (tx_l == BLOCK_WIDTH-1) smem[tx_l+2][ty_l+2] = 0; // --- bottom-right corner 
    } 

    // --- Fill shared memory 
                    smem[tx_l+1][ty_l+1] =       Input_Image[ty_g*Image_Width + tx_g];  // --- center 
    if ((tx_l == 0)&&((tx_g > 0)))          smem[tx_l] [ty_l+1] = Input_Image[ty_g*Image_Width + tx_g-1];  // --- left border 
    else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))   smem[tx_l+2][ty_l+1] = Input_Image[ty_g*Image_Width + tx_g+1];  // --- right border 
    if ((ty_l == 0)&&(ty_g > 0)) {          smem[tx_l+1][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g]; // --- upper border 
      if ((tx_l == 0)&&((tx_g > 0)))         smem[tx_l] [ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- top-left corner 
      else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))  smem[tx_l+2][ty_l] = Input_Image[(ty_g-1)*Image_Width + tx_g+1]; // --- top-right corner 
     } else if ((ty_l == BLOCK_HEIGHT-1)&&(ty_g < Image_Height - 1)) { smem[tx_l+1][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g]; // --- bottom border 
     if ((tx_l == 0)&&((tx_g > 0)))         smem[tx_l] [ty_l+2] = Input_Image[(ty_g-1)*Image_Width + tx_g-1]; // --- bottom-left corder 
     else if ((tx_l == BLOCK_WIDTH-1)&&(tx_g < Image_Width - 1))  smem[tx_l+2][ty_l+2] = Input_Image[(ty_g+1)*Image_Width + tx_g+1]; // --- bottom-right corner 
    } 
    __syncthreads(); 

    // --- Pull the 3x3 window in a local array 
    unsigned short v[9] = { smem[tx_l][ty_l], smem[tx_l+1][ty_l],  smem[tx_l+2][ty_l], 
          smem[tx_l][ty_l+1], smem[tx_l+1][ty_l+1], smem[tx_l+2][ty_l+1], 
          smem[tx_l][ty_l+2], smem[tx_l+1][ty_l+2], smem[tx_l+2][ty_l+2] };  

    // --- Bubble-sort 
    for (int i = 0; i < 5; i++) { 
     for (int j = i + 1; j < 9; j++) { 
      if (v[i] > v[j]) { // swap? 
       unsigned short tmp = v[i]; 
       v[i] = v[j]; 
       v[j] = tmp; 
      } 
     } 
    } 

    // --- Pick the middle one 
    Output_Image[ty_g*Image_Width + tx_g] = v[4]; 
} 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    const int Image_Width = 1580; 
    const int Image_Height = 1050; 

    // --- Open data file 
    ifstream is;   is.open("C:\\Users\\user\\Documents\\Project\\Median_Filter\\Release\\Image_To_Be_Filtered.raw", ios::binary); 

    // --- Get file length 
    is.seekg(0, ios::end); 
    int dataLength = is.tellg(); 
    is.seekg(0, ios::beg); 

    // --- Read data from file and close file 
    unsigned short* Input_Image_Host = new unsigned short[dataLength * sizeof(char)/sizeof(unsigned short)]; 
    is.read((char*)Input_Image_Host,dataLength); 
    is.close(); 

    // --- CUDA warm up 
    unsigned short *forFirstCudaMalloc; gpuErrchk(cudaMalloc((void**)&forFirstCudaMalloc, dataLength * sizeof(unsigned short))); 
    gpuErrchk(cudaFree(forFirstCudaMalloc)); 

    // --- Allocate host and device memory spaces 
    unsigned short *Output_Image_Host = (unsigned short *)malloc(dataLength); 
    unsigned short *Input_Image; gpuErrchk(cudaMalloc((void**)&Input_Image, dataLength * sizeof(unsigned short))); 
    unsigned short *Output_Image; gpuErrchk(cudaMalloc((void**)&Output_Image, dataLength * sizeof(unsigned short))); 

    // --- Copy data from host to device 
    gpuErrchk(cudaMemcpy(Input_Image, Input_Image_Host, dataLength, cudaMemcpyHostToDevice));// copying Host Data To Device Memory For Filtering 

    // --- Grid and block sizes 
    const dim3 grid (iDivUp(Image_Width, BLOCK_WIDTH), iDivUp(Image_Height, BLOCK_HEIGHT), 1);  
    const dim3 block(BLOCK_WIDTH, BLOCK_HEIGHT, 1); 

    /**********************************************/ 
    /* KERNEL WITH OPTIMIZED USE OF SHARED MEMORY */ 
    /**********************************************/ 

    cudaFuncSetCacheConfig(Optimized_Kernel_Function_shared, cudaFuncCachePreferShared); 
    Optimized_Kernel_Function_shared<<<grid,block>>>(Input_Image, Output_Image, Image_Width, Image_Height); 
    gpuErrchk(cudaPeekAtLastError()); 
    gpuErrchk(cudaDeviceSynchronize()); 

    // --- Copy results back to the host 
    gpuErrchk(cudaMemcpy(Output_Image_Host, Output_Image, dataLength, cudaMemcpyDeviceToHost)); 

    // --- Open results file, write results and close the file 
    ofstream of2;   of2.open("C:\\Users\\angelo\\Documents\\Project\\Median_Filter\\Release\\Filtered_Image.raw", ios::binary); 
    of2.write((char*)Output_Image_Host, dataLength); 
    of2.close(); 

    cout << "\n Press Any Key To Exit..!!"; 
    gpuErrchk(cudaFree(Input_Image)); 

    delete Input_Image_Host; 
    delete Output_Image_Host; 

    return 0; 
} 
+0

@RobertCrovella謝謝Robert,我解決了你提出的問題。 – JackOLantern

+0

我有一個類似的實現,但CPU和GPU的結果不匹配。你能看到thsi鏈接:http://stackoverflow.com/questions/40062053/cuda-gpu-result-different-from-cpu?noredirect=1#comment67400015_40062053 – gpuguy