如何在CUDA中卷積圖像

-1

我有一個關於CUDA中圖像卷積的問題。當我用小矩陣（16 * 16）測試時，一切正常。但是對於較大的矩陣，每次運行的結果都不一樣。我認爲問題出在內核中的兩層for循環。如何在CUDA中卷積圖像

// Convolves a row-major float image with a small row-major mask and writes
// the absolute value of every response to `out`.
//
// Launch layout: a 2D grid of 2D blocks with one thread per output pixel.
// The grid is allowed to overshoot the image (ceil-division sizing); threads
// that land outside the image exit without touching memory.
// Border handling: taps that fall outside the image are clamped to the
// nearest edge pixel.
//
//   input          source image, img_width * img_height floats
//   out            destination image, img_width * img_height floats
//   kernelConv     mask coefficients, kernel_width * kernel_height floats
//   img_width      image width in pixels
//   img_height     image height in pixels
//   kernel_width   mask width in taps
//   kernel_height  mask height in taps
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv, 
        int img_width, const int img_height, 
        const int kernel_width, const int kernel_height) 
{ 
    const int x = threadIdx.x + blockIdx.x * blockDim.x; 
    const int y = threadIdx.y + blockIdx.y * blockDim.y; 

    // Thread check. Without it a surplus thread with x >= img_width still
    // computes y * img_width + x, which is the slot of a pixel on a later
    // row (racing with that pixel's own thread), and a surplus thread with
    // y >= img_height writes past the end of `out`. That race is why results
    // changed from run to run for sizes that are not a multiple of the block.
    if (x >= img_width || y >= img_height) 
        return; 

    float sum = 0.0f; 
    for (int j = 0; j < kernel_height; j++) 
    { 
        for (int i = 0; i < kernel_width; i++) 
        { 
            // Image coordinate addressed by mask tap (i, j), centred on (x, y).
            int dX = x + i - kernel_width/2; 
            int dY = y + j - kernel_height/2; 

            // Clamp to the image so border pixels reuse the nearest edge value.
            if (dX < 0) 
                dX = 0; 
            if (dX >= img_width) 
                dX = img_width - 1; 
            if (dY < 0) 
                dY = 0; 
            if (dY >= img_height) 
                dY = img_height - 1; 

            const int idMat = j * kernel_width + i; 
            const int idPixel = dY * img_width + dX; 
            sum += input[idPixel] * kernelConv[idMat]; 
        } 
    } 

    const int idOut = y * img_width + x; 
    out[idOut] = fabsf(sum); 
} 

    // Host wrapper: filters `input` with a fixed 3x3 mask on the GPU, copies
    // the result into `output` and prints every output value (truncated to
    // int) to stdout.
    //
    //   input       host image, img_width * img_height floats, row-major
    //   output      host buffer for the result, same size as `input`
    //   img_height  image height in pixels (note: height comes BEFORE width)
    //   img_width   image width in pixels
    //
    // On a CUDA failure the error is reported on stderr and nothing is
    // printed to stdout; device buffers are released on every path.
    void image_convolution(float * input,float* output, int img_height, int img_width) 
{ 
    const int kernel_height = 3; 
    const int kernel_width = 3; 
    // Centre weight 1, four direct neighbours -0.25 each, corners 0.
    const float kernel[] ={ 0.0f,-0.25f,0.0f, 
        -0.25f,1.0f,-0.25f, 
         0.0f,-0.25f,0.0f 
        }; 

    // An empty image would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to compute or print.
    if (img_width <= 0 || img_height <= 0) 
    { 
     printf("\n\n"); 
     return; 
    } 

    // Sizes are computed in size_t so width * height * 4 cannot overflow int.
    const size_t pixel_count = (size_t)img_width * (size_t)img_height; 
    const size_t image_bytes = pixel_count * sizeof(float); 
    const size_t mask_bytes = sizeof(kernel); 

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an earlier allocation failed.
    float * d_input = 0, * d_output = 0, * d_kernel = 0; 

    cudaError_t err = cudaMalloc(&d_input, image_bytes); 
    if (err == cudaSuccess) err = cudaMalloc(&d_output, image_bytes); 
    if (err == cudaSuccess) err = cudaMalloc(&d_kernel, mask_bytes); 
    if (err == cudaSuccess) err = cudaMemcpy(d_input, input, image_bytes, cudaMemcpyHostToDevice); 
    if (err == cudaSuccess) err = cudaMemcpy(d_kernel, kernel, mask_bytes, cudaMemcpyHostToDevice); 

    if (err == cudaSuccess) 
    { 
     dim3 blocksize(16,16); 
     dim3 gridsize; 
     // Ceil-division: the grid is rounded up to whole blocks per dimension.
     gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
     gridsize.y=(img_height+blocksize.y-1)/blocksize.y; 
     image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height); 
     // A launch does not return a status; configuration errors show up here.
     err = cudaGetLastError(); 
    } 

    // Blocking copy: also surfaces any fault raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(output, d_output, image_bytes, cudaMemcpyDeviceToHost); 

    cudaFree(d_input); 
    cudaFree(d_output); 
    cudaFree(d_kernel); 

    if (err != cudaSuccess) 
    { 
     fprintf(stderr, "image_convolution: CUDA error: %s\n", cudaGetErrorString(err)); 
     return; 
    } 

    for (size_t i=0; i < pixel_count; i++) 
    { 
     printf("%d, ",(int)output[i]); 
    } 
    printf("\n\n"); 
}

這裏是我的結果：我用24 * 24的圖像進行測試，運行了兩次，並且還寫了一個簡單的函數來比較兩次的輸出。

這裏是比較輸出的結果：有32處差異，位於索引240、241……

來源

2016-07-31 Jim

@RobertCrovella對不起，我更新了代碼和輸出圖片，請給我一些建議。 – Jim

如果您想在問題中包含基於文本的數據，則建議您複製並粘貼實際文本（並對其進行格式化！），而不是將文本圖片粘貼到問題中。 –

你在程序中犯了一個相當常見的錯誤。當你像這樣創建線程網格時：

// 16 x 16 threads per block.
dim3 blocksize(16,16); 
dim3 gridsize; 
// Round the block count up in each dimension so the grid covers the whole
// image; when a dimension is not a multiple of 16 the last block in that
// dimension contains threads that lie outside the image.
gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
gridsize.y=(img_height+blocksize.y-1)/blocksize.y;

你是特意（通常情況下）在每個維度上創建額外的線程，以便完全覆蓋問題空間（即圖像尺寸）。這沒有什麼錯。

但是，這意味着我們將啓動額外的線程，它們位於有效圖像維度之外。我們必須確保這些線程不做任何事情。通常的做法是向內核添加線程檢查，以便有效圖像維度外的線程不執行任何操作。這裏有一個修改過的內核和一個完整的例子來展示這一改動：

$ cat t1219.cu 
#include <iostream> 
#include <cstdlib> 

// Test image width and height used by main(). 1025 is deliberately not a
// multiple of the 16x16 block size, so the launch contains partial blocks.
const int iw = 1025; 
const int ih = 1025; 
// Input pixels are generated as rand() % rng, i.e. integers in [0, rng).
const int rng = 10; 

// Applies a kernel_width x kernel_height mask to a row-major float image and
// stores the magnitude of each response in `out`.
//
// Expected launch: 2D grid of 2D blocks, one thread per pixel. The grid may
// be larger than the image; threads outside it return immediately.
// Taps that would read outside the image are clamped to the closest border
// pixel.
__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv, 
        int img_width, const int img_height, 
        const int kernel_width, const int kernel_height) 
{ 
    const int col = blockIdx.x * blockDim.x + threadIdx.x; 
    const int row = blockIdx.y * blockDim.y + threadIdx.y; 

    // Thread check: surplus threads of the rounded-up grid do nothing.
    if (col >= img_width || row >= img_height) 
        return; 

    const int half_w = kernel_width / 2; 
    const int half_h = kernel_height / 2; 

    float acc = 0; 
    for (int ky = 0; ky < kernel_height; ++ky) 
    { 
        // Source row for this mask row, clamped into [0, img_height - 1].
        const int src_row = min(max(row + ky - half_h, 0), img_height - 1); 
        const float *image_row = input + src_row * img_width; 
        const float *mask_row = kernelConv + ky * kernel_width; 

        for (int kx = 0; kx < kernel_width; ++kx) 
        { 
            // Source column for this tap, clamped into [0, img_width - 1].
            const int src_col = min(max(col + kx - half_w, 0), img_width - 1); 
            acc += image_row[src_col] * mask_row[kx]; 
        } 
    } 

    out[row * img_width + col] = fabsf(acc); 
} 

    // Host wrapper: filters `input` with a fixed 3x3 mask on the GPU and
    // copies the result into `output`.
    //
    //   input       host image, img_width * img_height floats, row-major
    //   output      host buffer for the result, same size as `input`
    //   img_height  image height in pixels (note: height comes BEFORE width)
    //   img_width   image width in pixels
    //
    // On a CUDA failure the error is reported on std::cerr and `output` is
    // left as it was; device buffers are released on every path.
    void image_convolution(float * input,float* output, int img_height, int img_width) 
{ 
    const int kernel_height = 3; 
    const int kernel_width = 3; 
    // Centre weight 1, four direct neighbours -0.25 each, corners 0.
    const float kernel[] ={ 0.0f,-0.25f,0.0f, 
        -0.25f,1.0f,-0.25f, 
         0.0f,-0.25f,0.0f 
        }; 

    // An empty image would produce a zero-sized grid, which is an invalid
    // launch configuration; there is nothing to compute.
    if (img_width <= 0 || img_height <= 0) 
        return; 

    // Sizes are computed in size_t so width * height * 4 cannot overflow int.
    const size_t image_bytes = (size_t)img_width * (size_t)img_height * sizeof(float); 
    const size_t mask_bytes = sizeof(kernel); 

    // Null-initialised so the unconditional cudaFree calls below are safe
    // even when an earlier allocation failed.
    float * d_input = 0, * d_output = 0, * d_kernel = 0; 

    cudaError_t err = cudaMalloc(&d_input, image_bytes); 
    if (err == cudaSuccess) err = cudaMalloc(&d_output, image_bytes); 
    if (err == cudaSuccess) err = cudaMalloc(&d_kernel, mask_bytes); 
    if (err == cudaSuccess) err = cudaMemcpy(d_input, input, image_bytes, cudaMemcpyHostToDevice); 
    if (err == cudaSuccess) err = cudaMemcpy(d_kernel, kernel, mask_bytes, cudaMemcpyHostToDevice); 

    if (err == cudaSuccess) 
    { 
     dim3 blocksize(16,16); 
     dim3 gridsize; 
     // Ceil-division: the grid is rounded up to whole blocks per dimension,
     // so the kernel's thread check is required for the surplus threads.
     gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
     gridsize.y=(img_height+blocksize.y-1)/blocksize.y; 
     image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height); 
     // A launch does not return a status; configuration errors show up here.
     err = cudaGetLastError(); 
    } 

    // Blocking copy: also surfaces any fault raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(output, d_output, image_bytes, cudaMemcpyDeviceToHost); 

    cudaFree(d_input); 
    cudaFree(d_output); 
    cudaFree(d_kernel); 

    if (err != cudaSuccess) 
        std::cerr << "image_convolution: CUDA error: " << cudaGetErrorString(err) << std::endl; 
} 

// Self-check: runs image_convolution on a random iw x ih image and compares
// every interior pixel against a host-computed reference.
// Returns 0 when all interior pixels match, 1 on the first mismatch.
int main(){ 

    const int is = ih*iw; 
    float *in = new float[is]; 
    float *out = new float[is]; 
    // out is pre-filled with -1 so pixels the GPU never wrote are detectable
    // (the filter output is an absolute value and therefore never negative).
    for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;} 
    image_convolution(in,out, ih, iw); 

    int status = 0; 
    // Only interior pixels are checked, so the reference needs no border
    // handling.
    for (int iy = 1; iy < ih-1 && status == 0; iy++) 
    for (int ix = 1; ix < iw-1; ix++){ 
     const int c = iy*iw + ix; 
     // The magnitude is taken by hand: an unqualified abs() on a floating
     // value may bind to int abs(int) when only <cstdlib> is visible, which
     // would truncate the reference.
     float temp = in[c] - 0.25f * (in[c-1] + in[c+1] + in[c-iw] + in[c+iw]); 
     if (temp < 0) temp = -temp; 
     // Exact comparison is intentional: inputs are small integers and the
     // weights are 1 and -0.25, so every intermediate is exactly
     // representable in float on both host and device.
     if (out[c] != temp) { 
      std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[c] << " should be: " << temp << std::endl; 
      status = 1; 
      break; 
     } 
    } 

    // Release the host buffers on both the success and the mismatch path.
    delete[] in; 
    delete[] out; 
    return status; 
} 
$ nvcc -o t1219 t1219.cu 
$ cuda-memcheck ./t1219 
========= CUDA-MEMCHECK 
========= ERROR SUMMARY: 0 errors 
$

對於圖像尺寸是塊大小（16,16）整數倍的情況（我之前的測試用例正是如此），這個問題不會出現——代碼將正常工作。對於所有其他測試用例，我們都需要這樣的線程檢查。

來源

2016-07-31 22:32:37

感謝您的幫助，我解決了問題:) – Jim

如何在CUDA中卷積圖像

回答

相關問題