2016-07-31 111 views
-1

我有一個關於CUDA中圖像卷積的問題。當我測試它與小矩陣(16 * 16)時,事情沒問題。但是對於較大的矩陣,運行時結果總是變化的。 我認爲循環進入內核的問題是2。如何在CUDA中卷積圖像

__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv, 
        int img_width, const int img_height, 
        const int kernel_width, const int kernel_height) 
{ 

    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y; 

    float sum = 0; 
    for (int j = 0; j < kernel_height; j++) 
    { 
     for (int i = 0; i < kernel_width; i++) 
     { 
      int dX = x + i - kernel_width/2; 
      int dY = y + j - kernel_height/2; 

      if (dX < 0) 
       dX = 0; 

      if (dX >= img_width) 
       dX = img_width - 1; 

      if (dY < 0) 
       dY = 0; 

      if (dY >= img_height) 
       dY = img_height - 1; 


      const int idMat = j * kernel_width + i; 
      const int idPixel = dY * img_width + dX; 
      sum += (float)input[idPixel] * kernelConv[idMat]; 
     } 
    } 

    const int idOut = y * img_width + x; 
    out[idOut] = abs(sum); 

} 

    void image_convolution(float * input,float* output, int img_height, int img_width) 
{ 
    int kernel_height = 3; 
    int kernel_width = 3; 
    float kernel[] ={ 0,-0.25,0, 
        -0.25,1,-0.25, 
         0,-0.25,0 
        }; 
    float * mask = new float[kernel_height*kernel_width]; 
    for (int i = 0; i < kernel_height*kernel_width; i++) 
    { 
     mask[i] = kernel[i]; 
    } 

    float * d_input, * d_output, * d_kernel; 
    cudaMalloc(&d_input, img_width*img_height*sizeof(float)); 
    cudaMalloc(&d_output, img_width*img_height*sizeof(float)); 
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float)); 

    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice); 
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice); 
    dim3 blocksize(16,16); 
    dim3 gridsize; 
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y; 
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height); 
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost); 

    for (int i=0; i < img_width*img_height; i++) 
    { 
     printf("%d, ",(int)output[i]); 
    } 
    printf("\n\n"); 
} 

這裏是我的結果,我用24 * 24的圖像進行測試,我運行2的時候,我也寫簡單的函數來比較的輸出。

enter image description here

這裏是結果,當我比較輸出,有32型動物,在指數240,241 .... enter image description here

+0

@RobertCrovella對不起,我更新了代碼和輸出圖片,請給我一些建議。 – Jim

+0

如果您想在問題中包含基於文本的數據,則建議您複製並粘貼實際文本(並對其進行格式化!),而不是將文本圖片粘貼到問題中。 –

回答

1

你已經在你的程序做了一個相當常見的錯誤。當你創建線程的這樣一個網格:

dim3 blocksize(16,16); 
dim3 gridsize; 
gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
gridsize.y=(img_height+blocksize.y-1)/blocksize.y; 

特意要創建(通常)額外在每個維度線,以完全覆蓋問題空間(即圖像尺寸)。這沒有什麼錯。

但是,這意味着我們將啓動額外的線程,它們位於有效圖像維度之外。我們必須確保這些線程沒有什麼。通常的做法是向內核添加線程檢查,以便有效圖像維度外的線程不執行任何操作。這裏有一個修改過的內核和整個例子顯示出變化:

$ cat t1219.cu 
#include <iostream> 
#include <cstdlib> 

const int iw = 1025; 
const int ih = 1025; 
const int rng = 10; 

__global__ void image_convolution_kernel(float *input, float *out, float *kernelConv, 
        int img_width, const int img_height, 
        const int kernel_width, const int kernel_height) 
{ 

    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y; 
    if ((x < img_width) && (y < img_height)){ // thread check 
     float sum = 0; 
     for (int j = 0; j < kernel_height; j++) 
     { 
     for (int i = 0; i < kernel_width; i++) 
     { 
      int dX = x + i - kernel_width/2; 
      int dY = y + j - kernel_height/2; 

      if (dX < 0) 
       dX = 0; 

      if (dX >= img_width) 
       dX = img_width - 1; 

      if (dY < 0) 
       dY = 0; 

      if (dY >= img_height) 
       dY = img_height - 1; 


      const int idMat = j * kernel_width + i; 
      const int idPixel = dY * img_width + dX; 
      sum += (float)input[idPixel] * kernelConv[idMat]; 
     } 
     } 

     const int idOut = y * img_width + x; 
     out[idOut] = abs(sum); 
    } 

} 

    void image_convolution(float * input,float* output, int img_height, int img_width) 
{ 
    int kernel_height = 3; 
    int kernel_width = 3; 
    float kernel[] ={ 0,-0.25,0, 
        -0.25,1,-0.25, 
         0,-0.25,0 
        }; 
    float * mask = new float[kernel_height*kernel_width]; 
    for (int i = 0; i < kernel_height*kernel_width; i++) 
    { 
     mask[i] = kernel[i]; 
    } 

    float * d_input, * d_output, * d_kernel; 
    cudaMalloc(&d_input, img_width*img_height*sizeof(float)); 
    cudaMalloc(&d_output, img_width*img_height*sizeof(float)); 
    cudaMalloc(&d_kernel, kernel_height*kernel_width*sizeof(float)); 

    cudaMemcpy(d_input, input, img_width*img_height*sizeof(float), cudaMemcpyHostToDevice); 
    cudaMemcpy(d_kernel, mask, kernel_height*kernel_width*sizeof(float), cudaMemcpyHostToDevice); 
    dim3 blocksize(16,16); 
    dim3 gridsize; 
    gridsize.x=(img_width+blocksize.x-1)/blocksize.x; 
    gridsize.y=(img_height+blocksize.y-1)/blocksize.y; 
    image_convolution_kernel<<<gridsize,blocksize>>>(d_input,d_output,d_kernel,img_width,img_height,kernel_width,kernel_height); 
    cudaMemcpy(output, d_output, img_width*img_height*sizeof(float), cudaMemcpyDeviceToHost); 
} 

int main(){ 

    float *in, *out; 
    int is = ih*iw; 
    in = new float[is]; 
    out = new float[is]; 
    for (int i = 0; i < is; i++) {in[i] = rand()%rng; out[i] = -1;} 
    image_convolution(in,out, ih, iw); 
    for (int iy = 1; iy < ih-1; iy++) 
    for (int ix = 1; ix < iw-1; ix++){ 
     float temp = abs(-0.25 * (in[iy*iw + ix -1] + in[iy*iw + ix +1] + in[(iy-1)*iw + ix] + in[(iy+1)*iw + ix]) + in[iy*iw+ix]); 
     if (out[iy*iw+ix] != temp) {std::cout << "mismatch x: " << ix << " y: " << iy << " was: " << out[iy*iw+ix] << " should be: " << temp << std::endl; return 1;}} 
    return 0; 
} 
$ nvcc -o t1219 t1219.cu 
$ cuda-memcheck ./t1219 
========= CUDA-MEMCHECK 
========= ERROR SUMMARY: 0 errors 
$ 

對於圖像尺寸要(這是真的爲我以前的測試案例)塊大小(16,16)的整倍數的這個問題不會出現 - 代碼將正常工作。對於所有其他測試用例,我們需要這樣的線程檢查。

+0

感謝您的幫助,我解決了問題:) – Jim