2D卷積不正確的結果Cuda常量內存

我在內核代碼中掙扎。 我已經更新了這個包含支持文件，但提供了這些文件，並且應該是正確的。2D卷積不正確的結果Cuda常量內存

這是我的第一個GPU程序，我花了幾個小時嘗試新的東西，我似乎無法得到這個權利。它正在編譯和運行，但結果不正確。

我基本上很難理解我需要做什麼不同，因爲這個內核給出了不正確的結果。我試圖加載輸入圖像的瓷磚到共享內存（Ns [] []，我認爲我已經做得正確），並在輸入圖像瓷磚（我正在努力）上應用篩選器。

如果有經驗的人可以幫助我確定出錯的地方，並告訴我如何解決問題，我將不勝感激。如果我錯誤地問了這個問題，我很感激你的時間和歉意。

main.cu：

#include <stdio.h> 
#include "support.h" 
#include "kernel.cu" 
#include <time.h> 

int main(int argc, char* argv[]){ 
Timer timer; 
time_t t; 


// Initialize host variables ---------------------------------------------- 

printf("\nSetting up the problem..."); fflush(stdout); 
startTime(&timer); 

Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image 
Matrix N_d, P_d; 
unsigned imageHeight, imageWidth; 
cudaError_t cuda_ret; 
dim3 dim_grid, dim_block; 

/* Read image dimensions */ 
if (argc == 1) { 
    imageHeight = 600; 
    imageWidth = 1000; 
} else if (argc == 2) { 
    imageHeight = atoi(argv[1]); 
    imageWidth = atoi(argv[1]); 
} else if (argc == 3) { 
    imageHeight = atoi(argv[1]); 
    imageWidth = atoi(argv[2]); 
} else { 
    printf("\n Invalid input parameters!" 
     "\n Usage: ./convolution   # Image is 600 x 1000" 
     "\n Usage: ./convolution <m>  # Image is m x m" 
     "\n Usage: ./convolution <m> <n> # Image is m x n" 
     "\n"); 
    exit(0); 
} 

/* Allocate host memory */ 
M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE); 
N_h = allocateMatrix(imageHeight, imageWidth); 
P_h = allocateMatrix(imageHeight, imageWidth); 

/* Initialize filter and images */ 
initMatrix(M_h); 
initMatrix(N_h); 

stopTime(&timer); printf("%f s\n", elapsedTime(timer)); 
printf(" Image: %u x %u\n", imageHeight, imageWidth); 
printf(" Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE); 

// Allocate device variables ---------------------------------------------- 

printf("Allocating device variables..."); fflush(stdout); 
startTime(&timer); 

N_d = allocateDeviceMatrix(imageHeight, imageWidth); 
P_d = allocateDeviceMatrix(imageHeight, imageWidth); 

cudaDeviceSynchronize(); 
stopTime(&timer); printf("%f s\n", elapsedTime(timer)); 

// Copy host variables to device ------------------------------------------ 

printf("Copying data from host to device..."); fflush(stdout); 
startTime(&timer); 

/* Copy image to device global memory */ 
copyToDeviceMatrix(N_d, N_h); 
cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float)); 

dim_grid = dim3(((N_h.width/BLOCK_SIZE) + 1), ((N_h.height/BLOCK_SIZE) + 1)); 
dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE); 


cudaDeviceSynchronize(); 
stopTime(&timer); printf("%f s\n", elapsedTime(timer)); 

// Launch kernel ---------------------------------------------------------- 
printf("Launching kernel..."); fflush(stdout); 
startTime(&timer); 


convolution<<<dim_grid, dim_block>>>(N_d, P_d); 

cuda_ret = cudaDeviceSynchronize(); 
if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel"); 

cudaDeviceSynchronize(); 
stopTime(&timer); printf("%f s\n", elapsedTime(timer)); 

// Copy device variables from host ---------------------------------------- 

printf("Copying data from device to host..."); fflush(stdout); 
startTime(&timer); 

copyFromDeviceMatrix(P_h, P_d); 

cudaDeviceSynchronize(); 
stopTime(&timer); printf("%f s\n", elapsedTime(timer)); 

// Verify correctness ----------------------------------------------------- 

printf("Verifying results..."); fflush(stdout); 

verify(M_h, N_h, P_h); 

// Free memory ------------------------------------------------------------ 

freeMatrix(M_h); 
freeMatrix(N_h); 
freeMatrix(P_h); 
freeDeviceMatrix(N_d); 
freeDeviceMatrix(P_d); 

return 0; 
}

kernel.cu：

__constant__ float M_c[FILTER_SIZE][FILTER_SIZE]; 
__global__ void convolution(Matrix N, Matrix P){ 

__shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 -1]; 
int i, j; 
float output = 0.0f; 
int tx = threadIdx.x; 
int ty = threadIdx.y; 
int row_o = blockIdx.y * TILE_SIZE + ty; 
int col_o = blockIdx.x * TILE_SIZE + tx; 
int row_i = row_o - 2; 
int col_i = col_o - 2;     
if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){ 
     Ns[ty][tx] = N.elements[row_i * N.width + col_i];   
} 
else{ 
     Ns[ty][tx] = 0.0f; 
} 
__syncthreads(); 
if(ty < TILE_SIZE && tx < TILE_SIZE){ 
     for(i = 0; i < 5; i++){ 
       for(j = 0; j < 5; j++){ 
       output += M_c[i][j] * Ns[i + ty][j + tx]; 
       } 
     } 
} 
if(row_o < P.height && col_o < P.width){ 
     P.elements[row_o * P.width + col_o] = output; 
} 
}

support.h：

#ifndef __FILEH__ 
#define __FILEH__ 

#include <sys/time.h> 

typedef struct { 
    struct timeval startTime; 
    struct timeval endTime; 
} Timer; 

// Matrix Structure declaration 
typedef struct { 
    unsigned int width; 
    unsigned int height; 
    unsigned int pitch; 
    float* elements; 
} Matrix; 

#define FILTER_SIZE 5 
#define TILE_SIZE 12 
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1) 

Matrix allocateMatrix(unsigned height, unsigned width); 
void initMatrix(Matrix mat); 
Matrix allocateDeviceMatrix(unsigned height, unsigned width); 
void copyToDeviceMatrix(Matrix dst, Matrix src); 
void copyFromDeviceMatrix(Matrix dst, Matrix src); 
void verify(Matrix M, Matrix N, Matrix P); 
void freeMatrix(Matrix mat); 
void freeDeviceMatrix(Matrix mat); 
void startTime(Timer* timer); 
void stopTime(Timer* timer); 
float elapsedTime(Timer timer); 

#define FATAL(msg, ...) \ 
do {\ 
    fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\ 
    exit(-1);\ 
} while(0) 

#if __BYTE_ORDER != __LITTLE_ENDIAN 
# error "File I/O is not implemented for this system: wrong endianness." 
#endif 
#endif

support.cu：

#include <stdlib.h> 
#include <stdio.h> 

#include "support.h" 

Matrix allocateMatrix(unsigned height, unsigned width) 
{ 
    Matrix mat; 
    mat.height = height; 
    mat.width = mat.pitch = width; 
    mat.elements = (float*)malloc(height*width*sizeof(float)); 
    if(mat.elements == NULL) FATAL("Unable to allocate host"); 

    return mat; 
} 

void initMatrix(Matrix mat) 
{ 
    for (unsigned int i=0; i < mat.height*mat.width; i++) { 
     mat.elements[i] = (rand()%100)/100.00; 
    } 
} 

Matrix allocateDeviceMatrix(unsigned height, unsigned width) 
{ 
    Matrix mat; 
    cudaError_t cuda_ret; 

    mat.height = height; 
    mat.width = mat.pitch = width; 
    cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float)); 
    if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory"); 

    return mat; 
} 

void copyToDeviceMatrix(Matrix dst, Matrix src) 
{ 
    cudaError_t cuda_ret; 
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice); 
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device"); 
} 

void copyFromDeviceMatrix(Matrix dst, Matrix src) 
{ 
    cudaError_t cuda_ret; 
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost); 
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device"); 
} 

void verify(Matrix M, Matrix N, Matrix P) { 

    const float relativeTolerance = 1e-6; 

    for(int row = 0; row < N.height; ++row) { 
    for(int col = 0; col < N.width; ++col) { 
     float sum = 0.0f; 
     for(int i = 0; i < M.height; ++i) { 
     for(int j = 0; j < M.width; ++j) { 
      int iN = row - M.height/2 + i; 
      int jN = col - M.width/2 + j; 
      if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) { 
       sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN]; 
      } 
     } 
     } 
     float relativeError = (sum - P.elements[row*P.width + col])/sum; 
     if (relativeError > relativeTolerance 
     || relativeError < -relativeTolerance) { 
     printf("TEST FAILED\n\n"); 
     exit(0); 
     } 
    } 
    } 
    printf("TEST PASSED\n\n"); 

} 

void freeMatrix(Matrix mat) 
{ 
    free(mat.elements); 
    mat.elements = NULL; 
} 

void freeDeviceMatrix(Matrix mat) 
{ 
    cudaFree(mat.elements); 
    mat.elements = NULL; 
} 

void startTime(Timer* timer) { 
    gettimeofday(&(timer->startTime), NULL); 
} 

void stopTime(Timer* timer) { 
    gettimeofday(&(timer->endTime), NULL); 
} 

float elapsedTime(Timer timer) { 
    return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \ 
       + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6)); 
}

來源

2016-03-06 GiH

如果你的代碼沒有產生正確的結果，那麼發佈不完整的，不可編譯的代碼對任何人都沒有幫助。錯誤出現在您選擇不發佈的代碼中。您的代碼不包含API錯誤檢查。你確定沒有生成運行時錯誤嗎？如果您使用cuda-memcheck運行程序，會發生什麼 – talonmies

我編輯原始帖子以包含該信息。 – GiH

一組的問題是在這裏：

cudaMemcpyToSymbol(M_h, M_c,FILTER_SIZE*sizeof(float));

如果用cuda-memcheck運行你的代碼會點你就在這條線是一個問題。

第一個參數應該是目的地符號，即M_c，並且所述第二參數應該是主機源指針，即M_h。
此外，不應該是FILTER_SIZE*FILTER_SIZE？您要傳輸的數據大小是不是等於維度平方？
最後，M_h不是有效的源指針。你應該使用M_h.elements。

因此，像這樣：

cudaMemcpyToSymbol(M_c, M_h.elements,FILTER_SIZE*FILTER_SIZE*sizeof(float));

我不相信這能解決您的代碼中的所有問題。要繼續調試，我會在GPU結果中打印出與您的verify例程不匹配的一個元素，然後處理該元素的算術運算。如果有幫助，請在設備代碼中使用printf。

未來，請在此處尋求幫助之前，使用cuda-memcheck運行您的代碼。即使你不理解輸出結果，對於那些試圖幫助你的人也會很有幫助。

來源

2016-03-06 21:13:01

我很欣賞你接受幫助的時間。這是我第一次聽說過memcheck，因此我將在未來進行一些研究和使用。謝謝。 – GiH

2D卷積不正確的結果Cuda常量內存

回答

相關問題