2D Convolution Incorrect Results, CUDA Constant Memory

I am struggling with the kernel code. I have updated this post to include the support files; they were provided to me and should be correct.
This is my first GPU program and I have spent several hours trying different things, but I can't seem to get it right. It compiles and runs, but the results are incorrect.
Basically, I am having a hard time understanding what I need to do differently, because this kernel gives incorrect results. I am trying to load a tile of the input image into shared memory (Ns[][], which I think I have done correctly) and then apply the filter to that input image tile (which is where I am struggling).
I would greatly appreciate it if someone more experienced could help me pinpoint where I went wrong and explain how to fix it. Thank you for your time, and my apologies if I have asked this question the wrong way.
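For reference, the tile geometry implied by the constants in support.h below works out like this (just restating the numbers already in the code, in comment form):

    // FILTER_SIZE = 5, TILE_SIZE = 12
    // BLOCK_SIZE  = TILE_SIZE + FILTER_SIZE - 1 = 16
    // Each 16 x 16 thread block loads a 16 x 16 input tile into Ns[][]:
    // a 12 x 12 interior plus a halo of FILTER_SIZE/2 = 2 pixels on each side.
    // Only the first 12 x 12 threads of the block are meant to produce output pixels.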
main.cu:
#include <stdio.h>
#include "support.h"
#include "kernel.cu"
#include <time.h>
int main(int argc, char* argv[]){
    Timer timer;
    time_t t;

    // Initialize host variables ----------------------------------------------
    printf("\nSetting up the problem..."); fflush(stdout);
    startTime(&timer);

    Matrix M_h, N_h, P_h; // M: filter, N: input image, P: output image
    Matrix N_d, P_d;
    unsigned imageHeight, imageWidth;
    cudaError_t cuda_ret;
    dim3 dim_grid, dim_block;

    /* Read image dimensions */
    if (argc == 1) {
        imageHeight = 600;
        imageWidth = 1000;
    } else if (argc == 2) {
        imageHeight = atoi(argv[1]);
        imageWidth = atoi(argv[1]);
    } else if (argc == 3) {
        imageHeight = atoi(argv[1]);
        imageWidth = atoi(argv[2]);
    } else {
        printf("\n Invalid input parameters!"
               "\n Usage: ./convolution # Image is 600 x 1000"
               "\n Usage: ./convolution <m> # Image is m x m"
               "\n Usage: ./convolution <m> <n> # Image is m x n"
               "\n");
        exit(0);
    }

    /* Allocate host memory */
    M_h = allocateMatrix(FILTER_SIZE, FILTER_SIZE);
    N_h = allocateMatrix(imageHeight, imageWidth);
    P_h = allocateMatrix(imageHeight, imageWidth);

    /* Initialize filter and images */
    initMatrix(M_h);
    initMatrix(N_h);

    stopTime(&timer); printf("%f s\n", elapsedTime(timer));
    printf(" Image: %u x %u\n", imageHeight, imageWidth);
    printf(" Mask: %u x %u\n", FILTER_SIZE, FILTER_SIZE);

    // Allocate device variables ----------------------------------------------
    printf("Allocating device variables..."); fflush(stdout);
    startTime(&timer);

    N_d = allocateDeviceMatrix(imageHeight, imageWidth);
    P_d = allocateDeviceMatrix(imageHeight, imageWidth);

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy host variables to device ------------------------------------------
    printf("Copying data from host to device..."); fflush(stdout);
    startTime(&timer);

    /* Copy image to device global memory */
    copyToDeviceMatrix(N_d, N_h);
    cudaMemcpyToSymbol(M_h, M_c, FILTER_SIZE*sizeof(float));

    dim_grid = dim3(((N_h.width/BLOCK_SIZE) + 1), ((N_h.height/BLOCK_SIZE) + 1));
    dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);

    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Launch kernel ----------------------------------------------------------
    printf("Launching kernel..."); fflush(stdout);
    startTime(&timer);

    convolution<<<dim_grid, dim_block>>>(N_d, P_d);
    cuda_ret = cudaDeviceSynchronize();
    if(cuda_ret != cudaSuccess) FATAL("Unable to launch/execute kernel");
    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Copy device variables to host ------------------------------------------
    printf("Copying data from device to host..."); fflush(stdout);
    startTime(&timer);

    copyFromDeviceMatrix(P_h, P_d);
    cudaDeviceSynchronize();
    stopTime(&timer); printf("%f s\n", elapsedTime(timer));

    // Verify correctness -----------------------------------------------------
    printf("Verifying results..."); fflush(stdout);
    verify(M_h, N_h, P_h);

    // Free memory ------------------------------------------------------------
    freeMatrix(M_h);
    freeMatrix(N_h);
    freeMatrix(P_h);
    freeDeviceMatrix(N_d);
    freeDeviceMatrix(P_d);

    return 0;
}
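Two host-side details in main.cu above look worth double-checking; the sketch below shows what the calls would look like if these are indeed the culprits (an assumption, not a confirmed fix). First, cudaMemcpyToSymbol takes the device symbol as its first argument and the host source pointer as its second, and the size should cover the whole 5 x 5 filter rather than a single row. Second, each block only produces TILE_SIZE x TILE_SIZE output pixels, so the grid should be sized by TILE_SIZE rather than BLOCK_SIZE; checking the return code also helps surface this kind of misuse:

    /* Copy the filter to constant memory: symbol first, then host source, full size */
    cuda_ret = cudaMemcpyToSymbol(M_c, M_h.elements, FILTER_SIZE * FILTER_SIZE * sizeof(float));
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy filter to constant memory");

    /* One block per TILE_SIZE x TILE_SIZE patch of output (rounded up) */
    dim_grid  = dim3((N_h.width  + TILE_SIZE - 1) / TILE_SIZE,
                     (N_h.height + TILE_SIZE - 1) / TILE_SIZE);
    dim_block = dim3(BLOCK_SIZE, BLOCK_SIZE);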
kernel.cu:
__constant__ float M_c[FILTER_SIZE][FILTER_SIZE];

__global__ void convolution(Matrix N, Matrix P){
    // Shared-memory tile: TILE_SIZE interior pixels plus the filter halo
    __shared__ float Ns[TILE_SIZE + 5 - 1][TILE_SIZE + 5 - 1];
    int i, j;
    float output = 0.0f;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Output pixel this thread is responsible for
    int row_o = blockIdx.y * TILE_SIZE + ty;
    int col_o = blockIdx.x * TILE_SIZE + tx;

    // Corresponding input pixel, shifted back by the filter radius (2)
    int row_i = row_o - 2;
    int col_i = col_o - 2;

    // Each thread loads one input element into shared memory,
    // padding out-of-bounds positions with zero
    if((row_i >= 0) && (row_i < N.height) && (col_i >= 0) && (col_i < N.width)){
        Ns[ty][tx] = N.elements[row_i * N.width + col_i];
    }
    else{
        Ns[ty][tx] = 0.0f;
    }
    __syncthreads();

    // Only the first TILE_SIZE x TILE_SIZE threads compute a 5x5 weighted sum
    if(ty < TILE_SIZE && tx < TILE_SIZE){
        for(i = 0; i < 5; i++){
            for(j = 0; j < 5; j++){
                output += M_c[i][j] * Ns[i + ty][j + tx];
            }
        }
    }

    if(row_o < P.height && col_o < P.width){
        P.elements[row_o * P.width + col_o] = output;
    }
}
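A second thing worth double-checking in the kernel above: the final store to P.elements is not restricted to the TILE_SIZE x TILE_SIZE threads that actually computed a value, so the halo threads (tx >= TILE_SIZE or ty >= TILE_SIZE) can write their 0.0f into output pixels that belong to a neighbouring block, racing with the threads that write the correct value there. If that turns out to be part of the problem, a guarded store would look roughly like this (sketch only, same variable names as above):

    // Only threads that computed a convolution result write to global memory
    if(ty < TILE_SIZE && tx < TILE_SIZE && row_o < P.height && col_o < P.width){
        P.elements[row_o * P.width + col_o] = output;
    }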
support.h:
#ifndef __FILEH__
#define __FILEH__
#include <sys/time.h>
typedef struct {
    struct timeval startTime;
    struct timeval endTime;
} Timer;

// Matrix Structure declaration
typedef struct {
    unsigned int width;
    unsigned int height;
    unsigned int pitch;
    float* elements;
} Matrix;
#define FILTER_SIZE 5
#define TILE_SIZE 12
#define BLOCK_SIZE (TILE_SIZE + FILTER_SIZE - 1)
Matrix allocateMatrix(unsigned height, unsigned width);
void initMatrix(Matrix mat);
Matrix allocateDeviceMatrix(unsigned height, unsigned width);
void copyToDeviceMatrix(Matrix dst, Matrix src);
void copyFromDeviceMatrix(Matrix dst, Matrix src);
void verify(Matrix M, Matrix N, Matrix P);
void freeMatrix(Matrix mat);
void freeDeviceMatrix(Matrix mat);
void startTime(Timer* timer);
void stopTime(Timer* timer);
float elapsedTime(Timer timer);
#define FATAL(msg, ...) \
    do {\
        fprintf(stderr, "[%s:%d] "msg"\n", __FILE__, __LINE__, ##__VA_ARGS__);\
        exit(-1);\
    } while(0)
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
#endif
support.cu:
#include <stdlib.h>
#include <stdio.h>
#include "support.h"
Matrix allocateMatrix(unsigned height, unsigned width)
{
    Matrix mat;
    mat.height = height;
    mat.width = mat.pitch = width;
    mat.elements = (float*)malloc(height*width*sizeof(float));
    if(mat.elements == NULL) FATAL("Unable to allocate host");
    return mat;
}

void initMatrix(Matrix mat)
{
    for (unsigned int i=0; i < mat.height*mat.width; i++) {
        mat.elements[i] = (rand()%100)/100.00;
    }
}

Matrix allocateDeviceMatrix(unsigned height, unsigned width)
{
    Matrix mat;
    cudaError_t cuda_ret;
    mat.height = height;
    mat.width = mat.pitch = width;
    cuda_ret = cudaMalloc((void**)&(mat.elements), height*width*sizeof(float));
    if(cuda_ret != cudaSuccess) FATAL("Unable to allocate device memory");
    return mat;
}

void copyToDeviceMatrix(Matrix dst, Matrix src)
{
    cudaError_t cuda_ret;
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyHostToDevice);
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy to device");
}

void copyFromDeviceMatrix(Matrix dst, Matrix src)
{
    cudaError_t cuda_ret;
    cuda_ret = cudaMemcpy(dst.elements, src.elements, src.height*src.width*sizeof(float), cudaMemcpyDeviceToHost);
    if(cuda_ret != cudaSuccess) FATAL("Unable to copy from device");
}
void verify(Matrix M, Matrix N, Matrix P) {
    const float relativeTolerance = 1e-6;
    for(int row = 0; row < N.height; ++row) {
        for(int col = 0; col < N.width; ++col) {
            float sum = 0.0f;
            for(int i = 0; i < M.height; ++i) {
                for(int j = 0; j < M.width; ++j) {
                    int iN = row - M.height/2 + i;
                    int jN = col - M.width/2 + j;
                    if(iN >= 0 && iN < N.height && jN >= 0 && jN < N.width) {
                        sum += M.elements[i*M.width + j]*N.elements[iN*N.width + jN];
                    }
                }
            }
            float relativeError = (sum - P.elements[row*P.width + col])/sum;
            if (relativeError > relativeTolerance
                || relativeError < -relativeTolerance) {
                printf("TEST FAILED\n\n");
                exit(0);
            }
        }
    }
    printf("TEST PASSED\n\n");
}

void freeMatrix(Matrix mat)
{
    free(mat.elements);
    mat.elements = NULL;
}

void freeDeviceMatrix(Matrix mat)
{
    cudaFree(mat.elements);
    mat.elements = NULL;
}

void startTime(Timer* timer) {
    gettimeofday(&(timer->startTime), NULL);
}

void stopTime(Timer* timer) {
    gettimeofday(&(timer->endTime), NULL);
}

float elapsedTime(Timer timer) {
    return ((float) ((timer.endTime.tv_sec - timer.startTime.tv_sec) \
        + (timer.endTime.tv_usec - timer.startTime.tv_usec)/1.0e6));
}
If your code is not producing correct results, posting incomplete, non-compilable code doesn't help anyone. The error will be in the code you have chosen not to post. Your code contains no API error checking. Are you sure no runtime errors are being generated? What happens if you run the program with cuda-memcheck? – talonmies
I edited the original post to include that information. – GiH