2013-06-02 97 views
1

我是一名CUDA新手,第一次使用CUDA內核。 我有以下內核實現convloution(非常天真),與一個虛擬循環執行相同的元素在全局內存1000次計算(見下文)。問題是,在操作之後,結果矩陣中的某些單元格是錯誤的:從某個偏移量開始,值不是人們所期望的1000的倍數。 我的內核:CUDA atomicAdd()產生錯誤結果

// Naive convolution: one block per output cell, one thread per kernel tap.
// The dummy loop repeats the identical atomic accumulation 100 times so the
// expected result is a multiple of the loop count.
__global__ void conv(float *input, float *kernel, float *target) 
{ 
    // Input row stride is blockDim.y + gridDim.y - 1 (32 for the 28x28/5x5 launch).
    const int   inRow    = blockIdx.x + threadIdx.x; 
    const int   inCol    = blockIdx.y + threadIdx.y; 
    const int   inStride = blockDim.y + gridDim.y - 1; 
    // Loop-invariant: the same product is added on every iteration, so the
    // accumulated value is bit-identical to recomputing it each pass.
    const float contrib  = input[inRow * inStride + inCol] 
                         * kernel[threadIdx.x * blockDim.y + threadIdx.y]; 
    float *dst = target + gridDim.y * blockIdx.x + blockIdx.y; 
    for (long i = 0; i < 100; ++i) 
    { 
        atomicAdd(dst, contrib); 
    } 
}

爲內核調用代碼如下:

float image[1024] = {0.0}; 
float kernel[] = 
{ 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f 
}; 

float res[784]={0}; 

// Fill the 32x32 input matrix with 0..1023.
for (int i = 0; i < 1024; i++) 
{ 
    image[i]=(float)i; 
} // Got 32x32 matrix 

cudaError_t cudaStatus = cudaSetDevice(0); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
    exit (-1); 
} 

float *dev_image = 0; 
float *dev_kernel = 0; 
float *dev_res = 0; 

// Allocate GPU buffers for three vectors (two input, one output) . 
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

// Every runtime call is checked: a sticky error from an earlier call would
// otherwise make every later call fail mysteriously.
cudaStatus = cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 
cudaStatus = cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMemset(dev_res,0,sizeof(res)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemset failed!"); 
    exit(-10); 
} 

    // Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result 
dim3 blocks(28,28,1); 
dim3 threads(5,5,1); 

for (int itr = 0; itr<10; itr++) 
{ 
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res); 
    // Kernel launches do not return an error; fetch launch-config errors here.
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) { 
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
        exit(-10); 
    } 
} 

// Surface asynchronous execution errors before reading results back.
cudaStatus = cudaDeviceSynchronize(); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(cudaStatus)); 
    exit(-10); 
} 

cudaStatus = cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 

printf("res[0]=%f\n",res[0]); 

cudaFree(dev_kernel); 
cudaFree(dev_image); 
cudaFree(dev_res); 

exit (0); 

看來我處理的併發問題,所以它不應該是根本原因。我感謝任何幫助。

+0

您確定您的硬件支持原子操作嗎? –

+0

當循環迭代100次時,結果爲什麼會是1000的倍數? – Joe

+0

Joe:我運行內核10次,這是1000來自的地方。 –

回答

1

你正在做任意算術的float值和期待完美的準確性。

float值可以完美地存儲整數直到某個尾數。一旦我們超過這個價值,那麼浮動操作開始變得不準確。自然,結果中的值趨向於累積到最大的數字(那些趨近res陣列末尾的值)將首先顯示此效果。

讓我們把內核中的循環次數與主代碼中圍繞內核調用的循環次數的乘積稱爲total_loops。對於高達700左右的total_loops值,我會得到「精確」的結果,也就是說,所有結果均可由total_loops整除。之後,隨着你逐漸增加total_loops,錯誤開始蔓延,從res陣列的末尾開始。

您可以切換到double而不是float並且您的結果會有所不同,除了atomicAdd for double版本不方便使用。然而,programming guide展示瞭如何創建任意原子操作,他們給的例子恰好可以實現atomicAdd for double

所以你的代碼的以下修改可以讓你探索這兩個觀念:

  • 如果你想看到用double如何修復這個問題,把USE_FLOAT的定義改爲USE_DOUBLE
  • 相反,如果你想看看減少total_loops如何緩解問題,把LOOPS1的定義從100改爲70
  • 我還要提到,對全部API調用和內核調用做cuda error checking是好的做法(你只覆蓋了一部分API調用,並且沒有檢查內核),但在這種情況下它不是問題所在。

下面的代碼:

#include <stdio.h> 
#define LOOPS1 100 
#define LOOPS2 10 
// set to USE_DOUBLE or USE_FLOAT 
#define USE_FLOAT 

#ifndef USE_DOUBLE 
typedef float mytype; 
#else 
typedef double mytype; 
#endif 

// Software atomicAdd for double, emulated with a compare-and-swap loop
// (CUDA C Programming Guide, "Atomic Functions" example). Hardware provides
// a native atomicAdd(double*, double) from compute capability 6.0 onward, so
// guard this definition to avoid clashing with the builtin on SM60+ builds.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val) 
{ 
    // Reinterpret the double as a 64-bit integer so atomicCAS can operate on it.
    unsigned long long int* address_as_ull = 
           (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do { 
     assumed = old; 
     // Try to swap in (assumed + val); if another thread updated the
     // location in the meantime, atomicCAS returns the new value and we retry.
     old = atomicCAS(address_as_ull, assumed, 
         __double_as_longlong(val + 
           __longlong_as_double(assumed))); 
    } while (assumed != old); 
    // Return the previous value, matching the builtin atomicAdd contract.
    return __longlong_as_double(old); 
} 
#endif

// Naive convolution: one block per output cell, one thread per kernel tap.
// Repeats the identical atomic accumulation LOOPS1 times so each result
// should be an exact multiple of the total loop count.
__global__ void conv(mytype *input, mytype *kernel, mytype *target) 
{ 
    // Input row stride is blockDim.y + gridDim.y - 1 (32 for the 28x28/5x5 launch).
    const int    inRow    = blockIdx.x + threadIdx.x; 
    const int    inCol    = blockIdx.y + threadIdx.y; 
    const int    inStride = blockDim.y + gridDim.y - 1; 
    // Loop-invariant: the same product is added on every iteration, so the
    // accumulated value is bit-identical to recomputing it each pass.
    const mytype contrib  = input[inRow * inStride + inCol] 
                          * kernel[threadIdx.x * blockDim.y + threadIdx.y]; 
    mytype *dst = target + gridDim.y * blockIdx.x + blockIdx.y; 
    for (long i = 0; i < LOOPS1; ++i) 
    { 
        atomicAdd(dst, contrib); 
    } 
}

// Demonstrates float-precision limits when accumulating with atomicAdd:
// each of the 28x28 output cells receives LOOPS1*LOOPS2 copies of the same
// 5x5 weighted sum, so every cell should divide evenly by LOOPS1*LOOPS2.
int main(){ 

mytype image[1024] = {0.0}; 
mytype kernel[] = 
{ 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f 
}; 

mytype res[784]={0}; 

// Fill the 32x32 input matrix with 0..1023.
for (int i = 0; i < 1024; i++) 
{ 
    image[i]=(mytype)i; 
} // Got 32x32 matrix 

cudaError_t cudaStatus = cudaSetDevice(0); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
    exit (-1); 
} 

mytype *dev_image = 0; 
mytype *dev_kernel = 0; 
mytype *dev_res = 0; 

// Allocate GPU buffers for three vectors (two input, one output) . 
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    exit(-10); 
} 

// Every runtime call is checked: a sticky error from an earlier call would
// otherwise make every later call fail mysteriously.
cudaStatus = cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 
cudaStatus = cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 

cudaStatus = cudaMemset(dev_res,0,sizeof(res)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemset failed!"); 
    exit(-10); 
} 

    // Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result 
dim3 blocks(28,28,1); 
dim3 threads(5,5,1); 

for (int itr = 0; itr<LOOPS2; itr++) 
{ 
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res); 
    // Kernel launches do not return an error; fetch launch-config errors here.
    cudaStatus = cudaGetLastError(); 
    if (cudaStatus != cudaSuccess) { 
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
        exit(-10); 
    } 
} 

// Surface asynchronous execution errors before reading results back.
cudaStatus = cudaDeviceSynchronize(); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaDeviceSynchronize failed: %s\n", cudaGetErrorString(cudaStatus)); 
    exit(-10); 
} 

cudaStatus = cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    exit(-10); 
} 

printf("results:\n"); 
// Report the first cell that is not an exact multiple of the total loop
// count — that is where float precision broke down.
for (int i = 0; i< (28*28); i++) 
    if ((((int)res[i])%(LOOPS1*LOOPS2)) != 0) {printf("first error index: %d, value: %f\n", i, res[i]); return 1;} 

cudaFree(dev_kernel); 
cudaFree(dev_image); 
cudaFree(dev_res); 

    return 0; 
} 

請注意,即使你使用double,問題最終會再次出現,如果你積累到足夠大的值。

另請注意,這不是一個真正的CUDA/GPU問題。主機代碼中的float具有類似的限制。

+0

感謝您的詳細解答。我今天晚些時候得到了同樣的結論。 –