我是一名CUDA新手,第一次編寫CUDA內核。我有以下內核,它實現了卷積(convolution,非常樸素的寫法),並用一個虛擬循環對全局內存中的同一元素重複執行相同的計算,總共1000次(內核內部循環100次,內核被調用10次,見下文)。問題是,運算之後結果矩陣中的某些單元格是錯誤的:從某個偏移量開始,值不再是人們所期望的1000的倍數。我的內核如下(問題標題:CUDA atomicAdd()產生錯誤結果):
// Naive 2D convolution kernel.
//
// Launch layout: the target index depends only on blockIdx, so every block
// owns exactly one output cell (row blockIdx.x, column blockIdx.y of a
// gridDim.y-wide output). Each thread (threadIdx.x, threadIdx.y) contributes
// the product of one input sample and one kernel tap to that cell, so all
// threads of a block accumulate into the same address via atomicAdd.
//
// input  : row-major input, indexed with a row width of
//          blockDim.y + gridDim.y - 1 elements.
// kernel : row-major filter taps, blockDim.y elements per row.
// target : row-major output, gridDim.y elements per row; accumulated into,
//          not overwritten, so the caller is expected to zero it first.
//
// The accumulation is deliberately repeated 100 times per launch (dummy loop).
__global__ void conv(float *input, float *kernel, float *target)
{
    // Row width of the input implied by the launch geometry.
    const unsigned int inputWidth = blockDim.y + gridDim.y - 1;

    // Output cell owned by this block.
    const unsigned int outIdx = gridDim.y * blockIdx.x + blockIdx.y;

    // Input sample and filter tap handled by this thread.
    const unsigned int inIdx = (blockIdx.x + threadIdx.x) * inputWidth
                             + (blockIdx.y + threadIdx.y);
    const unsigned int tapIdx = threadIdx.x * blockDim.y + threadIdx.y;

    // Operands are re-read on every pass, exactly one atomicAdd per pass.
    long remaining = 100;
    while (remaining-- > 0)
    {
        atomicAdd(target + outIdx, input[inIdx] * kernel[tapIdx]);
    }
}
內核的調用代碼如下:
// Host-side driver: builds a 32x32 image holding 0..1023 and a 5x5 all-ones
// filter, runs the conv kernel 10 times on a 28x28 grid of 5x5 blocks
// (each launch accumulates 100 times, so every output cell receives its
// window sum 1000 times), then copies the 28x28 result back and prints res[0].
//
// NOTE on precision: the result is accumulated in float, which represents
// integers exactly only up to 2^24 = 16,777,216. With this input the
// accumulated totals range from 1,650,000 (res[0]) to 23,925,000 (last cell),
// so cells whose running total passes 2^24 are rounded on each add and will
// not be exact multiples of 1000. That is floating-point rounding, not a
// failure of atomicAdd.
float image[1024] = {0.0};
float kernel[] =
{
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
    1.0f, 1.0f, 1.0f, 1.0f, 1.0f
};
float res[784]={0};
for (int i = 0; i < 1024; i++)
{
    image[i]=(float)i;
} // Got 32x32 matrix
cudaError_t cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
    exit (-1);
}
float *dev_image = 0;
float *dev_kernel = 0;
float *dev_res = 0;
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_image, sizeof(image));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_kernel, sizeof(kernel));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}
cudaStatus = cudaMalloc((void**)&dev_res, sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    exit(-10);
}
// Upload the inputs and zero the accumulator; every call is checked so that a
// failed transfer cannot silently leave stale data on the device.
cudaStatus = cudaMemcpy(dev_image, image, sizeof(image), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy (image, host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
    exit(-10);
}
cudaStatus = cudaMemcpy(dev_kernel, kernel, sizeof(kernel), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy (kernel, host to device) failed: %s\n", cudaGetErrorString(cudaStatus));
    exit(-10);
}
cudaStatus = cudaMemset(dev_res,0,sizeof(res));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemset failed: %s\n", cudaGetErrorString(cudaStatus));
    exit(-10);
}
// Convloving 32x32 matrix with 5x5 kernel, getting 28x28 matrix as a result
dim3 blocks(28,28,1);
dim3 threads(5,5,1);
for (int itr = 0; itr<10; itr++)
{
    conv<<<blocks, threads>>>(dev_image,dev_kernel, dev_res);
    // A kernel launch returns no status itself; launch-configuration errors
    // only become visible through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "conv launch %d failed: %s\n", itr, cudaGetErrorString(cudaStatus));
        exit(-10);
    }
}
// Wait once for all launches so that faults raised while the kernels execute
// are reported here rather than being attributed to the copy below.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "conv execution failed: %s\n", cudaGetErrorString(cudaStatus));
    exit(-10);
}
cudaStatus = cudaMemcpy(res, dev_res, sizeof(res), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy (result, device to host) failed: %s\n", cudaGetErrorString(cudaStatus));
    exit(-10);
}
printf("res[0]=%f\n",res[0]);
cudaFree(dev_kernel);
cudaFree(dev_image);
cudaFree(dev_res);
exit (0);
看來我已經處理了併發問題,所以它不應該是根本原因。非常感謝任何幫助。
您確定您的硬件支持原子操作嗎? –
當循環迭代100次時,結果爲什麼會是1000的倍數? – Joe
Joe:我運行了內核10次,1000就是這樣來的。 –