我想計算CUDA中數組的所有元素的總和。我想出了這個代碼。它編譯沒有任何錯誤。但結果總是爲零。我從cudaMemcpyFromSymbol
得到了無效的設備符號。我不能使用任何類似Thrust或Cublas的庫。無效的設備符號cudaMemcpyFromSymbol CUDA
#define TRIALS_PER_THREAD 4096
#define NUM_BLOCKS 256
#define NUM_THREADS 256
double *dev;
__device__ volatile double pi_gpu = 0;
__global__ void ArraySum(double *array)
{
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
pi_gpu = pi_gpu + array[tid];
__syncthreads();
}
int main (int argc, char *argv[]) {
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double));
double pi_gpu_h;
ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev);
cudaDeviceSynchronize();
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost);
if(cudaSuccess != err)
{
fprintf(stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString(err));
exit(-1);
}
return pi_gpu_h; // this is always zero!!!
}
如果你花了30秒來正確格式化你的代碼,閱讀發佈非常困難。 – talonmies