2017-02-10 161 views
1

我想計算CUDA中數組的所有元素的總和。我想出了這個代碼。它編譯沒有任何錯誤。但結果總是爲零。我從cudaMemcpyFromSymbol得到了無效的設備符號。我不能使用任何類似Thrust或Cublas的庫。無效的設備符號cudaMemcpyFromSymbol CUDA

#define TRIALS_PER_THREAD 4096 
#define NUM_BLOCKS 256 
#define NUM_THREADS 256 
double *dev; 
__device__ volatile double pi_gpu = 0; 

__global__ void ArraySum(double *array) 

{ 
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x; 
pi_gpu = pi_gpu + array[tid]; 
__syncthreads(); 
} 

int main (int argc, char *argv[]) { 
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double)); 
    double pi_gpu_h; 

ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev); 
cudaDeviceSynchronize(); 
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost); 
if(cudaSuccess != err) 
{ 
    fprintf(stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString(err)); 
    exit(-1); 
} 
return pi_gpu_h; // this is always zero!!! 
} 
+0

如果你花了30秒來正確格式化你的代碼,閱讀發佈非常困難。 – talonmies

回答

-2

您的代碼不是線程安全的。從多個線程寫入全局變量是不安全的。如何減少核可能是這個例子:

//Untested code 
global_void plus_reduce(int *input, int N, int *total){ 
    int tid = threadIdx.x; 
    int i = blockIdx.x*blockDim.x + threadIdx.x; 
    // Each block loads its elements into shared memory 
    _shared_ int x[blocksize]; 
    x[tid] = (i<N) ? input[i] : 0; // last block may pad with 0’s 
    _syncthreads(); 
    // Build summation tree over elements. 
    for(int s=blockDim.x/2; s>0; s=s/2){ 
     if(tid < s) x[tid] += x[tid + s]; 
    _syncthreads(); 
} 
// Thread 0 adds the partial sum to the total sum 
if(tid == 0) 
    atomicAdd(total, x[tid]);     
} 

Source

+0

這是如何回答這個問題的? – talonmies

3

從符號調用拷貝符號的說法是不正確。它應該看起來像這樣:

cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), cudaMemcpyDeviceToHost)