2012-11-10 66 views
2

我是 CUDA 新手。我試圖在內核中把一個 float 數組的所有元素累加起來,但最終的結果是錯誤的。我知道這個累加需要以原子方式進行,但另一方面,atomicAdd 似乎僅支持整數……有什麼想法嗎? (標籤:cuda add float array)

/*
 * Accumulates the elements of my_array into *result_sum, one element per thread.
 *
 * Launch layout: 1D grid of 1D blocks. The thread with global index tid reads
 * my_array[tid]; the kernel has no length parameter, so the total number of
 * launched threads must not exceed the number of elements in my_array.
 *
 * result_sum must point to a single float that the caller has initialised
 * (e.g. to 0) before the launch.
 *
 * Requires compute capability 2.0+ for the built-in atomicAdd(float*, float).
 */
__global__ void add_element(float *my_array, float *result_sum){ 

    int tid = blockIdx.x * blockDim.x + threadIdx.x; 
    // All threads update the same address. A plain `+=` is an unsynchronised
    // read-modify-write, so concurrent updates overwrite each other and the
    // final sum is wrong; the atomic makes each addition indivisible.
    atomicAdd(result_sum, my_array[tid]); 
} 

// Reports a failed CUDA runtime call; returns true when `status` is cudaSuccess.
static bool cuda_ok(cudaError_t status, const char *what){
    if (status == cudaSuccess) {
        return true;
    }
    printf("CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Fills a 10-element host array with i/2, sums it on the device with the
 * add_element kernel (one thread per element) and prints the elements
 * followed by the sum.
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
 */
int main(int argc, char** argv){ 

    const int N = 10;
    float my_array[N]; 
    float result_sum = 0; 
    // NULL-initialised so the cudaFree calls at the end are safe even when an
    // allocation failed or was never reached.
    float *device_array = NULL, *device_sum = NULL; 

    // fill the array 
    for (int i=0; i<N; i++){ 
     my_array[i] = (float)i/2; 
    } 

    // && short-circuits, so the first failing step stops the chain.
    bool ok =
        cuda_ok(cudaMalloc((void**)&device_array, N*sizeof(float)), "cudaMalloc(device_array)") &&
        cuda_ok(cudaMalloc((void**)&device_sum, sizeof(float)), "cudaMalloc(device_sum)") &&
        cuda_ok(cudaMemcpy(device_array, my_array, N*sizeof(float), cudaMemcpyHostToDevice),
                "cudaMemcpy(device_array)") &&
        // the accumulator must start from the host's initial value (0)
        cuda_ok(cudaMemcpy(device_sum, &result_sum, sizeof(float), cudaMemcpyHostToDevice),
                "cudaMemcpy(device_sum)");

    if (ok){
        // one block of N threads: exactly one thread per array element
        add_element<<<1,N>>>(device_array, device_sum); 

        // A launch does not return a status; launch errors are reported by
        // cudaGetLastError(). The copy back is checked as well.
        ok = cuda_ok(cudaGetLastError(), "add_element launch") &&
             cuda_ok(cudaMemcpy(&result_sum, device_sum, sizeof(float), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(result_sum)");
    }

    if (ok){
        for(int i=0; i<N; i++){ 
         printf(" %f \n", my_array[i]); 
        } 
        printf("+\n----------\n %f\n", result_sum); 
    }

    cudaFree(device_array); 
    cudaFree(device_sum); 

    return ok ? 0 : 1; 
} 

回答

1

你也可以把 atomicAdd 用於 float 和 double,如下:

// Software fallback for atomicAdd on float, built on atomicCAS.
//
// Only compiled for devices of compute capability < 2.0: from 2.0 onwards the
// toolkit already provides atomicAdd(float*, float), and defining it again
// would be a redefinition. A single definition is kept; a second stub with
// the same signature that just returns 0 would both conflict with this one
// and silently drop the addition.
//
// Atomically performs *address += val and returns the value stored at
// `address` immediately before the addition.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 200
__device__ __forceinline__ float atomicAdd(float *address, float val) 
{ 
    // Work on the raw 32-bit pattern: atomicCAS has no float overload, and
    // comparing bit patterns (not float values) lets the loop terminate even
    // when the stored value is NaN (NaN != NaN as floats).
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
        old = ret; 
        newint = __float_as_int(__int_as_float(old) + val); 
        // atomicCAS returns what was actually in memory; if another thread
        // changed it since `old` was read, retry with the fresh value.
    } while ((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 
#endif

或者,找到文件「derived_atomic_functions.h」,並將它作爲頭文件加入到您的項目中。

+0

謝謝,我做到了。 – Reza