在下面的代碼中,如何在不使用 atomicAdd 的情況下計算 sum_array 的值?也就是說,如何在不使用 CUDA 原子操作的情況下進行總和計算?
內核函數:
/**
 * Per-label coordinate accumulation.
 *
 * Each thread handles one pixel (row, col) of a width x height image, reads
 * that pixel's label from pntrs, and adds the pixel's column to
 * sum_array[label].x and its row to sum_array[label].y.
 *
 * Launch layout: 2D grid of 2D blocks with at least `width` threads along x
 * and `height` threads along y; surplus threads exit early.
 *
 * Preconditions (not checked here):
 *   - pntrs has width * height elements, stored row-major.
 *   - every value in pntrs is a valid index into sum_array.
 *   - sum_array is initialised by the caller (e.g. zeroed with cudaMemset);
 *     this kernel only adds to whatever is already stored there.
 *
 * Why atomics: several pixels can carry the same label, so many threads may
 * update the same sum_array element concurrently. A plain `+=` is an
 * unsynchronised read-modify-write and silently loses updates. Avoiding
 * atomics altogether requires a different algorithm (for example sorting the
 * pixels by label and running a segmented reduction), not a different
 * statement inside this kernel.
 *
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param pntrs      per-pixel label, used as an index into sum_array
 * @param sum_array  per-label accumulators: .x sums columns, .y sums rows
 */
__global__ void calculate_sum(int width,
                              int height,
                              const int *pntrs,
                              int2 *sum_array)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is rounded up to whole blocks, so edge threads may fall outside the image.
    if (row >= height || col >= width) return;

    // Widen before multiplying so that large images cannot overflow a 32-bit int.
    const size_t pixel = static_cast<size_t>(row) * static_cast<size_t>(width)
                       + static_cast<size_t>(col);
    const int idx = pntrs[pixel];

    // Atomic read-modify-write: concurrent threads sharing idx must not lose updates.
    atomicAdd(&sum_array[idx].x, col);
    atomicAdd(&sum_array[idx].y, row);
}
啓動內核:
// Thread-block shape: 16 x 16 threads.
dim3 dimBlock(16, 16);
// Grid shape: ceiling division of the image extent by the block extent, so
// that partially filled tiles along the right and bottom edges still get a block.
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
             (height + dimBlock.y - 1) / dimBlock.y);