CUDA reduction: atomic operations don't affect performance?

I am testing the effect of inserting atomic add operations into an optimized array-reduction kernel in order to measure their performance impact, and I cannot make sense of the results. I tested five different kernels:
0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu
1 - optimized reduction kernel as described in samples/6_Advanced/reduction/doc/reduction.pdf
2 - kernel 1 with atomic warp-synchronous reduction
3 - kernel 2 with completely atomic reduction within all shared memory
4 - kernel 3 with completely atomic reduction
Averaging the reduction times over a device array with a sufficiently large number of sample elements, I get:
0 - 0.00103s
1 - 0.00103s
2 - 0.00103s
3 - 0.00103s
4 - 0.00117s
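(Each figure is an average over many repeated kernel launches, timed with CUDA events in the style of the sample's reduction.cpp. The sketch below only illustrates that kind of measurement; the iteration count, the fixed block size of 512 and the function name are assumptions for illustration, not the actual benchmark code.)
// Rough sketch of cudaEvent-based timing for one kernel configuration.
// NITER, the hard-coded block size and the function name are assumptions.
float averageReductionTime(int *d_idata, int *d_odata, unsigned int n, int blocks) {
    const int threads = 512;                  // must match the blockSize template argument
    const int NITER = 100;                    // assumed number of repetitions
    size_t smemSize = threads * sizeof(int);  // one int of shared memory per thread
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    for (int iter = 0; iter < NITER; ++iter)
        reduce1<512><<<blocks, threads, smemSize>>>(d_idata, d_odata, n);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);   // total milliseconds for NITER launches
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms / NITER / 1000.0f;              // average seconds per reduction
}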
Why do the atomic operations seem to have no effect at all on kernels 2 and 3, and only a small effect on kernel 4?
The full code is below. The relevant kernels are:
/////////////////
// warp reduce //
/////////////////
/* warp-synchronous reduction through a volatile pointer
 * so the compiler does not reorder or cache the
 * non-atomic shared memory operations */
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
////////////////////////
// atomic warp reduce //
////////////////////////
/* warp-synchronous reduction using atomic operations
* to serialize computation */
template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
if (blockSize >= 64) atomicAdd(&sdata[tid], sdata[tid + 32]);
if (blockSize >= 32) atomicAdd(&sdata[tid], sdata[tid + 16]);
if (blockSize >= 16) atomicAdd(&sdata[tid], sdata[tid + 8]);
if (blockSize >= 8) atomicAdd(&sdata[tid], sdata[tid + 4]);
if (blockSize >= 4) atomicAdd(&sdata[tid], sdata[tid + 2]);
if (blockSize >= 2) atomicAdd(&sdata[tid], sdata[tid + 1]);
}
////////////////////////
// reduction kernel 0 //
////////////////////////
/* fastest reduction algorithm provided by
* cuda/samples/6_Advanced/reduction/reduction_kernel.cu */
template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
int sum = 0;
// reduce multiple elements per thread
while (i < n) {
sum += g_idata[i];
// check bounds
if (nIsPow2 || i + blockSize < n)
sum += g_idata[i + blockSize];
i += gridSize;
}
// local sum -> shared memory
sdata[tid] = sum;
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] = sum = sum + sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] = sum = sum + sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] = sum = sum + sdata[tid + 64];
__syncthreads();
}
if (tid < 32) {
// warp-synchronous reduction
// volatile stores won't be reordered by the compiler
volatile int *smem = sdata;
if (blockSize >= 64)
smem[tid] = sum = sum + smem[tid + 32];
if (blockSize >= 32)
smem[tid] = sum = sum + smem[tid + 16];
if (blockSize >= 16)
smem[tid] = sum = sum + smem[tid + 8];
if (blockSize >= 8)
smem[tid] = sum = sum + smem[tid + 4];
if (blockSize >= 4)
smem[tid] = sum = sum + smem[tid + 2];
if (blockSize >= 2)
smem[tid] = sum = sum + smem[tid + 1];
}
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 1 //
/////////////////////////
/* fastest reduction algorithm described in
* cuda/samples/6_Advanced/reduction/doc/reduction.pdf */
template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 2 //
/////////////////////////
/* reduction kernel 1 executed
* with atomic warp-synchronous addition */
template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 3 //
/////////////////////////
/* reduction kernel 2 executed
 * with atomic addition at every
 * shared memory reduction step */
template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
sdata[tid] += g_idata[i] + g_idata[i+blockSize];
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
atomicAdd(&sdata[tid], sdata[tid + 256]);
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
atomicAdd(&sdata[tid], sdata[tid + 128]);
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
atomicAdd(&sdata[tid], sdata[tid + 64]);
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
/////////////////////////
// reduction kernel 4 //
/////////////////////////
/* reduction kernel 3 with the initial
 * global -> shared accumulation also
 * done with atomic addition */
template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
extern __shared__ int sdata[];
// first level of reduction (global -> shared)
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
sdata[tid] = 0;
// reduce multiple elements per thread
while (i < n) {
atomicAdd(&sdata[tid], (g_idata[i] + g_idata[i+blockSize]));
i += gridSize;
}
__syncthreads();
// reduce in shared memory
if (blockSize >= 512) {
if (tid < 256)
atomicAdd(&sdata[tid], sdata[tid + 256]);
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128)
atomicAdd(&sdata[tid], sdata[tid + 128]);
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64)
atomicAdd(&sdata[tid], sdata[tid + 64]);
__syncthreads();
}
if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
// write result for block to global memory
if (tid == 0)
g_odata[blockIdx.x] = sdata[0];
}
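For reference, blockSize is a template parameter, so it must be resolved at compile time; the host side therefore dispatches on the runtime block size, as the CUDA sample does. The sketch below shows one way to do that for reduce1; the helper name, the set of cases covered and the grid/block values are assumptions, not the actual host code.
// Host-side dispatch of the compile-time blockSize template parameter,
// modelled on the switch used in the CUDA reduction sample.
void launchReduce1(int blocks, int threads, int *d_idata, int *d_odata, unsigned int n) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);
    size_t smemSize = threads * sizeof(int);  // one int of shared memory per thread
    switch (threads) {
    case 512: reduce1<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case 256: reduce1<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case 128: reduce1<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case  64: reduce1< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case  32: reduce1< 32><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    }
}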
The obvious first question is: are you sure the time measurement is correct? The second is: what device are you running these tests on? – talonmies
I believe the time measurement is correct. Most of the measurement code comes from the sample's reduction.cpp, and it behaves as expected when I throw very inefficient algorithms or huge arrays at it. The device is a Quadro 4000. – user1743798
There is not much more to it than that. – user1743798