2013-06-19 84 views
0

CUDA縮減:原子操作不會影響性能?

我正在測試將原子加法操作插入到經過優化的陣列縮減內核中,以測量其對性能的影響,但我無法理解測試結果。我測試了五個不同的內核:

0 - fully optimized reduction kernel as provided in samples/6_Advanced/reduction/reduction_kernel.cu 
1 - optimized reduction kernel as described in samples/6_Advanced/docs/reduction.pdf 
2 - kernel 1 with atomic warp-synchronous reduction 
3 - kernel 2 with completely atomic reduction within all shared memory 
4 - kernel 3 with completely atomic reduction 

以下是平均縮減時間(在裝置上使用足夠大的元素樣本測得):

0 - 0.00103s 
1 - 0.00103s 
2 - 0.00103s 
3 - 0.00103s 
4 - 0.00117s 

爲什麼原子操作對內核2和內核3的性能似乎完全沒有影響,而對內核4也只有很小的影響?

Here是完整的代碼。相關的內核是:

///////////////// 
// warp reduce // 
///////////////// 
/* warp-synchronous reduction using volatile memory 
* to prevent instruction reordering for non-atomic 
* operations */ 

template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, int tid) {
    /* Warp-synchronous tree reduction over the final warp of shared memory.
     * Callers must invoke this with tid < 32 only (one full warp).
     * `volatile` stops the compiler from caching shared-memory values in
     * registers, but it does NOT order lane-to-lane accesses: within a step,
     * lane 0 reads sdata[tid + off] while lane `off` writes the same slot,
     * and on Volta+ (independent thread scheduling) lanes are not guaranteed
     * to run in lockstep at all. The correct pattern (per NVIDIA's
     * "Using CUDA Warp-Level Primitives") is read -> __syncwarp() ->
     * write -> __syncwarp() for each step. Requires CUDA 9+ for __syncwarp();
     * blockSize is a template parameter, so every guard is warp-uniform and
     * the barriers are reached by all 32 participating lanes. */
    int v = sdata[tid];
    if (blockSize >= 64) { v += sdata[tid + 32]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 32) { v += sdata[tid + 16]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >= 16) { v += sdata[tid +  8]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  8) { v += sdata[tid +  4]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  4) { v += sdata[tid +  2]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
    if (blockSize >=  2) { v += sdata[tid +  1]; __syncwarp(); sdata[tid] = v; __syncwarp(); }
}

    //////////////////////// 
// atomic warp reduce // 
//////////////////////// 
/* warp-synchronous reduction using atomic operations 
* to serialize computation */ 

template <unsigned int blockSize>
__device__ void atomicWarpReduce(int *sdata, int tid) {
    /* Warp-level tree reduction using atomicAdd for the accumulating write.
     * Callers must invoke this with tid < 32 only (one full warp).
     * The atomic alone is NOT sufficient: the operand sdata[tid + off] is
     * read with a plain (non-atomic) load, and for off < 32 that slot is
     * concurrently updated by the neighbouring lane's atomicAdd in the same
     * step. Fix: complete all operand loads before any lane performs its
     * add, using __syncwarp() (CUDA 9+) — every guard is compile-time
     * uniform, so all 32 lanes reach each barrier. */
    if (blockSize >= 64) { int v = sdata[tid + 32]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
    if (blockSize >= 32) { int v = sdata[tid + 16]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
    if (blockSize >= 16) { int v = sdata[tid +  8]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
    if (blockSize >=  8) { int v = sdata[tid +  4]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
    if (blockSize >=  4) { int v = sdata[tid +  2]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
    if (blockSize >=  2) { int v = sdata[tid +  1]; __syncwarp(); atomicAdd(&sdata[tid], v); __syncwarp(); }
}

    //////////////////////// 
// reduction kernel 0 // 
//////////////////////// 
/* fastest reduction algorithm provided by 
* cuda/samples/6_Advanced/reduction/reduction_kernel.cu */ 

template <unsigned int blockSize, bool nIsPow2>
__global__ void reduce0(int *g_idata, int *g_odata, unsigned int n) {
    // Block-level sum reduction (CUDA SDK reduction_kernel.cu, fastest variant).
    // Each thread accumulates a grid-strided partial sum in a register, the
    // block then tree-reduces in shared memory, and thread 0 writes one
    // partial sum per block to g_odata[blockIdx.x].
    // Requires: blockDim.x == blockSize, blockSize * sizeof(int) bytes of
    // dynamic shared memory; nIsPow2 == true only when n is a power of two
    // (skips the bounds check on the second load).
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    int sum = 0;
    // reduce multiple elements per thread
    while (i < n) {
    sum += g_idata[i];
    // check bounds
    if (nIsPow2 || i + blockSize < n)
     sum += g_idata[i + blockSize];
    i += gridSize;
    }
    // local sum -> shared memory
    sdata[tid] = sum;
    __syncthreads();
    // reduce in shared memory; guards are on the template parameter, so all
    // threads take the same path and each __syncthreads() is block-uniform
    if (blockSize >= 512) {
    if (tid < 256)
     sdata[tid] = sum = sum + sdata[tid + 256];
    __syncthreads();
    }
    if (blockSize >= 256) {
    if (tid < 128)
     sdata[tid] = sum = sum + sdata[tid + 128];
    __syncthreads();
    }
    if (blockSize >= 128) {
    if (tid < 64)
     sdata[tid] = sum = sum + sdata[tid + 64];
    __syncthreads();
    }
    if (tid < 32) {
    // warp-synchronous reduction
    // volatile memory stores won't be reordered by compiler
    // NOTE(review): this relies on implicit warp lockstep, which is not
    // guaranteed on Volta+ (independent thread scheduling) — confirm target
    // architecture or add __syncwarp() between steps.
    volatile int *smem = sdata;
    if (blockSize >= 64)
     smem[tid] = sum = sum + smem[tid + 32];
    if (blockSize >= 32)
     smem[tid] = sum = sum + smem[tid + 16];
    if (blockSize >= 16)
     smem[tid] = sum = sum + smem[tid + 8];
    if (blockSize >= 8)
     smem[tid] = sum = sum + smem[tid + 4];
    if (blockSize >= 4)
     smem[tid] = sum = sum + smem[tid + 2];
    if (blockSize >= 2)
     smem[tid] = sum = sum + smem[tid + 1];
    }
    // write result for block to global memory
    if (tid == 0)
    g_odata[blockIdx.x] = sdata[0];
}

    ///////////////////////// 
// reduction kernel 1 // 
///////////////////////// 
/* fastest reduction algorithm described in 
* cuda/samples/6_Advanced/reduction/doc/reduction.pdf */ 

template <unsigned int blockSize>
__global__ void reduce1(int *g_idata, int *g_odata, unsigned int n) {
    /* Block-level sum reduction as described in the SDK reduction.pdf.
     * Grid-strided accumulation into shared memory, shared-memory tree
     * reduction, then thread 0 writes one partial sum per block.
     * Requires: blockDim.x == blockSize and blockSize * sizeof(int) bytes
     * of dynamic shared memory.
     * Fix: the original read g_idata[i + blockSize] unconditionally, which
     * is out of bounds whenever i + blockSize >= n (reduce0 guards this via
     * nIsPow2 || i + blockSize < n). The access is now bounds-checked;
     * results are unchanged for inputs where the read was in range. */
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        if (i + blockSize < n)          // bounds check for the second element
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory; blockSize is a template parameter, so every
    // guard is block-uniform and each __syncthreads() is reached by all threads
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    // final warp reduced without block-wide barriers
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

    ///////////////////////// 
// reduction kernel 2 // 
///////////////////////// 
/* reduction kernel 1 executed 
* with atomic warp-synchronous addition */ 

template <unsigned int blockSize>
__global__ void reduce2(int *g_idata, int *g_odata, unsigned int n) {
    /* Same as reduce1, but the final warp is reduced with
     * atomicWarpReduce (atomicAdd-based warp-synchronous reduction).
     * Requires: blockDim.x == blockSize and blockSize * sizeof(int) bytes
     * of dynamic shared memory.
     * Fix: the original read g_idata[i + blockSize] unconditionally, which
     * is out of bounds whenever i + blockSize >= n; the access is now
     * bounds-checked (results unchanged for inputs where it was in range). */
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        if (i + blockSize < n)          // bounds check for the second element
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory; guards are compile-time uniform, so each
    // __syncthreads() is reached by the whole block
    if (blockSize >= 512) {
        if (tid < 256)
            sdata[tid] += sdata[tid + 256];
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            sdata[tid] += sdata[tid + 128];
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            sdata[tid] += sdata[tid + 64];
        __syncthreads();
    }
    // final warp: atomic warp-synchronous reduction
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

    ///////////////////////// 
// reduction kernel 3 // 
///////////////////////// 

template <unsigned int blockSize>
__global__ void reduce3(int *g_idata, int *g_odata, unsigned int n) {
    /* reduce2 with the shared-memory tree steps also done via atomicAdd
     * (for measuring the cost of atomics; the atomics are not needed for
     * correctness here since reader/writer index ranges do not overlap in
     * the >= 64-wide steps and __syncthreads() separates the steps).
     * Requires: blockDim.x == blockSize and blockSize * sizeof(int) bytes
     * of dynamic shared memory.
     * Fix: the original read g_idata[i + blockSize] unconditionally, which
     * is out of bounds whenever i + blockSize >= n; the access is now
     * bounds-checked (results unchanged for inputs where it was in range). */
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread
    while (i < n) {
        sdata[tid] += g_idata[i];
        if (i + blockSize < n)          // bounds check for the second element
            sdata[tid] += g_idata[i + blockSize];
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory with atomic accumulation
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    // final warp: atomic warp-synchronous reduction
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

    ///////////////////////// 
// reduction kernel 4 // 
///////////////////////// 

template <unsigned int blockSize>
__global__ void reduce4(int *g_idata, int *g_odata, unsigned int n) {
    /* reduce3 with the first-level (global -> shared) accumulation also
     * done via atomicAdd. Note each thread only updates its own sdata[tid]
     * in that loop, so the atomic there is pure overhead — which is the
     * point of this benchmark variant.
     * Requires: blockDim.x == blockSize and blockSize * sizeof(int) bytes
     * of dynamic shared memory.
     * Fix: the original read g_idata[i + blockSize] unconditionally, which
     * is out of bounds whenever i + blockSize >= n; the access is now
     * bounds-checked (results unchanged for inputs where it was in range). */
    extern __shared__ int sdata[];
    // first level of reduction (global -> shared)
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;
    sdata[tid] = 0;
    // reduce multiple elements per thread, accumulating atomically
    while (i < n) {
        int val = g_idata[i];
        if (i + blockSize < n)          // bounds check for the second element
            val += g_idata[i + blockSize];
        atomicAdd(&sdata[tid], val);
        i += gridSize;
    }
    __syncthreads();
    // reduce in shared memory with atomic accumulation
    if (blockSize >= 512) {
        if (tid < 256)
            atomicAdd(&sdata[tid], sdata[tid + 256]);
        __syncthreads();
    }
    if (blockSize >= 256) {
        if (tid < 128)
            atomicAdd(&sdata[tid], sdata[tid + 128]);
        __syncthreads();
    }
    if (blockSize >= 128) {
        if (tid < 64)
            atomicAdd(&sdata[tid], sdata[tid + 64]);
        __syncthreads();
    }
    // final warp: atomic warp-synchronous reduction
    if (tid < 32) atomicWarpReduce<blockSize>(sdata, tid);
    // write result for block to global memory
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
+1

明顯的第一個問題是你確定時間測量是正確的?第二個是你正在運行這些測試的設備? – talonmies

+0

我相信時間測量是正確的。大多數情況下,測量代碼來自樣本reduction.cpp,並且在我將其投入非常低效的算法或巨大數組時,它的行爲是邏輯上的。該設備是一個Quadro 4000. – user1743798

+0

這裏沒有太多內容。 – user1743798

回答

2

在你的代碼中,你沒有對內核調用使用適當的CUDA error checking。由於各個時序完全相同,我強烈懷疑你的內核實際上並沒有啓動。我用自己的CUDA縮減設置驗證過:當縮減元素的數量爲1<<24時,可以得到相同的計時,而上述CUDA錯誤檢查會返回「無效配置參數」(invalid configuration argument)。

我有機會提及您的atomicWarpReduce__device__函數實際上是不正確的,因爲它缺乏適當的同步(另請參閱線程Removing __syncthreads() in CUDA warp-level reduction)。正確的版本是

template <class T> 
__device__ void atomicWarpReduce(T *sdata, int tid) { 
    // Warp-level tree reduction: each step atomically folds sdata[tid + off]
    // into sdata[tid], with a barrier between steps so the operand load of
    // one step cannot race with a neighbouring lane's add.
    // NOTE(review): as used here (called under `if (tid < 32)`), these
    // __syncthreads() execute in divergent control flow, which the CUDA
    // programming model treats as undefined behaviour — it may happen to
    // work on Fermi-class parts, but confirm, or use a warp-scope barrier
    // on newer architectures.
    // NOTE(review): atomicAdd also requires T to be a hardware-supported
    // atomic type (int, unsigned int, float, ...).
    atomicAdd(&sdata[tid], sdata[tid + 32]); __syncthreads(); 
    atomicAdd(&sdata[tid], sdata[tid + 16]); __syncthreads(); 
    atomicAdd(&sdata[tid], sdata[tid + 8]); __syncthreads(); 
    atomicAdd(&sdata[tid], sdata[tid + 4]); __syncthreads(); 
    atomicAdd(&sdata[tid], sdata[tid + 2]); __syncthreads(); 
    atomicAdd(&sdata[tid], sdata[tid + 1]); __syncthreads(); 
} 

當然,在這種情況下你並不需要原子操作——我明白這只是爲了加深理解。但atomics本身並不強制同步,只是讓對共享內存數組sdata的連續訪問串行化,以避免競爭條件(在這裏本來就不存在)。你可以通過比較反彙編代碼來確認這一點。

您的版本

Function : _Z18reduce4_atomicWarpIiEvPT_S1_j 
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)" 
    /*0000*/   MOV R1, c[0x1][0x100];       /* 0x2800440400005de4 */ 
    /*0008*/   S2R R0, SR_CTAID.X;        /* 0x2c00000094001c04 */ 
    /*0010*/   SHL R3, R0, 0x1;        /* 0x6000c0000400dc03 */ 
    /*0018*/   S2R R2, SR_TID.X;        /* 0x2c00000084009c04 */ 
    /*0020*/   IMAD R3, R3, c[0x0][0x8], R2;     /* 0x200440002030dca3 */ 
    /*0028*/   IADD R4, R3, c[0x0][0x8];      /* 0x4800400020311c03 */ 
    /*0030*/   ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */ 
    /*0038*/   ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */ 
    /*0040*/  @P0 ISCADD R3, R3, c[0x0][0x20], 0x2;    /* 0x400040008030c043 */ 
    /*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;    /* 0x4000400080412443 */ 
    /*0050*/ @!P0 MOV R5, RZ;          /* 0x28000000fc0161e4 */ 
    /*0058*/ @!P1 LD R4, [R4];         /* 0x8000000000412485 */ 
    /*0060*/  @P0 LD R5, [R3];         /* 0x8000000000314085 */ 
    /*0068*/   SHL R3, R2, 0x2;        /* 0x6000c0000820dc03 */ 
    /*0070*/   NOP;           /* 0x4000000000001de4 */ 
    /*0078*/ @!P1 IADD R5, R4, R5;        /* 0x4800000014416403 */ 
    /*0080*/   MOV R4, c[0x0][0x8];       /* 0x2800400020011de4 */ 
    /*0088*/   STS [R3], R5;         /* 0xc900000000315c85 */ 
    /*0090*/   BAR.RED.POPC RZ, RZ, RZ, PT;     /* 0x50ee0000ffffdc04 */ 
    /*0098*/   MOV R6, c[0x0][0x8];       /* 0x2800400020019de4 */ 
    /*00a0*/   ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;   /* 0x188ec0010861dc03 */ 
    /*00a8*/  @P0 BRA 0x118;          /* 0x40000001a00001e7 */ 
    /*00b0*/   NOP;           /* 0x4000000000001de4 */ 
    /*00b8*/   NOP;           /* 0x4000000000001de4 */ 
    /*00c0*/   MOV R6, R4;          /* 0x2800000010019de4 */ 
    /*00c8*/   SHR.U32 R4, R4, 0x1;       /* 0x5800c00004411c03 */ 
    /*00d0*/   ISETP.GE.U32.AND P0, PT, R2, R4, PT;   /* 0x1b0e00001021dc03 */ 
    /*00d8*/ @!P0 IADD R7, R4, R2;        /* 0x480000000841e003 */ 
    /*00e0*/ @!P0 SHL R7, R7, 0x2;        /* 0x6000c0000871e003 */ 
    /*00e8*/ @!P0 LDS R7, [R7];         /* 0xc10000000071e085 */ 
    /*00f0*/ @!P0 IADD R5, R7, R5;        /* 0x4800000014716003 */ 
    /*00f8*/ @!P0 STS [R3], R5;         /* 0xc900000000316085 */ 
    /*0100*/   BAR.RED.POPC RZ, RZ, RZ, PT;     /* 0x50ee0000ffffdc04 */ 
    /*0108*/   ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;   /* 0x1a0ec0020c61dc03 */ 
    /*0110*/  @P0 BRA 0xc0;          /* 0x4003fffea00001e7 */ 
    /*0118*/   ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;   /* 0x1a0ec0007c21dc03 */ 
    /*0120*/   SSY 0x2a8;          /* 0x6000000600000007 */ 
    /*0128*/  @P0 BRA 0x2a0;          /* 0x40000005c00001e7 */ 
    /*0130*/   LDS R4, [R3+0x80];        R4 = sdata[tid + 32] 
    /*0138*/   SSY 0x168;          
    /*0140*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0148*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0150*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0158*/ @!P0 BRA 0x140;          /* 0x4003ffff800021e7 */ 
    /*0160*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0168*/   LDS R4, [R3+0x40];        R4 = sdata[tid + 16] 
    /*0170*/   SSY 0x1a8;          

    /*0178*/   NOP;           /* 0x4000000000001de4 */ 

    /*0180*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0188*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0190*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0198*/ @!P0 BRA 0x180;          /* 0x4003ffff800021e7 */ 
    /*01a0*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*01a8*/   LDS R4, [R3+0x20];        R4 = sdata[tid + 8] 
    /*01b0*/   SSY 0x1e8;          

    /*01b8*/   NOP;           /* 0x4000000000001de4 */ 

    /*01c0*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*01c8*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*01d0*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*01d8*/ @!P0 BRA 0x1c0;          /* 0x4003ffff800021e7 */ 
    /*01e0*/   NOP.S;           /* 0x4000000000001df4 */ 

    /*01e8*/   LDS R6, [R3+0x10];        /* 0xc100000040319c85 */ 
    /*01f0*/   LDS R5, [R3+0x8];        /* 0xc100000020315c85 */ 
    /*01f8*/   LDS R4, [R3+0x4];        /* 0xc100000010311c85 */ 
    /*0200*/   SSY 0x230;          /* 0x60000000a0000007 */ 
    /*0208*/   LDSLK P0, R7, [R3];        /* 0xc40000000031dc85 */ 
    /*0210*/  @P0 IADD R7, R7, R6;        /* 0x480000001871c003 */ 
    /*0218*/  @P0 STSUL [R3], R7;         /* 0xcc0000000031c085 */ 
    /*0220*/ @!P0 BRA 0x208;          /* 0x4003ffff800021e7 */ 
    /*0228*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0230*/   SSY 0x268;          /* 0x60000000c0000007 */ 
    /*0238*/   NOP;           /* 0x4000000000001de4 */ 
    /*0240*/   LDSLK P0, R6, [R3];        /* 0xc400000000319c85 */ 
    /*0248*/  @P0 IADD R6, R6, R5;        /* 0x4800000014618003 */ 
    /*0250*/  @P0 STSUL [R3], R6;         /* 0xcc00000000318085 */ 
    /*0258*/ @!P0 BRA 0x240;          /* 0x4003ffff800021e7 */ 
    /*0260*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0268*/   NOP;           /* 0x4000000000001de4 */ 
    /*0270*/   NOP;           /* 0x4000000000001de4 */ 
    /*0278*/   NOP;           /* 0x4000000000001de4 */ 
    /*0280*/   LDSLK P0, R5, [R3];        /* 0xc400000000315c85 */ 
    /*0288*/  @P0 IADD R5, R5, R4;        /* 0x4800000010514003 */ 
    /*0290*/  @P0 STSUL [R3], R5;         /* 0xcc00000000314085 */ 
    /*0298*/ @!P0 BRA 0x280;          /* 0x4003ffff800021e7 */ 
    /*02a0*/   ISETP.NE.AND.S P0, PT, R2, RZ, PT;    /* 0x1a8e0000fc21dc33 */ 
    /*02a8*/  @P0 BRA.U 0x2c8;         /* 0x40000000600081e7 */ 
    /*02b0*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;    /* 0x4000400090002043 */ 
    /*02b8*/ @!P0 LDS R2, [RZ];         /* 0xc100000003f0a085 */ 
    /*02c0*/ @!P0 ST [R0], R2;         /* 0x900000000000a085 */ 
    /*02c8*/   EXIT;           /* 0x8000000000001de7 */ 

正確的版本

Function : _Z18reduce4_atomicWarpIiEvPT_S1_j 
.headerflags @"EF_CUDA_SM21 EF_CUDA_PTX_SM(EF_CUDA_SM21)" 
    /*0000*/   MOV R1, c[0x1][0x100];       /* 0x2800440400005de4 */ 
    /*0008*/   S2R R0, SR_CTAID.X;        /* 0x2c00000094001c04 */ 
    /*0010*/   SHL R3, R0, 0x1;        /* 0x6000c0000400dc03 */ 
    /*0018*/   S2R R2, SR_TID.X;        /* 0x2c00000084009c04 */ 
    /*0020*/   IMAD R3, R3, c[0x0][0x8], R2;     /* 0x200440002030dca3 */ 
    /*0028*/   IADD R4, R3, c[0x0][0x8];      /* 0x4800400020311c03 */ 
    /*0030*/   ISETP.LT.U32.AND P0, PT, R3, c[0x0][0x28], PT; /* 0x188e4000a031dc03 */ 
    /*0038*/   ISETP.GE.U32.AND P1, PT, R4, c[0x0][0x28], PT; /* 0x1b0e4000a043dc03 */ 
    /*0040*/  @P0 ISCADD R3, R3, c[0x0][0x20], 0x2;    /* 0x400040008030c043 */ 
    /*0048*/ @!P1 ISCADD R4, R4, c[0x0][0x20], 0x2;    /* 0x4000400080412443 */ 
    /*0050*/ @!P0 MOV R5, RZ;          /* 0x28000000fc0161e4 */ 
    /*0058*/ @!P1 LD R4, [R4];         /* 0x8000000000412485 */ 
    /*0060*/  @P0 LD R5, [R3];         /* 0x8000000000314085 */ 
    /*0068*/   SHL R3, R2, 0x2;        /* 0x6000c0000820dc03 */ 
    /*0070*/   NOP;           /* 0x4000000000001de4 */ 
    /*0078*/ @!P1 IADD R5, R4, R5;        /* 0x4800000014416403 */ 
    /*0080*/   MOV R4, c[0x0][0x8];       /* 0x2800400020011de4 */ 
    /*0088*/   STS [R3], R5;         /* 0xc900000000315c85 */ 
    /*0090*/   BAR.RED.POPC RZ, RZ, RZ, PT;     /* 0x50ee0000ffffdc04 */ 
    /*0098*/   MOV R6, c[0x0][0x8];       /* 0x2800400020019de4 */ 
    /*00a0*/   ISETP.LT.U32.AND P0, PT, R6, 0x42, PT;   /* 0x188ec0010861dc03 */ 
    /*00a8*/  @P0 BRA 0x118;          /* 0x40000001a00001e7 */ 
    /*00b0*/   NOP;           /* 0x4000000000001de4 */ 
    /*00b8*/   NOP;           /* 0x4000000000001de4 */ 
    /*00c0*/   MOV R6, R4;          /* 0x2800000010019de4 */ 
    /*00c8*/   SHR.U32 R4, R4, 0x1;       /* 0x5800c00004411c03 */ 
    /*00d0*/   ISETP.GE.U32.AND P0, PT, R2, R4, PT;   /* 0x1b0e00001021dc03 */ 
    /*00d8*/ @!P0 IADD R7, R4, R2;        /* 0x480000000841e003 */ 
    /*00e0*/ @!P0 SHL R7, R7, 0x2;        /* 0x6000c0000871e003 */ 
    /*00e8*/ @!P0 LDS R7, [R7];         /* 0xc10000000071e085 */ 
    /*00f0*/ @!P0 IADD R5, R7, R5;        /* 0x4800000014716003 */ 
    /*00f8*/ @!P0 STS [R3], R5;         /* 0xc900000000316085 */ 
    /*0100*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*0108*/   ISETP.GT.U32.AND P0, PT, R6, 0x83, PT;   /* 0x1a0ec0020c61dc03 */ 
    /*0110*/  @P0 BRA 0xc0;          /* 0x4003fffea00001e7 */ 
    /*0118*/   ISETP.GT.U32.AND P0, PT, R2, 0x1f, PT;   
    /*0120*/   SSY 0x2b8;          
    /*0128*/  @P0 BRA 0x2b0;          /* 0x40000006000001e7 */ 
    /*0130*/   LDS R4, [R3+0x80];        R4 = sdata[tid + 32] 
    /*0138*/   SSY 0x168;          
    /*0140*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0148*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0150*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0158*/ @!P0 BRA 0x140;          /* 0x4003ffff800021e7 */ 
    /*0160*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0168*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*0170*/   LDS R4, [R3+0x40];        R4 = sdata[tid + 16] 
    /*0178*/   SSY 0x1a8;          

    /*0180*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0188*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0190*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0198*/ @!P0 BRA 0x180;          /* 0x4003ffff800021e7 */ 
    /*01a0*/   NOP.S;           /* 0x4000000000001df4 */ 

    /*01a8*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*01b0*/   LDS R4, [R3+0x20];        R4 = sdata[tid + 8] 
    /*01b8*/   SSY 0x1e8;          
    /*01c0*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*01c8*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*01d0*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*01d8*/ @!P0 BRA 0x1c0;          /* 0x4003ffff800021e7 */ 
    /*01e0*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*01e8*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*01f0*/   LDS R4, [R3+0x10];        R4 = sdata[tid + 4] 
    /*01f8*/   SSY 0x228;          
    /*0200*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0208*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0210*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0218*/ @!P0 BRA 0x200;          /* 0x4003ffff800021e7 */ 
    /*0220*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0228*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*0230*/   LDS R4, [R3+0x8];        R4 = sdata[tid + 2] 
    /*0238*/   SSY 0x268;          
    /*0240*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0248*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0250*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0258*/ @!P0 BRA 0x240;          /* 0x4003ffff800021e7 */ 
    /*0260*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*0268*/   BAR.RED.POPC RZ, RZ, RZ, PT;     __syncthreads() 
    /*0270*/   LDS R4, [R3+0x4];        R4 = sdata[tid + 1] 
    /*0278*/   SSY 0x2a8;          
    /*0280*/   LDSLK P0, R5, [R3];        R5 = sdata[tid] (load from shared memory and lock) 
    /*0288*/  @P0 IADD R5, R5, R4;        R5 = R5 + R4 
    /*0290*/  @P0 STSUL [R3], R5;         sdata[tid] = R5 (store to shared memory and unlock) 
    /*0298*/ @!P0 BRA 0x280;          /* 0x4003ffff800021e7 */ 
    /*02a0*/   NOP.S;           /* 0x4000000000001df4 */ 
    /*02a8*/   BAR.RED.POPC RZ, RZ, RZ, PT;     /* 0x50ee0000ffffdc04 */ 
    /*02b0*/   ISETP.NE.AND.S P0, PT, R2, RZ, PT;    /* 0x1a8e0000fc21dc33 */ 
    /*02b8*/  @P0 BRA.U 0x2d8;         /* 0x40000000600081e7 */ 
    /*02c0*/ @!P0 ISCADD R0, R0, c[0x0][0x24], 0x2;    /* 0x4000400090002043 */ 
    /*02c8*/ @!P0 LDS R2, [RZ];         /* 0xc100000003f0a085 */ 
    /*02d0*/ @!P0 ST [R0], R2;         /* 0x900000000000a085 */ 
    /*02d8*/   EXIT;           /* 0x8000000000001de7 */ 

返回到你的真正的問題,內核被正確啓動,您可以輕鬆驗證原子對性能的影響。