存在於計算能力1.3 GPU的全局內存中的一個無符號字符數組的步進存取問題。爲了繞過全局存儲器的聚結的要求,螺紋依次訪問全局存儲器和複製僅使用2存儲器事務以下示例陣列到共享存儲器:如何避免將數據從全局數據加載到共享內存時發生銀行衝突
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
__global__ void kernel (unsigned char *d_text, unsigned char *d_out) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ unsigned char s_array[];
uint4 *uint4_text = (uint4 *) d_text;
uint4 var;
//memory transaction
var = uint4_text[0];
uchar4 c0 = *reinterpret_cast<uchar4 *>(&var.x);
uchar4 c4 = *reinterpret_cast<uchar4 *>(&var.y);
uchar4 c8 = *reinterpret_cast<uchar4 *>(&var.z);
uchar4 c12 = *reinterpret_cast<uchar4 *>(&var.w);
s_array[threadIdx.x*16 + 0] = c0.x;
s_array[threadIdx.x*16 + 1] = c0.y;
s_array[threadIdx.x*16 + 2] = c0.z;
s_array[threadIdx.x*16 + 3] = c0.w;
s_array[threadIdx.x*16 + 4] = c4.x;
s_array[threadIdx.x*16 + 5] = c4.y;
s_array[threadIdx.x*16 + 6] = c4.z;
s_array[threadIdx.x*16 + 7] = c4.w;
s_array[threadIdx.x*16 + 8] = c8.x;
s_array[threadIdx.x*16 + 9] = c8.y;
s_array[threadIdx.x*16 + 10] = c8.z;
s_array[threadIdx.x*16 + 11] = c8.w;
s_array[threadIdx.x*16 + 12] = c12.x;
s_array[threadIdx.x*16 + 13] = c12.y;
s_array[threadIdx.x*16 + 14] = c12.z;
s_array[threadIdx.x*16 + 15] = c12.w;
d_out[idx] = s_array[threadIdx.x*16];
}
int main (void) {
unsigned char *d_text, *d_out;
unsigned char *h_out = (unsigned char *) malloc (32 * sizeof (unsigned char));
unsigned char *h_text = (unsigned char *) malloc (32 * sizeof (unsigned char));
int i;
for (i = 0; i < 32; i++)
h_text[i] = 65 + i;
cudaMalloc ((void**) &d_text, 32 * sizeof (unsigned char));
cudaMalloc ((void**) &d_out, 32 * sizeof (unsigned char));
cudaMemcpy (d_text, h_text, 32 * sizeof (unsigned char), cudaMemcpyHostToDevice);
kernel<<<1,32,16128>>>(d_text, d_out);
cudaMemcpy (h_out, d_out, 32 * sizeof (unsigned char), cudaMemcpyDeviceToHost);
for (i = 0; i < 32; i++)
printf("%c\n", h_out[i]);
return 0;
}
的問題是,組衝突在將數據複製到共享內存時發生(由nvprof報告,上述示例的衝突爲384次),這會導致線程的序列化訪問。
共享內存分爲16個(或更新設備體系結構中的32個)32位存儲區,以便同時服務相同半經線的16個線程。數據交錯存儲在第i個32位字始終存儲在i%16-1共享存儲區中。
由於每個線程讀取一個內存事務的16個字節,這些字符將以交叉方式存儲到共享內存中。這會導致線程0,4,8,12之間的衝突; 1,5,9,13; 2,6,10,14; 3,7,11,15是同一個半經線。消除體衝突一個天真的方法是使用的if/else分支將數據存儲在類似以下內容的循環方式共享內存,但導致一些嚴重的線程分歧:
int tid16 = threadIdx.x % 16;
if (tid16 < 4) {
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
} else if (tid16 < 8) {
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
} else if (tid16 < 12) {
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
} else {
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
}
任何人都可以想出一個更好的解決方案呢?我已經研究過SDK的縮減例子,但我不確定它適用於我的問題。
我實際上將uint4字(128位)直接存儲到共享內存中。每字節應對是嘗試找到解決銀行衝突問題的臨時方法 – charis