CUDA「卷積」與OpenMP版本一樣緩慢

我試圖用一個 modelWidth * modelHeight * 31 的模型立方體去「卷積」一個 featWidth * featHeight * 31 的特徵立方體。問題在於這個內核速度很慢(當然,比順序的CPU代碼快,但只和OpenMP版本一樣快)。我使用的是Quadro FX 1800(是的,只有64個CUDA內核……)。
// Model coefficients kept in constant memory: within the inner loops every
// thread of a warp reads the same d_model element, so the constant-cache
// broadcast serves the whole warp in a single access. 31 channels of at
// most 22x22 floats each = 60016 bytes (~58.6 KB of the 64 KB budget).
__constant__ float d_model[31*22*22];
// 24-bit integer multiply / multiply-add helpers: __mul24 is faster than a
// full 32-bit "*" on compute-capability 1.x parts (e.g. the Quadro FX 1800
// mentioned above). Operands must fit in 24 bits.
#define IMUL(a,b) (__mul24((a), (b)))
#define IMAD(a,b,c) (__mul24((a), (b)) + (c))
// Cross-correlate the 31-channel model (in __constant__ d_model) against the
// 31-channel feature cube (bound to texture texFeatures); one thread produces
// one output element.
//
// Layout (column-major within a channel slice):
//   score:    scoreIdx   = x * scoreHeight + y
//   model:    channel z starts at z * modelWidth * modelHeight
//   features: channel z starts at z * featWidth * featHeight; the column
//             stride works out to scoreHeight + modelHeight - 1 per x step.
//
// Launch: 2D grid/block covering at least scoreWidth x scoreHeight threads;
// out-of-range threads fall through the guard and do nothing.
__global__ void dMatch(float *score, const int featWidth, const int featHeight, const int modelWidth, const int modelHeight, const int scoreWidth, const int scoreHeight)
{
    const int x = IMAD(blockIdx.x, blockDim.x, threadIdx.x);
    const int y = IMAD(blockIdx.y, blockDim.y, threadIdx.y);
    if (x < scoreWidth && y < scoreHeight)
    {
        const int scoreIdx = IMAD(x, scoreHeight, y);
        // Top-left corner of the model window in a feature slice; equivalent
        // to x * (scoreHeight + modelHeight - 1) + y, split into two 24-bit
        // multiplies so each operand stays small.
        const int baseFeatIdx = IMUL(x, scoreHeight) + IMAD(modelHeight - 1, x, y);

        // Accumulate the whole correlation in a register and store it to
        // global memory exactly once. The original zeroed score[scoreIdx]
        // up front and did "score[scoreIdx] += value" on every channel
        // iteration — 32 needless global read-modify-write round trips per
        // thread, on a memory-bound kernel.
        float total = 0.f;
        for (int z = 0; z < 31; ++z)  // 31 feature channels
        {
            const int featIdx = IMAD(z, IMUL(featWidth, featHeight), baseFeatIdx);
            const int modelIdx = IMUL(z, IMUL(modelWidth, modelHeight));
            // 2D correlation of this model slice with the feature slice.
            for (int xx = 0; xx < modelWidth; xx++)
            {
                const int xxmodelIdx = IMAD(xx, modelHeight, modelIdx);
                const int xxfeatIdx = IMAD(xx, featHeight, featIdx);
                for (int yy = 0; yy < modelHeight; yy++)
                {
                    total += d_model[xxmodelIdx + yy] * tex1Dfetch(texFeatures, xxfeatIdx + yy);
                }
            }
        }
        // NOTE(review): adjacent threads in threadIdx.x touch addresses
        // scoreHeight apart, so neither this store nor the texture fetches
        // are contiguous across the warp — swapping the roles of x and y
        // (as the revised kernel below does) improves access locality.
        score[scoreIdx] = total;
    }
}
無論如何,我以 8*8 的線程塊、(scoreWidth/8)*(scoreHeight/8) 的網格尺寸啟動這個內核(scoreWidth 和 scoreHeight 是結果矩陣的大小)。我想知道你是否有任何線索:我的代碼中是什麼地方出了問題,或者為什麼會這麼慢。
編輯:
更快的版本(處理時間從480毫秒降到了150毫秒!),感謝 tera:
// Faster variant (after tera's suggestion): each thread computes FOUR
// vertically adjacent outputs (y, y+1, y+2, y+3) so the texture fetches are
// amortized — consecutive y positions reuse modelHeight-1 of the same
// feature values, carried along in the feat..feat3 register pipeline.
//
// Note the swapped roles versus the first kernel: blockIdx.x/threadIdx.x now
// walk the y (height) axis, which is the contiguous axis of the
// column-major layout.
//
// Launch: dim3 threads(16,16); dim3 grid(divup(scoreHeight,64), divup(scoreWidth,16));
// (64 = 16 threads * 4 outputs per thread along y).
__global__ void dMatch(float *score, const int featWidth, const int featHeight, const int modelWidth, const int modelHeight, const int scoreWidth, const int scoreHeight)
{
    const int y = IMUL(4, IMAD(blockIdx.x, blockDim.x, threadIdx.x));
    const int x = IMAD(blockIdx.y, blockDim.y, threadIdx.y);
    if (x < scoreWidth && y < scoreHeight)
    {
        const int scoreIdx = IMAD(x, scoreHeight, y);
        // Top-left corner of the model window in a feature slice;
        // equivalent to x * (scoreHeight + modelHeight - 1) + y.
        const int baseFeatIdx = IMUL(x, scoreHeight) + IMAD(modelHeight - 1, x, y);
        // Four per-output accumulators plus a 4-deep sliding window of
        // feature values, all kept in registers.
        float value = 0.f, value1 = 0.f, value2 = 0.f, value3 = 0.f;
        float feat, feat1, feat2, feat3;
        int featIdx = 0;
        int modelIdx = 0;
        int xxmodelIdx;
        int xxfeatIdx;
        float val;
        for (int z = 0; z < 31; ++z)  // 31 feature channels
        {
            featIdx = IMAD(z, IMUL(featWidth, featHeight), baseFeatIdx);
            modelIdx = IMUL(z, IMUL(modelWidth, modelHeight));
            for (int xx = 0; xx < modelWidth; xx++)
            {
                xxmodelIdx = IMAD(xx, modelHeight, modelIdx);
                xxfeatIdx = IMAD(xx, featHeight, featIdx);
                // Prime the 4-element window for this column.
                feat  = tex1Dfetch(texFeatures, xxfeatIdx + 0);
                feat1 = tex1Dfetch(texFeatures, xxfeatIdx + 1);
                feat2 = tex1Dfetch(texFeatures, xxfeatIdx + 2);
                feat3 = tex1Dfetch(texFeatures, xxfeatIdx + 3);
                for (int yy = 0; yy < modelHeight; yy++)
                {
                    // One model coefficient feeds all four outputs, then the
                    // window slides down by one element.
                    val = d_model[xxmodelIdx + yy];
                    value  += val * feat;
                    value1 += val * feat1;
                    value2 += val * feat2;
                    value3 += val * feat3;
                    feat  = feat1;
                    feat1 = feat2;
                    feat2 = feat3;
                    // NOTE(review): near the bottom edge this fetches up to 3
                    // elements past the last window row — presumably the
                    // feature buffer/texture is padded so this stays in
                    // bounds; TODO confirm against the allocation code.
                    feat3 = tex1Dfetch(texFeatures, xxfeatIdx + yy + 4);
                }
            }
        }
        // Guarded stores: the last thread along y may own fewer than 4
        // valid outputs.
        score[scoreIdx] = value;
        if (y + 1 < scoreHeight)
            score[scoreIdx + 1] = value1;
        if (y + 2 < scoreHeight)
            score[scoreIdx + 2] = value2;
        if (y + 3 < scoreHeight)
            score[scoreIdx + 3] = value3;
    }
}  // fixed: the kernel's closing brace was missing from the original paste
並以 dim3 threads(16,16); dim3 grid(divup(scoreHeight,64), divup(scoreWidth,16)); 啟動。
一些快速的想法:使用綁定到3D陣列的紋理來利用3D局部性。在一次傳遞中計算附近偏移處的多個結果,以分攤獲取數據的成本。得到一個體面的GPU,因爲你的速度沒有任何中途要快得多。 – tera
謝謝!是的,我會嘗試3D陣列,但我並不期望太多。當然,我的GPU不是很棒......而且我的Intel Xeon W3520 @ 2.67GHz。那麼...我正在等待我的新顯卡(GTX 680),我想我不會有任何優化問題:) – eg141840
我最期待的是一次產生多個輸出,因為額外的輸出基本上是免費的。 – tera