1
我已經寫了一個CUDA函數來計算2D中一組點的凸包絡。但是它是極其比CPU代碼慢!CUDA warp投票函數使代碼變慢?
我正在使用warp投票函數和__syncronisation();相當多次。那麼這會讓代碼變慢嗎?
感謝
添加代碼:
__global__ void find_edges_on_device(TYPE * h_x, TYPE * h_y, int *h_edges){
int tidX = threadIdx.x;
int tidY = threadIdx.y;
int tid = tidY*blockSizeX + tidX;
int i = threadIdx.x+blockIdx.x*blockDim.x;
int j = threadIdx.y+blockIdx.y*blockDim.y;
int hxi = h_x[i];
int hxj = h_x[j];
int hyi = h_y[i];
int hyj = h_y[j];
long scalarProduct = 0;
TYPE nx;
TYPE ny;
bool isValid = true;
__shared__ int shared_X[blockSizeX*blockSizeY];
__shared__ int shared_Y[blockSizeX*blockSizeY];
__shared__ bool iswarpvalid[32];
__shared__ bool isBlockValid;
if (tid==0)
{
isBlockValid=true;
}
if (tid<(blockSizeX*blockSizeY-1)/32+1)
{
iswarpvalid[tid]=true;
}
else if (tid<32)
{
iswarpvalid[tid]=false;
}
//all the others points should be on the same side of the edge i,j
//normal to the edge (unnormalized)
nx = - (hyj- hyi);
ny = hxj- hxi;
int k=0;
while ((k==i)||(k==j))
{
k++;
} //k will be 0,1,or 2, but different from i and j to avoid
scalarProduct=nx* (h_x[k]-hxi)+ny* (h_y[k]-hyi);
if (scalarProduct<0)
{
nx*=-1;
ny*=-1;
}
for(int count = 0; count < ((NPOINTS/blockSizeX*blockSizeY) + 1); count++){
int globalIndex = tidY*blockSizeX + tidX + count*blockSizeX*blockSizeY;
if (NPOINTS <= globalIndex){
shared_X[tidY*blockSizeX + tidX] = -1;
shared_Y[tidY*blockSizeX + tidX] = -1;
}
else {
shared_X[tidY*blockSizeX + tidX]= h_x[globalIndex];
shared_Y[tidY*blockSizeX + tidX]= h_y[globalIndex];
}
__syncthreads();
//we have now at least one point with scalarProduct>0
//all the other points should comply with the same condition for
//the edge to be valid
//loop on all the points
if(i < j){
for (int k=0; k < blockSizeX*blockSizeY; k++)
{
if((count * blockSizeX*blockSizeY + k < NPOINTS)&&(isValid)) {
scalarProduct=nx* (shared_X[k]-hxi)+ny* (shared_Y[k]-hyi);
if(__all(scalarProduct) < 0){
iswarpvalid[(tidY*blockSizeX + tidX)/32] = false;
break;
}
else if(0 > (scalarProduct)){
isValid = false;
break;
}
}
}
}
__syncthreads();
if (tid<32)
{
isBlockValid=__any(iswarpvalid[tid]);
}
__syncthreads();
if(!isBlockValid) break;
}
if ((i<j) && (true == isValid)){
int tmp_i = i;
int tmp_j = j;
if(-1 != atomicCAS(&h_edges[2*i], -1, tmp_j))
h_edges[2*i+1]=j;
if(-1 != atomicCAS(&h_edges[2*j], -1, tmp_i))
h_edges[2*j+1]=i;
}
}
如果您發佈有關代碼(和/或代碼片段)的更多細節,人們將能夠參與知情推測而不是不知情的推測。 – ArchaeaSoftware