__global__ void gpu_Heat (float *h, float *g, float * sum, int N) {
int nbx, bx, nby, by;
float diff = 0.0;
nbx = (N-2)/blockDim.x;
bx = nbx/gridDim.x;
nby = (N-2)/blockDim.y;
by = nby/gridDim.y;
unsigned int ii = blockIdx.x*blockDim.x+threadIdx.x;
unsigned int jj = blockIdx.y*blockDim.y+threadIdx.y;
unsigned int jid = (ii)*(N-2)+(jj);
for (int i=1+ii*bx; i<=min((ii+1)*bx, N-2); i++)
for (int j=1+jj*by; j<=min((jj+1)*by, N-2); j++) {
g[i*N+j]= 0.25 * (h[ i*N + (j-1)]+
h[ i*N +(j+1) ]+
h[ (i-1)*N + j]+
h[ (i+1)*N + j]);
diff = g[i*N+j] - h[i*N+j];
sum[(i-1)*(N-2)+(j-1)] = diff * diff;
}
__syncthreads();
for(unsigned int s=((N-2)*(N-2))/2; s>0; s>>=1){
if(jid<s){
sum[jid]+=sum[jid+s];
}
__syncthreads();
}
}
所以在這裏我的問題是,總和[0]的值,即包含了與內核的每個執行並行減少變化的最終結果,即使投入是相同的,我不知道我做錯了什麼。如果在CPU中減少了相同的總和矩陣,那麼執行效果很好,但是GPU的並行減少會給我帶來問題。CUDA:總和值[0]每次執行變更
dim3 Grid = (16,16);
dim3 Block = (16,16);
gpu_Heat<<<Grid,Block>>>(dev_u, dev_uhelp, dev_sum, np);
cudaThreadSynchronize(); // wait for all threads to complete
cudaErrorCheck(cudaMemcpy(param.u,dev_u,np*np*sizeof(float),cudaMemcpyDeviceToHost));
cudaErrorCheck(cudaMemcpy(param.uhelp,dev_uhelp,np*np*sizeof(float),cudaMemcpyDeviceToHost));
cudaErrorCheck(cudaMemcpy(sum,dev_sum,sum_size*sizeof(float),cudaMemcpyDeviceToHost));
我在這裏已經明確地顯示了由我正在使用的測試中的代碼計算的塊和網格參數。感謝您的回答
你可以發佈內核的函數調用,以及你用來初始化內核的網格和塊參數,它有助於弄清楚它爲什麼會失敗 – Bharat
如上所述,但是如果它改變每個調用,這意味着你的代碼中的競爭條件! –
@Bharat我已經添加了你問過的部分代碼。 –