__global__ void cudaEuclid(float* A, float* B, float* C, int rows, int cols)
int i, squareeucldist = 0;
int r = blockDim.x * blockIdx.x + threadIdx.x; // rows
int c = blockDim.y * blockIdx.y + threadIdx.y; // cols
extern __shared__ float sdata[];
//int r = blockIdx.y; int c = threadIdx.x;
if(r < rows && c < cols ){
//C[r + rows*c] = (A[r + rows*c] - B[r + rows*c]) * (A[r + rows*c] - B[r + rows*c]);
sdata[threadIdx.x] = (A[r + rows*c] - B[r + rows*c]) * (A[r + rows*c] - B[r + rows*c]);
// contiguous range pattern
for(int offset = blockDim.x/2;
offset > 0;
offset >>= 1)
if(threadIdx.x < offset)
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
// wait until all threads in the block have
// updated their partial sums
// thread 0 writes the final result
if(threadIdx.x == 0)
C[r] = sdata[0];
dim3 dimBlock(cols, 1);
dim3 dimGrid(1, rows);
cudaEuclid<<<dimGrid, cols, cols*sizeof(float)>>>(d_A, d_B, d_C, rows, cols);
您測試過60x8嗎?或者您在60x5時停止了嗎?奇數列似乎沒有正確處理。或者甚至可能是2的給予'偏移>> = 1'的非冪... – chappjc
它正在爲60x8工作。 – Darkmoor
有道理,這就是問題所在,儘管Eric給出了一個完整的答案。 – chappjc