2
我已經寫了下面的代碼來將兩個4x4矩陣求和。Cuda矩陣加法
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
__global__ void Matrix_add(double* a, double* b, double* c,int n)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
int index = row * n + col;
if(col<n && row <n)
c[index] = a[index] + b[index];
}
int main()
{
int n=4;
double **h_a;
double **h_b;
double **h_c;
double *d_a, *d_b, *d_c;
int size = n*n*sizeof(double);
h_a = (double **) malloc(n*sizeof(double*));
h_b = (double **) malloc(n*sizeof(double*));
h_c = (double **) malloc(n*sizeof(double*));
cudaMalloc((void**)&d_a,size);
cudaMalloc((void**)&d_b,size);
cudaMalloc((void**)&d_c,size);
int t=0;
for (t=0;t<n;t++)
{
h_a[t]= (double *)malloc(n*sizeof(double));
h_b[t]= (double *)malloc(n*sizeof(double));
h_c[t]= (double *)malloc(n*sizeof(double));
}
int i=0,j=0;
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
h_a[i][j]=sin(i)*sin(i);
h_b[i][j]=cos(i)*cos(i);
}
}
cudaMemcpy(d_a,h_a+n,size,cudaMemcpyHostToDevice);
cudaMemcpy(d_b,h_b+n,size,cudaMemcpyHostToDevice);
dim3 dimBlock(4,4);
dim3 dimGrid(1,1);
Matrix_add<<<dimGrid, dimBlock>>>(d_a,d_b,d_c,n);
cudaMemcpy(h_c+n,d_c,size,cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
{
for(j=0;j<n;j++)
{
printf("%f",h_c[i][j]);
printf("\t");
}
printf("\n");
}
for(i=0;i<n;i++)
{
free(h_a[i]);
free(h_b[i]);
free(h_c[i]);
}
free(h_a);
free(h_b);
free(h_c);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
return 0;
}
結果該相加應該是一個2×2全1矩陣,但在結果矩陣的所有元素都爲0。此外,我得到這個消息得到結果後:
分段故障(核心傾銷)
任何人都可以請幫我找出問題。
謝謝
很好的答案。正確的解決方案是使用平面表示。該單獨分配的行數組很亂,容易出錯並且效率低下。你會習慣'h [i * row_stride + j]'(或者將它隱藏在宏的後面),並且你的簿記代碼將大大縮小。 – Peter