我正在嘗試運行一個MPI矩陣乘法的範例,只是我把它改成從文件讀取矩陣,結果當然就出錯了。(標題:使用MPI進行矩陣乘法)
具體來說,我得到這個錯誤:
Entering first MPI_Recv in p0 and recieving data from slave processor 1
Fatal error in MPI_Recv: Invalid count, error stack:
MPI_Recv(186): MPI_Recv(buf=0xbfd30930, count=-1807265191, MPI_FLOAT, src=0, tag=1, MPI_COMM_WORLD, status=0x804b080) failed
MPI_Recv(104): Negative count, value is -1807265191
下面是修改後的代碼:
/* Start MPI, then look up this process's rank (id) and the total number of processes (p). */
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
MPI_Comm_size(MPI_COMM_WORLD, &p);
/* Rank 0 is the master; the other p-1 ranks are the workers that multiply.
   NOTE(review): with a single process slaves is 0, and the master later divides by it (rows = n/slaves). */
slaves = p-1; //slaves=numworkers
/*---------------------------- master ----------------------------*/
/* Master (rank 0): checks the arguments, reads A and B from files, sends each worker a band
   of rows of A plus B using tag 1, then collects the result rows using tag 2 and writes C. */
if(id == 0)
{
/* check the number of arguments */
if(argc!=4)
{
printf("Invalid number of aguements!\n./program matrix_file1 matrix_file2 result_matrix_file\n");
/* NOTE(review): this and the other early returns below leave without reaching MPI_Finalize;
   only rank 0 takes them, while the other ranks go on to wait in MPI_Recv for rank 0. */
return -1;
}
/* read matrix A */
printf("read matrix A from %s\n", argv[1]);
/* presumably read_matrix stores the row pointers in a, the flat data in sa and the
   dimensions in i and j — verify against its definition */
read_matrix(argv[1],&a, &sa, &i, &j);
if(i != j)
{
printf("ERROR: matrix A not square\n");
return -1;
}
n = i;
/* read matrix B */
printf("read matrix B from %s\n", argv[2]);
read_matrix(argv[2],&b, &sb, &i, &j);
if(i != j)
{
printf("ERROR: matrix B not square\n");
return -1;
}
if(n != i)
{ printf("ERROR: matrix A and B incompatible\n");
return -1;
}
/* NOTE(review): divisibility is tested against p (all ranks), but the rows are split across
   slaves = p-1 below, where a remainder is already handled explicitly. */
if((n%p)!=0)
{
printf("ERROR: %d processor(s) cannot divide matrices %d x %d! \n", p,n,n);
return -1;
}
/* Base band height and number of leftover rows; the first remainPart workers get one extra row. */
rows = n/slaves;
offset=0;
remainPart=n%slaves;
for(dest=1;dest<=slaves;dest++)
{
if(remainPart>0)
{
/* Temporarily enlarge the band by one row for this worker, then restore the base height. */
originalRows=rows;
++rows;
remainPart--;
printf("Sending %d rows to task %d offset=%d\n",rows,dest,offset);
/* Message order per worker: offset, row count, rows of A starting at offset, then B.
   Note that n itself is not sent. */
MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
/* assumes the rows of a are contiguous in memory — TODO confirm against read_matrix */
MPI_Send(&a[offset][0], rows*n, MPI_FLOAT,dest,1, MPI_COMM_WORLD);
/* NOTE(review): &b is the address of the variable b itself; if b is a float** like c below,
   the matrix data starts at &b[0][0] instead — confirm. */
MPI_Send(&b, n*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
offset = offset + rows;
rows = originalRows;
}
else
{
/* Same four messages as above, using the base band height. */
printf("Sending %d rows to task %d offset=%d\n",rows,dest,offset);
MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD);
MPI_Send(&a[offset][0], rows*n, MPI_FLOAT,dest,1, MPI_COMM_WORLD);
MPI_Send(&b, n*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
offset = offset + rows;
}
}
/* initialize matrix C */
/* sc is one zeroed n*n buffer and c[i] points at row i inside it, so a band starting at
   c[offset][0] is contiguous. */
sc = (float*)malloc(n*n*sizeof(float));
memset(sc, 0, n*n*sizeof(float));
c = (float**)malloc(n*sizeof(float*));
for(i=0; i<n; i++) c[i] = &sc[i*n];
/* wait for results from all worker tasks */
for (k=1; k<=slaves; k++)
{
source = k;
printf("Entering first MPI_Recv in p0 and recieving data from slave processor %d\n", source);
/* Each worker reports its offset and row count first; those decide where its rows land in c. */
MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
printf("Entering second MPI_Recv in p0\n");
MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
printf("Entering third MPI_Recv in p0\n");
MPI_Recv(&c[offset][0], rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status);
}
/* NOTE(review): i was reused as the loop index above and equals n here; j is whatever the
   last read_matrix call stored. */
write_matrix(argv[3], sc, i, j);
free(sc);
free(c);
}
/* Worker (rank > 0): receives its offset, row count, band of A and all of B from rank 0
   (tag 1), multiplies, and sends offset, row count and the result rows back (tag 2). */
if(id>0)
{
source = 0;
//printf("Entered first MPI_Recv for process %d\n", id);
MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
//printf("Entered second MPI_Recv for process %d\n", id);
MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
//printf("Entered third MPI_Recv for process %d\n", id);
/* NOTE(review): in the visible code n is assigned only inside the id == 0 branch and is never
   received here, so rows*n and n*n below use whatever value n holds on this rank. */
/* NOTE(review): &a and &b are the addresses of the variables themselves; no storage for the
   matrices is set up in this branch in the visible code — confirm the intended buffers. */
MPI_Recv(&a, rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
//printf("Entered fourth MPI_Recv for process %d\n", id);
MPI_Recv(&b, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
/* Matrix multiplication */
/* Accumulates into c[l][k]; c is allocated and zeroed only inside the id == 0 branch in the
   visible code. */
for (k=0; k<n; k++)
for (l=0; l<rows; l++) {
for (m=0; m<n; m++)
c[l][k] = c[l][k] + a[l][m] * b[m][k];
}
//printf("Entered first MPI_send for process %d\n", id);
MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
//printf("Entered second MPI_send for process %d\n", id);
MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
//printf("Entered third MPI_send for process %d\n", id);
/* NOTE(review): &c is the address of the variable c, not of the computed rows — confirm. */
MPI_Send(&c, rows*n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD);
}
MPI_Finalize();}
先前我錯誤地遍歷了所有進程,而不只是工作進程,這一點我已經修正了,但我不知道這個隨機的負數是從哪裏來的。特別是因爲在打印語句後面跟着的內容
/* Excerpt of the master's collection step: receives one int (offset), one int (rows), then
   rows*n floats into c starting at row offset, all with tag 2 from the given source rank. */
printf("Entering first MPI_Recv in p0 and recieving data from slave processor %d\n", source);
MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
printf("Entering second MPI_Recv in p0\n");
MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
printf("Entering third MPI_Recv in p0\n");
MPI_Recv(&c[offset][0], rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status);
其數量只不過是1,以及原始維度n乘以分配給從屬進程的平均行數。先謝謝了。
更新:好,問題的一部分似乎是:我的數組在主進程中已經分配了空間,但在從屬進程中卻沒有。
意識到這一點後,我在檢查進程是否爲工作進程之前,爲各個矩陣添加了緩衝區,以便傳輸它們的數據。不過打印語句沒有顯示出來,顯然事情沒有按計劃進行。
/* Second attempt: n x n variable-length buffers declared before the rank check, filled from
   a and b, and used by the workers in place of a, b and c.
   NOTE(review): the buffers are sized by n, which this snippet never receives from rank 0. */
float buffA[n][n];
float buffB[n][n];
float buffC[n][n];
/* NOTE(review): this copy loop is outside the id>0 check, so it reads a[l][m] and b[l][m] on
   every rank that reaches it. */
for(l=0;l<n;l++)
for(m=0;m<n;m++)
{
buffA[l][m]=a[l][m];
buffB[l][m]=b[l][m];
//buffA[l][m]=sa[(i*n) + j];
//buffB[l][m]=sb[(i*n) + j];
printf("buffA[%d][%d] =%f\n",l,m, buffA[l][m]);
printf("buffB[%d][%d] =%f\n",l,m,buffB[l][m]);
}
/* Worker side: same tag-1 receive / multiply / tag-2 send sequence, now on the buffers. */
if(id>0)
{
/*read_matrix(argv[1],&a, &sa, &i, &j);
read_matrix(argv[2],&b, &sb, &i, &j);*/
source = 0;
printf("Entered first MPI_Recv for process %d\n", id);
MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
printf ("offset =%d\n", offset);
MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
printf ("row =%d\n", rows);
/* The band is stored starting at row offset of buffA, while the multiplication below reads
   buffA rows 0..rows-1. */
MPI_Recv(&buffA[offset][0], rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
printf("buffA[offset][0] =%f\n", buffA[offset][0]); //they're not getting the matrices
MPI_Recv(&buffB, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
//printf ("b=\n");
/* Matrix multiplication */
/* NOTE(review): buffC is accumulated into without being initialised anywhere in this snippet
   (the zeroing line below is commented out). */
for (k=0; k<n; k++)
for (l=0; l<rows; l++) {
//c[l][k]=0.0;
for (m=0; m<n; m++)
buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
//printf("c[%d][%d]= %f\n", l,k, c[l][k]);
}
//printf("Entered first MPI_send for process %d\n", id);
MPI_Send(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD);
//printf("Entered second MPI_send for process %d\n", id);
MPI_Send(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD);
//printf("Entered third MPI_send for process %d\n", id);
/* Sends the first rows*n floats of buffC, i.e. its rows 0..rows-1. */
MPI_Send(&buffC, rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD);
printf("Exit via MPI_send for process %d\n", id);
}
錯誤信息中的數值也改變了,雖然我不確定這是否說明了什麼。
Fatal error in MPI_Recv: Invalid count, error stack:
MPI_Recv(186): MPI_Recv(buf=0xbf8e642c, count=-8, MPI_FLOAT, src=0, tag=1,MPI_COMM_WORLD, status=0x804c088) failed
MPI_Recv(104): Negative count, value is -8
好吧,現在我發現維度n沒有被傳送過去,這就是最初出現隨機負數的原因。所以我爲n添加了send和recv。現在看來最後的問題是如何用MPI傳輸動態分配的數組。仍在努力。
更新
現在可以運行了,當前能運行的代碼如下,雖然我想乘法的結果還是一團糟,但一步一步來吧。 XP
/* Third attempt (worker side): n is now received from rank 0 as well, and the buffers are
   sized after offset, rows and n are known. */
if(id>0)
{
source = 0;
printf("Entered first MPI_Recv for process %d\n", id);
MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
printf ("offset =%d\n", offset);
MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
/* The receive order here is offset, rows, n; the master must send in the same order. */
MPI_Recv(&n, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status);
printf ("row =%d\nn=%d\n", rows,n);
float buffA[rows][n];
float buffB[n][n];
float buffC[rows][n];
/* NOTE(review): buffA has only rows rows, but the band is written starting at row offset; for
   any offset > 0 this runs past the end of buffA, and the loop below reads rows 0..rows-1. */
MPI_Recv(&buffA[offset][0], rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
printf("buffA[offset][0] =%f\n", buffA[offset][0]); //they're not getting the matrices
MPI_Recv(&buffB, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
//printf ("b=\n");
/* Matrix multiplication */
/* NOTE(review): buffC is never initialised before being accumulated into (the zeroing line
   below is commented out). */
for (k=0; k<n; k++)
for (l=0; l<rows; l++) {
//c[l][k]=0.0;
for (m=0; m<n; m++)
//buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
//printf("c[%d][%d]= %f\n", l,k, c[l][k]);
buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
}
//printf("Entered first MPI_send for process %d\n", id);
MPI_Send(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD);
//printf("Entered second MPI_send for process %d\n", id);
MPI_Send(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD);
//printf("Entered third MPI_send for process %d\n", id);
MPI_Send(&buffC, rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD);
printf("Exit via MPI_send for process %d\n", id);
}
結果
0.00 -0.00 -0.00 -0.00 -0.00 -0.00 0.00 0.00
0.00 -0.00 -0.00 -0.00 -1.26 -1.26 -0.00 -1.26
-0.00 -1.26 -0.00 0.00 -0.00 0.00 0.00 0.00
-0.00 0.00 -0.00 -0.00 0.00 -0.00 -0.00 0.00
0.00 0.00 0.00 0.00 -0.00 -1.26 -0.00 0.00
-0.00 -0.00 0.00 35833769696167556769392596671120015360.00 0.00 0.00 -0.00 0.00
-0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 0.00
0.00 -nan -0.00 -0.00 -0.00 -0.00 -0.00 -0.00
您是否嘗試過用調試器來運行它?代碼不長。你可以單步執行到MPI_Recv的調用處,我認爲這樣應該能相對容易地找出問題所在。 –
到目前爲止,我只使用了打印語句。有什麼建議嗎?普通的gdb看起來有點複雜。 – user2243369
好的值得一試。 – user2243369