
I'm trying to run an MPI matrix multiplication example (Matrix multiplication using MPI), except that I've modified it to read the matrices from files, and of course things blow up.

Specifically, I get this error:

Entering first MPI_Recv in p0 and recieving data from slave processor 1 
Fatal error in MPI_Recv: Invalid count, error stack: 
MPI_Recv(186): MPI_Recv(buf=0xbfd30930, count=-1807265191, MPI_FLOAT, src=0, tag=1, MPI_COMM_WORLD, status=0x804b080) failed 
MPI_Recv(104): Negative count, value is -1807265191 

Here is the modified code:

MPI_Init(&argc, &argv); 
MPI_Comm_rank(MPI_COMM_WORLD, &id); 
MPI_Comm_size(MPI_COMM_WORLD, &p); 
slaves = p-1; //slaves=numworkers 
/*---------------------------- master ----------------------------*/ 
if(id == 0) 
    { 
    /* check the number of arguments */ 

    if(argc!=4) 
    { 
     printf("Invalid number of aguements!\n./program matrix_file1 matrix_file2 result_matrix_file\n"); 
     return -1; 
    } 

     /* read matrix A */ 
    printf("read matrix A from %s\n", argv[1]); 
    read_matrix(argv[1],&a, &sa, &i, &j); 

    if(i != j) 
    { 
     printf("ERROR: matrix A not square\n"); 
     return -1; 
    } 



    n = i; 



    /* read matrix B */ 
    printf("read matrix B from %s\n", argv[2]); 
    read_matrix(argv[2],&b, &sb, &i, &j); 



    if(i != j) 
    {  
      printf("ERROR: matrix B not square\n"); 
      return -1; 
    } 

    if(n != i) 
    { printf("ERROR: matrix A and B incompatible\n"); 
     return -1; 
    } 



    if((n%p)!=0) 
    { 
     printf("ERROR: %d processor(s) cannot divide matrices %d x %d! \n", p,n,n); 
     return -1; 
    } 



     rows = n/slaves; 
     offset=0; 
     remainPart=n%slaves; 


    for(dest=1;dest<=slaves;dest++) 
    { 


     if(remainPart>0) 
     { 
      originalRows=rows; 
      ++rows; 
      remainPart--; 
      printf("Sending %d rows to task %d offset=%d\n",rows,dest,offset); 
      MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); 
      MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); 
      MPI_Send(&a[offset][0], rows*n, MPI_FLOAT,dest,1, MPI_COMM_WORLD); 
      MPI_Send(&b, n*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD); 
      offset = offset + rows; 
      rows = originalRows; 

     } 
     else 
     { 
      printf("Sending %d rows to task %d offset=%d\n",rows,dest,offset); 
      MPI_Send(&offset, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); 
      MPI_Send(&rows, 1, MPI_INT, dest, 1, MPI_COMM_WORLD); 
      MPI_Send(&a[offset][0], rows*n, MPI_FLOAT,dest,1, MPI_COMM_WORLD); 
      MPI_Send(&b, n*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD); 
      offset = offset + rows; 
     } 
    } 
    /* initialize matrix C */ 

    sc = (float*)malloc(n*n*sizeof(float)); 
    memset(sc, 0, n*n*sizeof(float)); 
    c = (float**)malloc(n*sizeof(float*)); 
    for(i=0; i<n; i++) c[i] = &sc[i*n]; 

    /* wait for results from all worker tasks */ 
    for (k=1; k<=slaves; k++)  
    {    
    source = k; 
    printf("Entering first MPI_Recv in p0 and recieving data from slave processor %d\n", source); 
    MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); 
    printf("Entering second MPI_Recv in p0\n"); 
    MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); 
    printf("Entering third MPI_Recv in p0\n"); 
    MPI_Recv(&c[offset][0], rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status); 
    }  


    write_matrix(argv[3], sc, i, j); 

    free(sc); 
    free(c); 
    } 



if(id>0) 
{ 
     source = 0; 
     //printf("Entered first MPI_Recv for process %d\n", id); 
     MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     //printf("Entered second MPI_Recv for process %d\n", id); 
     MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     //printf("Entered third MPI_Recv for process %d\n", id); 
     MPI_Recv(&a, rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     //printf("Entered fourth MPI_Recv for process %d\n", id); 
     MPI_Recv(&b, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     /* Matrix multiplication */ 
     for (k=0; k<n; k++)
      for (l=0; l<rows; l++) {
       for (m=0; m<n; m++)
        c[l][k] = c[l][k] + a[l][m] * b[m][k];
      }


     //printf("Entered first MPI_send for process %d\n", id); 
     MPI_Send(&offset, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); 
     //printf("Entered second MPI_send for process %d\n", id); 
     MPI_Send(&rows, 1, MPI_INT, 0, 2, MPI_COMM_WORLD); 
     //printf("Entered third MPI_send for process %d\n", id); 
     MPI_Send(&c, rows*n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD); 


} 






MPI_Finalize();
}

Beforehand I was mistakenly going through this for all of the processes instead of just the workers; I've fixed that, but I have no idea where the random negative count is coming from. Especially since what follows the print statement

printf("Entering first MPI_Recv in p0 and recieving data from slave processor %d\n", source); 
    MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); 
    printf("Entering second MPI_Recv in p0\n"); 
    MPI_Recv(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status); 
    printf("Entering third MPI_Recv in p0\n"); 
    MPI_Recv(&c[offset][0], rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD, &status); 

is nothing more than a receive whose count is the original dimension n multiplied by the rows handed to the slave. Thanks in advance.
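The only extra check I can think of right now is dumping the pieces of that count on every rank right before the big receives, something along these lines (just a debugging sketch, nothing more):

     /* debugging sketch: tag the output with the rank so the lines can be
        told apart even if they interleave */
     printf("[rank %d] before recv: offset=%d rows=%d n=%d rows*n=%d\n",
         id, offset, rows, n, rows*n);
     fflush(stdout);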

UPDATE: Okay, so part of the problem seems to be that, while my arrays have space allocated for them in the master, that isn't the case for the slave processes.

Once I realized this, I added buffers for the matrices before the check of whether a processor is a worker, so that their data could be transferred into them. It clearly isn't going as planned, though, as the print statements show.

float buffA[n][n]; 
float buffB[n][n]; 
float buffC[n][n]; 

for(l=0;l<n;l++) 
    for(m=0;m<n;m++) 
    { 
     buffA[l][m]=a[l][m]; 
     buffB[l][m]=b[l][m]; 

         //buffA[l][m]=sa[(i*n) + j]; 
         //buffB[l][m]=sb[(i*n) + j]; 
     printf("buffA[%d][%d] =%f\n",l,m, buffA[l][m]); 
     printf("buffB[%d][%d] =%f\n",l,m,buffB[l][m]); 
    } 

if(id>0) 
{ 
     /*read_matrix(argv[1],&a, &sa, &i, &j); 
     read_matrix(argv[2],&b, &sb, &i, &j);*/ 



     source = 0; 
     printf("Entered first MPI_Recv for process %d\n", id); 
     MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     printf ("offset =%d\n", offset); 
     MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     printf ("row =%d\n", rows); 
     MPI_Recv(&buffA[offset][0], rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     printf("buffA[offset][0] =%f\n", buffA[offset][0]); //they're not getting the matrices 
     MPI_Recv(&buffB, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     //printf ("b=\n"); 

     /* Matrix multiplication */ 
     for (k=0; k<n; k++)
      for (l=0; l<rows; l++) {
       //c[l][k]=0.0;
       for (m=0; m<n; m++)
        buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
        //printf("c[%d][%d]= %f\n", l,k, c[l][k]);
      }


     //printf("Entered first MPI_send for process %d\n", id); 
     MPI_Send(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD); 
     //printf("Entered second MPI_send for process %d\n", id); 
     MPI_Send(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD); 
     //printf("Entered third MPI_send for process %d\n", id); 
     MPI_Send(&buffC, rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD); 

     printf("Exit via MPI_send for process %d\n", id); 
} 

The error number has also changed, though I'm not sure whether that indicates anything.

Fatal error in MPI_Recv: Invalid count, error stack: 
MPI_Recv(186): MPI_Recv(buf=0xbf8e642c, count=-8, MPI_FLOAT, src=0, tag=1,MPI_COMM_WORLD, status=0x804c088) failed 
MPI_Recv(104): Negative count, value is -8 

Okay, so now I've found that the dimension n was never being transferred, and that is what caused the initial random negative count. So I've added a send and a recv for n. Now it seems the last problem is how to transfer dynamically allocated arrays with MPI. Still working on it.
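From what I've read, the usual trick for dynamically allocated matrices seems to be one contiguous block plus an array of row pointers into it (the same layout the master already uses for c and sc), so the whole matrix can be sent with a single count. Something like this is what I'm aiming for (a sketch, not working code yet):

     /* sketch: contiguous storage plus row pointers, so MPI can treat the
        matrix as one flat buffer of floats */
     sa = (float*)malloc(n*n*sizeof(float));   /* the actual data, contiguous */
     a  = (float**)malloc(n*sizeof(float*));   /* row pointers, so a[i][j] still works */
     for(i=0; i<n; i++) a[i] = &sa[i*n];

     /* then a block of rows is just a flat slice of that buffer */
     MPI_Send(&sa[offset*n], rows*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);
     MPI_Send(sb, n*n, MPI_FLOAT, dest, 1, MPI_COMM_WORLD);   /* all of B */

That way the worker can receive straight into a flat buffer of its own and still index it as a 2D array.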

UPDATE

It works. The current working code is below, although the multiplication is all over the place I think, but baby steps. XP

if(id>0) 
{ 




     source = 0; 
     printf("Entered first MPI_Recv for process %d\n", id); 
     MPI_Recv(&offset, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     printf ("offset =%d\n", offset); 
     MPI_Recv(&rows, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     MPI_Recv(&n, 1, MPI_INT, source, 1, MPI_COMM_WORLD, &status); 
     printf ("row =%d\nn=%d\n", rows,n); 

     float buffA[rows][n]; 
     float buffB[n][n]; 
     float buffC[rows][n]; 


     MPI_Recv(&buffA[offset][0], rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     printf("buffA[offset][0] =%f\n", buffA[offset][0]); //they're not getting the matrices 
     MPI_Recv(&buffB, n*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status); 
     //printf ("b=\n"); 

     /* Matrix multiplication */ 
     for (k=0; k<n; k++)
      for (l=0; l<rows; l++) {
       //c[l][k]=0.0;
       for (m=0; m<n; m++)
        //buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
        //printf("c[%d][%d]= %f\n", l,k, c[l][k]);
        buffC[l][k] = buffC[l][k] + buffA[l][m] * buffB[m][k];
      }


     //printf("Entered first MPI_send for process %d\n", id); 
     MPI_Send(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD); 
     //printf("Entered second MPI_send for process %d\n", id); 
     MPI_Send(&rows, 1, MPI_INT, source, 2, MPI_COMM_WORLD); 
     //printf("Entered third MPI_send for process %d\n", id); 
     MPI_Send(&buffC, rows*n, MPI_FLOAT, source, 2, MPI_COMM_WORLD); 

     printf("Exit via MPI_send for process %d\n", id); 
} 

Result:

0.00 -0.00 -0.00 -0.00 -0.00 -0.00 0.00 0.00 
0.00 -0.00 -0.00 -0.00 -1.26 -1.26 -0.00 -1.26 
-0.00 -1.26 -0.00 0.00 -0.00 0.00 0.00 0.00 
-0.00 0.00 -0.00 -0.00 0.00 -0.00 -0.00 0.00 
0.00 0.00 0.00 0.00 -0.00 -1.26 -0.00 0.00 
-0.00 -0.00 0.00 35833769696167556769392596671120015360.00 0.00 0.00 -0.00 0.00 
-0.00 -0.00 0.00 -0.00 -0.00 0.00 0.00 0.00 
0.00 -nan -0.00 -0.00 -0.00 -0.00 -0.00 -0.00 
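My guess for the garbage is that buffC is never zeroed before the accumulation, and that receiving into &buffA[offset][0] is wrong now that the worker's buffer only has rows rows starting at 0. Something like this is what I plan to try next on the worker side (untested sketch):

     /* untested sketch of the two fixes I want to try */
     memset(buffC, 0, sizeof(buffC));   /* start the accumulation from zero (sizeof works on a VLA) */
     MPI_Recv(&buffA[0][0], rows*n, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);   /* local block starts at row 0 */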

Have you tried running it through a debugger? The code isn't long. You could break at MPI_Recv and figure out what's going on relatively easily, I think. –


So far I've only used print statements. Any suggestions? Plain gdb looks a bit complicated for this. – user2243369


Okay, it's worth a try. – user2243369

Answer


(Moved from the comments so that this question can have an answer)

Print statements are notoriously unreliable in distributed environments. There's no guarantee that they arrive in order relative to each other. GDB really isn't that bad. You don't need to attach to all of the processes; just pick one. You can check out my answer here (stackoverflow.com/questions/17347778/...) to see how to do it.
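If it helps, one low-tech way to attach to a single rank is to park it in a loop until gdb connects, roughly like this (a sketch; the helper name and the flag are just placeholders):

     #include <stdio.h>
     #include <unistd.h>

     /* sketch: call this early on the rank you want to inspect; attach gdb to the
        printed PID, then "set var wait_for_debugger = 0" and "continue" */
     static void wait_for_gdb(int rank)
     {
         volatile int wait_for_debugger = 1;
         printf("rank %d has PID %d, waiting for gdb to attach\n", rank, (int)getpid());
         fflush(stdout);
         while (wait_for_debugger)
             sleep(1);
     }

Run the program normally, note the PID it prints, attach with gdb -p <PID> from another terminal, flip the flag, and you can step through the receives on just that rank.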


But this isn't an answer to the question. It would be better if the OP posted an answer to her own question rather than editing the answer into the question itself. There's no need for this question to have an answer, and a comment masquerading as an answer is still a comment. –


Fair enough. Really, this question should probably be closed as what used to be called "too localized", since it's a debugging question. However, I looked at some of the advice on meta (http://meta.stackexchange.com/questions/125384/debug-this-code-for-me-questions) and the accepted answer there is to tell people how to debug, as an answer. I won't object if this gets closed, though. I just don't know what the OP could say that would actually be an "answer" to this "question". –