
I am using MPI to parallelize my C++ interplanetary trajectory optimization program. A large part of it is the ability to distribute the load to multiple worker nodes, have them perform some computation on the assigned data, and return the results to the master node. I believe I am using the asynchronous communication routines MPI_Isend and MPI_Irecv, together with MPI_Wait, correctly in my program. However, I am running into abrupt program termination with EXIT CODE: 11, which I believe indicates a segmentation fault. I have searched Stack Overflow thoroughly on this topic and made sure to avoid the mistakes other people have made in their code, yet my code still does not work: MPI_Isend and MPI_Irecv run into a segmentation fault. Here is the code:

mat GeneticAlgorithm::mpi_pool_fitness(mat pool, int flyby_limit, int source, int target, bool isSolar, vec mu_system, vec rp_system, cube ephemerides, IPMGAConfig config) 
{ 
    int poolsize = size(pool,0); 
    int chromsize = size(pool,1); 
    double* poolptr = NULL; 
    mat rPool = zeros(poolsize,chromsize+1); 

    int world_rank; 
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 
    int world_size; 
    MPI_Comm_size(MPI_COMM_WORLD, &world_size); 

    MPI_Request* rq_status = (MPI_Request*)malloc(world_size*sizeof(MPI_Request)); 
    MPI_Status* status = (MPI_Status*)malloc(world_size*sizeof(MPI_Status)); 

    int k = 0; 
    if (world_rank == 0) 
    { 
     //pool.print(); 
     //initialize poolptr with input pool elements, since mat is stored in memory column by column, it's not possible to use memptr() function 
     poolptr = (double *) malloc(sizeof(double)*poolsize*chromsize); 
     for(int i=0;i<poolsize;i++) 
     { 
      for (int j=0;j<chromsize;j++) 
      { 
       poolptr[k++] = pool(i,j); 
       //cout << poolptr[k-1] << " " ; 
      } 
      //cout << endl; 
     } 
    } 

    double perproc = poolsize/(world_size-1); 
    int elems_per_proc = (int)perproc; 
    if (elems_per_proc*(world_size-1) < poolsize) 
    { 
     elems_per_proc = elems_per_proc + 1; 
    } 
    //cout << world_rank << " Elements per processor : " << elems_per_proc << endl; 
    if (world_rank == 0) 
    { 
     //cout << "poolptr size: " << k << endl; 
     //cout << "expected poolsize: " << (world_size-1)*elems_per_proc*chromsize << endl; 
     //MPI_Scatter(poolptr,elems_per_proc*chromsize,MPI_DOUBLE,row,elems_per_proc*chromsize,MPI_DOUBLE,0,MPI_COMM_WORLD); 
     for (int i=1;i<world_size;i++) 
     { 
      cout << "0 Scattering chromosomes to processor: " << i << endl; 
      MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],elems_per_proc*chromsize,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]); 
     } 
     /* 
     for (int i=1;i<world_size;i++) 
     { 
      MPI_Wait(&rq_status[i],&status[i]); 
     } 
     */ 
     cout << "0 successfully sent off chromosomes for fitness evaluation....." << endl; 
     free(poolptr); 
    } 

    double *row[100]; 
    double *iResults[100]; 
    mat iPool = zeros(poolsize,chromsize+1); 
    if (world_rank != 0) 
    { 
     row[world_rank] = (double*)malloc(sizeof(double)*elems_per_proc*chromsize); 
     cout << world_rank << " Starting to receive chromosomes from processor 0" << endl; 
     MPI_Irecv(&row[world_rank],elems_per_proc*chromsize,MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]); 
     MPI_Wait(&rq_status[0],&status[0]); 
     cout << world_rank << " Received chromosomes from processor 0" << endl; 
     //Convert MPI data back to arma matrix 
     for (int i=0;i<elems_per_proc;i++) 
     { 
      cout << "Composing " << i << "th element at the given processor " << world_rank << endl; 
      k = 1; 
      for (int j=0;j<chromsize;j++,k++) 
      { 
       iPool(((world_rank-1)*elems_per_proc)+i,k)=row[world_rank][(i*chromsize)+j]; 
      } 
     } 
     //iPool.print(); 
     //Compute the fitness of each chromosome in intermediate pool 
     cout << world_rank << " Attempting fitness calculations....." << endl; 
     for (int i=0;i<elems_per_proc;i++) 
     { 
      iPool(((world_rank-1)*elems_per_proc)+i,span(0,chromsize)) = fitness_multi_rev_lambert(iPool(((world_rank-1)*elems_per_proc)+i,span(1,chromsize)),flyby_limit,source,target,isSolar,mu_system,rp_system,ephemerides,config); 
     } 
     cout << world_rank << " Successfully finished fitness calculations....." << endl; 
     //iPool.print(); 
     //Convert the results back to MPI data type 
     iResults[world_rank]=(double *) malloc(sizeof(double)*elems_per_proc*(chromsize+1));// = iPool.memptr(); 
     k=0; 
     for(int i=0;i<elems_per_proc;i++) 
     { 
      for (int j=0;j<chromsize+1;j++) 
      { 
       iResults[world_rank][k++] = iPool(((world_rank-1)*elems_per_proc)+i,j); 
      } 
     } 
     //cout << world_rank << " Starting to send processed chromosomes to processor 0" << endl; 
     MPI_Isend(&iResults[world_rank],elems_per_proc*(chromsize+1),MPI_DOUBLE,0,world_rank,MPI_COMM_WORLD,&rq_status[0]); 
     //cout << world_rank << " Sent processed chromosomes to processor 0" << endl; 
     MPI_Wait(&rq_status[0],&status[0]); 
    } 
    //Declare a variable holder for global results 
    if (world_rank == 0) 
    { 
     double* gResults = (double*)malloc(sizeof(double)*poolsize*(chromsize+1)); 

     //cout << "0 Gathering chromosomes with fitness evaluated from all processors...." << endl; 
     //MPI_Gather(iResults,elems_per_proc*(chromsize+1),MPI_DOUBLE,gResults,poolsize*(chromsize+1),MPI_DOUBLE,0,MPI_COMM_WORLD); 
     k=0; 
     for (int i=1;i<world_size;i++) 
     { 
      MPI_Irecv(&gResults[(i-1)*elems_per_proc*(chromsize+1)],elems_per_proc*(chromsize+1),MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]); 
     } 
     cout << "0 waiting to hear back from all the worker nodes...." << endl; 
     for(int i=1;i<world_size;i++) 
     { 
      MPI_Wait(&rq_status[i],&status[i]); 
     } 
     cout << "Populating return pool...." << endl; 
     for (int i=0;i<poolsize;i++) 
     { 
      for(int j=0;j<chromsize+1;j++) 
      { 
       rPool(i,j) = gResults[(i*(chromsize+1))+j]; 
      } 
     } 
     //cout << "Finished populating return pool...." << endl; 
    } 

    free(rq_status); 
    free(status); 
    return rPool; 
} 

The program shows various symptoms of the problems I found while searching Stack Overflow. For example, the MPI_Isend from the master node only works when I specify '-n 11' or '-n 26' in my mpiexec command; for every other number of nodes, the master node runs into a segmentation fault. When the MPI_Isend from the master does work, the worker nodes run into a segmentation fault instead, I think during or shortly after the MPI_Irecv.

Here is the full log from a sample run of the program, executed with mpiexec on 11 nodes:

10 Starting to receive chromosomes from processor 0 
Best results are in : best_results_20160217T1902.mat 
Generational chromosomes are in : chromosomes_20160217T1902.mat 
0 Starting the GA..... 
0 Processing generation : 1 
6 Starting to receive chromosomes from processor 0 
9 Starting to receive chromosomes from processor 0 
4 Starting to receive chromosomes from processor 0 
7 Starting to receive chromosomes from processor 0 
5 Starting to receive chromosomes from processor 0 
3 Starting to receive chromosomes from processor 0 
8 Starting to receive chromosomes from processor 0 
2 Starting to receive chromosomes from processor 0 
1 Starting to receive chromosomes from processor 0 
0 Scattering chromosomes to processor: 1 
0 Scattering chromosomes to processor: 2 
0 Scattering chromosomes to processor: 3 
0 Scattering chromosomes to processor: 4 
0 Scattering chromosomes to processor: 5 
0 Scattering chromosomes to processor: 6 
0 Scattering chromosomes to processor: 7 
0 Scattering chromosomes to processor: 8 
0 Scattering chromosomes to processor: 9 
0 Scattering chromosomes to processor: 10 
0 successfully sent off chromosomes for fitness evaluation..... 
0 waiting to hear back from all the worker nodes.... 

=================================================================================== 
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES 
= PID 12223 RUNNING AT 192.168.0.101 
= EXIT CODE: 11 
= CLEANING UP REMAINING PROCESSES 
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES 
=================================================================================== 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[proxy:0:[email protected]] HYD_pmcd_pmip_control_cmd_cb (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip_cb.c:885): assert (!closed) failed 
[proxy:0:[email protected]] HYDT_dmxu_poll_wait_for_event (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/demux/demux_poll.c:76): callback returned error status 
[proxy:0:[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmip.c:206): demux engine error waiting for event 
[[email protected]] HYDT_bscu_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/utils/bscu_wait.c:76): one of the processes terminated badly; aborting 
[[email protected]] HYDT_bsci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/tools/bootstrap/src/bsci_wait.c:23): launcher returned error waiting for completion 
[[email protected]] HYD_pmci_wait_for_completion (/home/odroid/installers/mpich-3.2/src/pm/hydra/pm/pmiserv/pmiserv_pmci.c:218): launcher returned error waiting for completion 
[[email protected]] main (/home/odroid/installers/mpich-3.2/src/pm/hydra/ui/mpich/mpiexec.c:344): process manager error waiting for completion 

I would appreciate any help on this issue; I am really on a final crunch to get this program working for my thesis deadline!

Answers


At least one crucial MPI_Wait is commented out.

for (int i=1;i<world_size;i++) 
    { 
     cout << "0 Scattering chromosomes to processor: " << i << endl; 
     MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],elems_per_proc*chromsize,MPI_DOUBLE,i,i,MPI_COMM_WORLD,&rq_status[i]); 
    } 
    /* 
    for (int i=1;i<world_size;i++) 
    { 
     MPI_Wait(&rq_status[i],&status[i]); 
    } 
    */ 
    cout << "0 successfully sent off chromosomes for fitness evaluation....." << endl; 
    free(poolptr); 

You must not free poolptr, or write to it, before all of the send communication has completed.
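
A minimal sketch of that fix, reusing the question's variables (poolptr, rq_status, status, elems_per_proc, chromsize, world_size): the sends stay non-blocking, but the buffer is only released after every send request has completed, here via a single MPI_Waitall in place of the commented-out MPI_Wait loop.

    for (int i = 1; i < world_size; i++)
    {
        MPI_Isend(&poolptr[(i-1)*elems_per_proc*chromsize],
                  elems_per_proc*chromsize, MPI_DOUBLE,
                  i, i, MPI_COMM_WORLD, &rq_status[i]);
    }
    // Complete all world_size-1 outstanding send requests in one call.
    MPI_Waitall(world_size - 1, &rq_status[1], &status[1]);
    // Only now is it safe to release (or overwrite) the send buffer.
    free(poolptr);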

In general, you are overusing non-blocking communication:

  1. Any non-blocking request that you immediately MPI_Wait on is pointless. Use a blocking call instead (see the sketch after this list).
  2. Use collective communication calls wherever possible, in particular MPI_Scatter/MPI_Gather. As a rule of thumb: use collectives if you want to overlap multiple communications with each other, and use non-blocking communication if you want to overlap communication with computation.
  3. If you want to wait on multiple requests, use MPI_Waitall.
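
To make point 1 concrete, here is a minimal sketch of the worker-side receive using the question's names (row, elems_per_proc, chromsize, world_rank): the MPI_Irecv followed immediately by MPI_Wait collapses into a single blocking MPI_Recv. As an aside, note that the receive buffer must be the pointer value row[world_rank], not its address &row[world_rank] as in the posted code.

    MPI_Status recv_status;
    row[world_rank] = (double*)malloc(sizeof(double) * elems_per_proc * chromsize);
    // Blocking receive: equivalent to the posted MPI_Irecv + immediate MPI_Wait.
    // The buffer argument is the pointer itself (row[world_rank]), not its
    // address (&row[world_rank]), which would overwrite the pointer array.
    MPI_Recv(row[world_rank], elems_per_proc * chromsize, MPI_DOUBLE,
             0, world_rank, MPI_COMM_WORLD, &recv_status);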

For a better discussion, please include a Minimal, Complete, and Verifiable example - and make sure to clean up the commented-out stuff.


Thank you for pointing out my oversight! I ended up using MPI_Scatter/MPI_Gather for my problem instead of the asynchronous communication routines, which were being misused anyway.

I found the examples in this Stack Overflow link very helpful for successfully using MPI_Scatter/MPI_Gather.
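
For reference, a minimal sketch of the kind of MPI_Scatter/MPI_Gather round trip described above, under simplifying assumptions not taken from the actual final code: the pool is flattened row by row into a contiguous buffer, poolsize divides evenly by world_size, and (unlike the posted code) the root rank also evaluates a chunk itself. The function name and parameters are illustrative only.

    #include <mpi.h>
    #include <vector>

    // Sketch: scatter a flattened pool (one row = chromsize doubles) to all
    // ranks, evaluate locally, and gather rows of (chromsize+1) doubles back
    // on the root. The Armadillo conversion and the uneven split handled in
    // the question are omitted for brevity.
    std::vector<double> scatter_evaluate_gather(const std::vector<double>& flat_pool,
                                                int poolsize, int chromsize)
    {
        int world_rank, world_size;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);

        const int rows_per_rank = poolsize / world_size;   // assumes even division
        std::vector<double> local(rows_per_rank * chromsize);
        std::vector<double> local_out(rows_per_rank * (chromsize + 1));
        std::vector<double> gathered;
        if (world_rank == 0)
            gathered.resize(poolsize * (chromsize + 1));

        // Every rank (including the root) receives an equal chunk of the pool;
        // flat_pool is only read on the root and may be empty elsewhere.
        MPI_Scatter(flat_pool.data(), rows_per_rank * chromsize, MPI_DOUBLE,
                    local.data(), rows_per_rank * chromsize, MPI_DOUBLE,
                    0, MPI_COMM_WORLD);

        // ... evaluate fitness for each local row and write into local_out ...

        // Collect the per-rank results on the root, in rank order.
        MPI_Gather(local_out.data(), rows_per_rank * (chromsize + 1), MPI_DOUBLE,
                   gathered.data(), rows_per_rank * (chromsize + 1), MPI_DOUBLE,
                   0, MPI_COMM_WORLD);
        return gathered;   // meaningful only on the root
    }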