2016-02-25

C++ MPI on SLURM

I am running the same Boost.MPI C++ program on my Mac and on a SLURM supercomputing system, both on 1 node with 16 cores. On my Mac I have never seen a segmentation fault, but on the SLURM system I almost always get one when I run on many cores: with only 4 cores the error does not occur, while with 32 cores it seems to happen every single time:

[c558-403.stampede.tacc.utexas.edu:mpi_rank_0][error_sighandler] Caught error: Segmentation fault (signal 11) 

The segmentation fault shows up at a seemingly random point in each run. By writing a line to the screen at every step, I figured out that the seg fault mainly happens at:

 mpi_world.recv(dst_rank, 1,spin_tmp); 

and spin_tmp is:

 map<tuple<int,int,int,int>,int> spin_tmp; 

where the tuple is a boost tuple. I am really struggling and have no idea what on earth is going on...
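Just to show that exchange in isolation, here is a rewritten toy version of the same pattern (not my real code): a worker sends a map back to rank 0, and rank 0 receives it with a blocking recv on tag 1. As an assumption to keep the toy self-contained, the boost tuple key is replaced by a small struct with its own serialize() and operator<; the real program uses boost::tuple as the key and therefore needs serialization support for that type as well.

    // Toy reproducer of the exchange above (NOT my real code).
    // Assumption: the boost::tuple key is replaced by a plain struct with its own
    // serialize() and operator< so this file compiles on its own.
    #include <boost/mpi.hpp>
    #include <boost/serialization/map.hpp>
    #include <iostream>
    #include <map>

    namespace mpi = boost::mpi;

    struct Key {                      // stand-in for tuple<int,int,int,int>
        int a, b, c, d;
        bool operator<(const Key& o) const {
            if (a != o.a) return a < o.a;
            if (b != o.b) return b < o.b;
            if (c != o.c) return c < o.c;
            return d < o.d;
        }
        template <class Archive>
        void serialize(Archive& ar, const unsigned int) {
            ar & a & b & c & d;
        }
    };

    int main() {
        mpi::environment env;
        mpi::communicator world;

        if (world.rank() == 0) {
            // master: blocking receive of the map from every worker on tag 1,
            // the same call pattern that crashes in my program
            std::map<Key, int> spin_tmp;
            for (int src = 1; src < world.size(); ++src) {
                world.recv(src, 1, spin_tmp);
                std::cout << "received " << spin_tmp.size()
                          << " entries from rank " << src << std::endl;
            }
        } else {
            // worker: fill a small map and send it back on tag 1
            std::map<Key, int> spin_tmp;
            Key k;
            k.a = 0; k.b = 0; k.c = 0; k.d = world.rank();
            spin_tmp[k] = 1;
            world.send(0, 1, spin_tmp);
        }
        return 0;
    }

(It should build the usual way with mpic++ and linking against boost_mpi and boost_serialization.)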

Below is a minimal version of the actual code:

main.cpp:

int main (void){ 

    mpi::environment mpi_env; 
    mpi::communicator mpi_world; 
    if (mpi_world.rank()==0) 
    { 
     read_from_file(id, max_sites, J, prec, num_loops, translation_algorithm, 
         basic_exact_mode, pseudo_mode, pseudo_mode_with_proof, 
         verbose, very_verbose, obscenely_verbose,input_PRIM_output_PRIMOUT_mode,limit_dimension,constant,mu,mu_translation_algorithm,mu_constant,work_with_mu,scan_chemical_potential,new_cluster_algorithm,use_new_pair_terms,use_new_triplet_terms, output_more_states,output_states_below_hull,how_much_lower,output_states_how_sparse,use_level_method,use_weighted_dual_average,global_parameters); 

    } 
    else 
    { 
     bool restart_signal=true; 

     periodic_slave(); 
     return 0; 
    } 



     corecode(components, x_range, y_range, z_range, max_sites, num_loops, 
      Ji, 
      lowerboundclustertype, upperboundclustertype, cellrepresentation, 
      lower_bound, upper_bound, unitcell, periodicity, J_for_proof, 
      id, pseudo_mode, pseudo_mode_with_proof, basic_exact_mode, 
      very_verbose, obscenely_verbose,limit_dimension,new_cluster_algorithm,use_new_pair_terms,use_new_triplet_terms,map_periodicity_to_spin, use_level_method, use_weighted_dual_average,global_parameters); 
} 
periodic_function.cpp (the slave side):

void periodic_slave() 
{ 
    mpi::communicator mpi_world; 



    cout<<"\n hello I am slave: "<<mpi_world.rank()<<" received the global parameters"<<endl; 


    bool kill_all_slave=false; 

    broadcast(mpi_world,kill_all_slave,0); 

    while (kill_all_slave==false) { 


     vector<tuple<int,int,int,int,int,int> > periodicity_vector; 

     std::cout << "[SLAVE: " << mpi_world.rank()<< "] I am waiting to receive periodicity vector"<<endl; 

     periodicity_vector.clear(); 

     broadcast(mpi_world,periodicity_vector,0); 

     std::cout << "[SLAVE: " << mpi_world.rank()<< "] I am waiting to J "<<endl; 

//  J.clear(); 

     map<set<tuple<int,int,int,int,int> >, double> J; 


     broadcast(mpi_world,J,0); 

     int x_range,y_range,z_range; 
     double min_bound; 
     map<int,int> component; 

     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" received 3rd set of the global parameters"<<endl; 

     broadcast(mpi_world,x_range,0); 
     broadcast(mpi_world,y_range,0); 

     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" received 4th set of the global parameters"<<endl; 
     broadcast(mpi_world,z_range,0); 
     broadcast(mpi_world,component,0); 
     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" received 5th set of the global parameters"<<endl; 


     broadcast(mpi_world,min_bound,0); 

     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" received 6th set of the global parameters"<<endl; 

     bool stop = false; 
     string status="initial"; 
     mpi_world.send(0, 0, status); 
//   
     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" I am going to send to root status"<<endl; 


     mpi_world.recv(0, 0, stop); 

     cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" I am going to receive stop signal from root"<<endl; 


     while(!stop) { 

      map< tuple<int,int,int,int,int,int>, map<set<tuple<int,int,int,int,int> >, double> > clustertype_periodic; 

      // Wait for new job 

      unsigned int job_id = 0; 

      cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" I am going to receive job id from root"<<endl; 

      mpi_world.recv(0, 0, job_id); 

      cout<< "[SLAVE: " << mpi_world.rank()<<"] hello I am slave: "<<mpi_world.rank()<<" I am going to receive stop min bound from root"<<endl; 


      mpi_world.recv(0, 1, min_bound); 


      std::cout << "[SLAVE: " << mpi_world.rank() 

      << "] Received job " << job_id << " from MASTER.\n"<<endl; 

      // Perform "job" 

      map<tuple<int,int,int,int>,int> spin_tmp; 
      double energy_tmp=0; 

      tuple<int,int,int,int,int,int> periodicity_now; 
      { 
       int i=job_id; 
       periodicity_now=periodicity_vector[i]; 
       int a0=periodicity_now.get<0>(); 
       int a1=periodicity_now.get<1>(); 
       int a2=periodicity_now.get<2>(); 
       int a3=periodicity_now.get<3>(); 
       int a4=periodicity_now.get<4>(); 
       int a5=periodicity_now.get<5>(); 

       { 
        if (pseudo_mode) { 



         periodic(a0, a1, a2, a3, a4, a5, 
           J, x_range, y_range, z_range, component, 
           spin_tmp, 
           energy_tmp, 
           clustertype_periodic, min_bound, id, 
           true, false, false, 
           obscenely_verbose,global_parameters); 

         //     { 
         //      spin_periodic[make_tuple(a0,a1,a2,a3,a4,a5)]=spin_tmp; 
         //      energy_periodic[make_tuple(a0,a1,a2,a3,a4,a5)]=energy_tmp; 
         //     } 
        } 
        else if (basic_exact_mode){ 




         periodic(a0, a1, a2, a3, a4, a5, 
           J, x_range, y_range, z_range, component, 
           spin_tmp, 
           energy_tmp, 
           clustertype_periodic, min_bound, id, 
           false, true, false, 
           obscenely_verbose,global_parameters); 

         //     { 
         //      spin_periodic[make_tuple(a0,a1,a2,a3,a4,a5)]=spin_tmp; 
         //      energy_periodic[make_tuple(a0,a1,a2,a3,a4,a5)]=energy_tmp; 
         //     } 


        } 

       } 
      } 



      // Notify master that the job is done 

      std::cout << "[SLAVE: " << mpi_world.rank() 

      << "] Done with job " << job_id << ". Notifying MASTER.\n"<<endl; 

      status="finish"; 

      mpi_world.send(0, 0,status); 
//   std::cout << "[SLAVE: " << mpi_world.rank()<< "] I have sent status back to master"<<endl; 

      //note here, I deliberated exclude cluster_type_periodic which may be the culprit of a lot of bad computing performance for realistic system 

      //send periodicity, spin_tmp and energy_tmp 

      //  and remember to send more detail; 


      mpi_world.send(0, 1,spin_tmp); 
      std::cout << "[SLAVE: " << mpi_world.rank()<< "] I have sent spin_tmp back to master"<<endl; 
      mpi_world.send(0, 2,energy_tmp); 
      std::cout << "[SLAVE: " << mpi_world.rank()<< "] I have sent energy_tmp back to master"<<endl; 
      mpi_world.send(0, 3,periodicity_now); 
      std::cout << "[SLAVE: " << mpi_world.rank()<< "] I have sent periodicity_now back to master"<<endl; 


      // Check if a new job is coming 

      mpi_world.recv(0, 0, stop); 
      std::cout << "[SLAVE: " << mpi_world.rank()<< "] I have receive stop from master and stop is "<<stop<<endl; 


     } 


     std::cout << "[SLAVE: " << mpi_world.rank()<< "] I am waiting for signal whether kill_all_slave"<<endl; 


     broadcast(mpi_world,kill_all_slave,0); 

     std::cout << "[SLAVE: " << mpi_world.rank()<< "] I receive kill_all_slave signal as "<<kill_all_slave<<endl; 


    } 



    std::cout << "~~~~~~~~ Rank " << mpi_world.rank() << " is exiting ~~~~~~~~~~~\n"<<endl; 




} 

solver.cpp (the MASTER side):

void corecode(map<int, int> component, 
       int x_range, int y_range, int z_range, 
       int max_sites, 
       int loopnumber, 
       map<set<tuple<int,int,int,int,int> >, double> &J, 
       map<set<tuple<int,int,int,int,int> >, double> &lowerboundclustertype, 
       map<set<tuple<int,int,int,int,int> >, double> &upperboundclustertype, 
       map<tuple<int,int,int,int>,int> &cellrepresentation, 
       double &lower_bound, double &upper_bound, 
       map<tuple<int,int,int,int>,int> &unitcell, 
       tuple<int,int,int,int,int,int> &periodicity, 
       map<set<tuple<int,int,int,int,int> >, double> &J_for_proof, 
       std::string id, 
       bool pseudo_mode, 
       bool pseudo_mode_with_proof, 
       bool basic_exact_mode, 
       bool very_verbose, 
       bool obscenely_verbose, 
       double limit_dimension,bool new_cluster_algorithm,int use_new_pair_terms,int use_new_triplet_terms, map<tuple<int,int,int,int,int,int>, map<tuple<int,int,int,int>,int> > &map_periodicity_to_spin, bool use_level_method,bool use_weighted_dual_average ,solver_variable &global_parameters) 
{ 

    mpi::communicator mpi_world; 


    cout<<"\n what is global_parameters.dedicated1D: "<<global_parameters.dedicated1D<<" what is dimension==1:"<<(dimension==1)<<endl; 

    if (!(global_parameters.dedicated1D==true&&dimension==1)) { 


     // Obtain upper bound estimate on the energy 
     if (max_sites > 0){ 

      vector<tuple<int,int,int,int,int,int> > periodicity_vector; 
      periodicity_vector.clear(); 

      bool done=false; 

      cout<<"\n debug 02819374 periodicity_vector.size(): "<<periodicity_vector.size()<<endl; 

      bool kill_all_slave=false; 
      usleep(100000); 
      cout<<"[MASTER] I am broadcasting kill_all_slave"<<endl; 

      broadcast(mpi_world,kill_all_slave,0); 

      usleep(100000); 
      cout<<"[MASTER] I am broadcasting periodicity_vector"<<endl; 

      broadcast(mpi_world,periodicity_vector,0); 

      usleep(1000000); 
      cout<<"[MASTER] I am broadcasting J"<<endl; 

      broadcast(mpi_world,J,0); 

      usleep(1000000); 
      cout<<"[MASTER] I am broadcasting x_range"<<endl; 

      broadcast(mpi_world,x_range,0); 
      usleep(100000); 
      cout<<"[MASTER] I am broadcasting y_range"<<endl; 

      broadcast(mpi_world,y_range,0); 
      usleep(100000); 
      cout<<"[MASTER] I am broadcasting z_range"<<endl; 

      broadcast(mpi_world,z_range,0); 
      usleep(100000); 
      cout<<"[MASTER] I am broadcasting component"<<endl; 

      broadcast(mpi_world,component,0); 
      usleep(100000); 
      cout<<"[MASTER] I am broadcasting minbound"<<endl; 

      broadcast(mpi_world,min_bound,0); 
      usleep(100000); 
      //   lets write something here, for loop starts here: 
      set<int> killed_slave; 
      map<int, string > status_of_slave; 

      // Initialize requests 
      unsigned int job_id = 0; 
      std::vector<mpi::request> reqs(mpi_world.size()); 

      //   cout<<"\n hello I am root: I am going to send to receieve status status"<<endl; 

      // Send initial jobs 
      for (unsigned int dst_rank = 1; dst_rank < mpi_world.size(); ++dst_rank) { 

       // Post receive request for new jobs requests by slave [nonblocking] 
       cout<<"[MASTER] I am i receving status_of_slave: "<<dst_rank<<endl; 

       reqs[dst_rank] = mpi_world.irecv(dst_rank, 0,status_of_slave[dst_rank]); 
       ++job_id; 
      } 

      //   cout<<"\n hello I am root: I finish status request"<<endl; 


      // Send jobs as long as there is job left 
      while(job_id < periodicity_vector.size()) { 
       bool stop; 
       for (unsigned int dst_rank = 1; dst_rank < mpi_world.size(); ++dst_rank) { 
        // Check if dst_rank is done 
        //     cout<<"status_of_slave["<<dst_rank<<"] is "<<status_of_slave[dst_rank] <<endl; 
        if (reqs[dst_rank].test()) { 
         cout<<"[MASTER] I receive reqs[dst_rank].test() is: "<<1<<endl; 
         cout<<"status_of_slave["<<dst_rank<<"] is "<<status_of_slave[dst_rank] <<endl; 

         if(status_of_slave[dst_rank]=="finish") { 
          //note here, I deliberated exclude cluster_type_periodic which may be the culprit of a lot of bad computing performance for realistic system 

          cout<<"[MASTER] I receive finish status "<<endl; 

          //       remember to receive more detail output; 
          //       and update the things you want to update; 
          map<tuple<int,int,int,int>,int> spin_tmp; 
          double energy_tmp=0; 
          mpi_world.recv(dst_rank, 1,spin_tmp); 
          cout<<"[MASTER] I receive spin_tmp status "<<endl; 

          mpi_world.recv(dst_rank, 2,energy_tmp); 
          cout<<"[MASTER] I receive energy_tmp status "<<endl; 

          tuple<int,int,int,int,int,int> periodicity_now; 

          mpi_world.recv(dst_rank, 3,periodicity_now); 
          cout<<"[MASTER] I receive periodicity_now status "<<endl; 




         } 

         std::cout << "[MASTER] Rank " << dst_rank << " is done.\n"<<endl; 
         // Check if there is remaining jobs 
         if (job_id < periodicity_vector.size()) { 
          // Tell the slave that a new job is coming. 
          stop = false; 
          mpi_world.isend(dst_rank, 0, stop); 
          // Send the new job. 
          //       std::cout << "[MASTER] Sending new job (" << job_id 
          //       << ") to SLAVE " << dst_rank << ".\n"<<endl; 
          mpi_world.isend(dst_rank, 0, job_id); 
          mpi_world.isend(dst_rank, 1, min_bound); 

          reqs[dst_rank] = mpi_world.irecv(dst_rank, 0,status_of_slave[dst_rank]); 
          ++job_id; 
         } 
         else { 
          // Send stop message to slave. 
          stop = true; 
          mpi_world.send(dst_rank, 0, stop); 
          killed_slave.insert(dst_rank); 
         } 
        } 
       } 
       usleep(1000); 
      } 
      std::cout << "[MASTER] Sent all jobs.\n"; 

      // Listen for the remaining jobs, and send stop messages on completion. 
      bool all_done = false; 
      while (!all_done) { 
       all_done = true; 
       for (unsigned int dst_rank = 1; dst_rank < mpi_world.size(); ++dst_rank) { 

        //     cout<<"[MASTER] check if Slave "<<dst_rank<<" is killed?"<<endl; 

        if(killed_slave.count(dst_rank)==0) 
        { 
         //      cout<<"[MASTER] Slave "<<dst_rank<<" is not yet killed"<<endl; 
         // 
         //      cout<<"[MASTER] last round; checking status message from slave "<<dst_rank<<endl; 

         if (reqs[dst_rank].test()) { 

          cout<<"[MASTER] last round; checking status message from slave "<<dst_rank<<endl; 
          cout<<"status_of_slave["<<dst_rank<<"] is "<<status_of_slave[dst_rank] <<endl; 


          if(status_of_slave[dst_rank]=="finish") { 
           //note here, I deliberated exclude cluster_type_periodic which may be the culprit of a lot of bad computing performance for realistic system 

           cout<<"[MASTER] I receive finish status from slave "<<dst_rank<<endl; 

           //       remember to receive more detail output; 
           //       and update the things you want to update; 
           map<tuple<int,int,int,int>,int> spin_tmp; 
           double energy_tmp=0; 
           mpi_world.recv(dst_rank, 1,spin_tmp); 
           cout<<"[MASTER] I receive spin_tmp status "<<endl; 

           mpi_world.recv(dst_rank, 2,energy_tmp); 
           cout<<"[MASTER] I receive energy_tmp status "<<endl; 

           tuple<int,int,int,int,int,int> periodicity_now; 

           mpi_world.recv(dst_rank, 3,periodicity_now); 
           cout<<"[MASTER] I receive periodicity_now status "<<endl; 



           spin_periodic[periodicity_now]=spin_tmp; 
           energy_periodic[periodicity_now]=energy_tmp; 

           cout<<"[MASTER] I updated spin_periodic and energy_periodic "<<endl; 


           { 
            double dump1; 
            findminmax(energy_periodic, min_bound, dump1); 
            matchnumber(energy_periodic, min_bound, min_list); 

            if (very_verbose){ 
             cout << "\nUB: " << min_bound << ", LB: " << lowerbound_from_compat << "; Periodicity: "; 
             printvector(min_list); 
             cout << endl; 
            } 

            if (min_bound <= lowerbound_from_compat+1e-5*max_value_of_J) { 
             done=true; 
            } 

           } 

           cout<<"[MASTER] I updated minlist and minbound "<<endl; 


          } 
          // Tell the slave that it can exit. 
          bool stop = true; 
          mpi_world.send(dst_rank, 0, stop); 
          killed_slave.insert(dst_rank); 
          cout<<"[MASTER] I stopped slave "<<dst_rank<<endl; 

         } 
         else { 
          all_done = false; 
         } 
        } 


       } 
       usleep(1000); 
      } 
      std::cout << "[MASTER] Handled all jobs, killed every process.\n"; 
     } 



    } 
} 

There should not be anything wrong with the logic of the code, because it runs completely fine on my Mac with 16 cores. Yet the mysterious segmentation fault appears when I run it on the Stampede supercomputer with as few as 8 cores (sometimes even fewer)... It is very mysterious...
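Since the source is identical on both machines and only the environment differs, the one thing I can easily record is which Boost and which MPI standard level each build is actually compiled against. A tiny sketch of that check (my own addition for diagnosis, not part of the program):

    #include <boost/mpi.hpp>
    #include <boost/version.hpp>
    #include <mpi.h>
    #include <iostream>

    int main() {
        boost::mpi::environment env;
        boost::mpi::communicator world;
        if (world.rank() == 0) {
            // BOOST_LIB_VERSION comes from <boost/version.hpp>;
            // MPI_VERSION / MPI_SUBVERSION are the MPI standard level from <mpi.h>.
            std::cout << "Boost " << BOOST_LIB_VERSION
                      << ", MPI standard " << MPI_VERSION << "." << MPI_SUBVERSION
                      << ", node " << boost::mpi::environment::processor_name()
                      << std::endl;
        }
        return 0;
    }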


Debugging code from the outside based on two lines is impossible. Seriously, install a parallel debugger. Go ahead and use it. You can also post a [Minimal, Complete, and Verifiable example](http://stackoverflow.com/help/mcve) here. – Zulan


I have edited the question and provided a minimal version of the code – user40780

Answer


I think this is basically impossible to debug... I tried the same program at NERSC, running on 240 cores, without any problem...