
Large ID error in a Trilinos MPI environment

I am new to Trilinos. I wrote a program that reads data from a file and constructs a sparse matrix from it. The global IDs can be very large, well beyond the 32-bit integer range. When I change the IDs to smaller values, everything works fine. It also works when I run a single process, i.e. mpiexec -np 1 ./myprogram, but with multiple processes it crashes.

The data looks like this: each line in the file represents one row of the matrix. Within a line, the first value is the row ID and the second value is the number of columns in that row; after that come that many index/value pairs. Example file (small IDs):

2000791 3 2000791 0.5 1000791 0.5 3000791 1.0 
    1000791 2 1000791 0.5 2000791 0.5 
    3000791 2 3000791 0.5 1000791 0.5 
    3000792 2 3000791 0.5 1000791 0.5 

Example file (large IDs):

2000000000000791 3 2000000000000791 0.5 1000000000000791 0.5 3000000000000791 1.0 
    1000000000000791 2 1000000000000791 0.5 2000000000000791 0.5 
    3000000000000791 2 3000000000000791 0.5 1000000000000791 0.5 
    3000000000000792 2 3000000000000791 0.5 1000000000000791 0.5 
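
Just to make the scale concrete, here is a tiny standalone check (my addition, not part of the program below) showing that these large IDs overflow a 32-bit integer and therefore need 64-bit global ordinals:

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const long long big_gid = 2000000000000791LL;                  // from the example file above
    const int32_t int32_max = std::numeric_limits<int32_t>::max(); // 2147483647
    // The large IDs are roughly a million times bigger than INT32_MAX.
    std::printf("GID %lld > INT32_MAX (%d)? %s\n",
                big_gid, int32_max, big_gid > int32_max ? "yes" : "no");
    return 0;
}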

From the gdb output and the Trilinos source code, the error seems to be raised by Epetra_BlockMap::ConstructAutoUniform: Error. Not enough space for elements on each processor.

The debug messages and the source code of my program are attached below.

#0 0x00007ffff58b55c9 in raise() from /lib64/libc.so.6 
#1 0x00007ffff58b6cd8 in abort() from /lib64/libc.so.6 
#2 0x00007ffff61b99d5 in __gnu_cxx::__verbose_terminate_handler()() 
    from /lib64/libstdc++.so.6 
#3 0x00007ffff61b7946 in ??() from /lib64/libstdc++.so.6 
#4 0x00007ffff61b7973 in std::terminate()() from /lib64/libstdc++.so.6 
#5 0x00007ffff61b7b9f in __cxa_throw() from /lib64/libstdc++.so.6 
#6 0x00000000004c6d2a in Epetra_BlockMap::ConstructAutoUniform (
    this=this@entry=0x85cf00, 
    NumGlobal_Elements=NumGlobal_Elements@entry=2000000000000002, 
    Element_Size=Element_Size@entry=1, 
    Index_Base=Index_Base@entry=1000000000000791, comm=..., 
    IsLongLong=IsLongLong@entry=true) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:81 
#7 0x00000000004c708e in Epetra_BlockMap::Epetra_BlockMap (this=0x85cf00, 
    NumGlobal_Elements=2000000000000002, Element_Size=1, 
    Index_Base=1000000000000791, comm=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:124 
#8 0x0000000000497de9 in Epetra_Map::Epetra_Map (this=0x85cf00, 
    numGlobalElements=<optimized out>, indexBase=<optimized out>, comm=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_Map.cpp:68 
#9 0x00000000004c008f in Epetra_BasicDirectory::Generate<long long> (
    this=0x85cea0, Map=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:276 
#10 0x00000000004bf4c5 in Epetra_BasicDirectory::Epetra_BasicDirectory (
    this=0x85cea0, Map=..., __in_chrg=<optimized out>, 
    __vtt_parm=<optimized out>) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:121 
#11 0x00000000004b1ea1 in Epetra_MpiComm::CreateDirectory (
    this=<optimized out>, map=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_MpiComm.cpp:243 
#12 0x00000000004c5fcc in Epetra_BlockMap::RemoteIDList (
    this=this@entry=0x7fffffffddd0, NumIDs=NumIDs@entry=0, GIDList=0x0, 
    PIDList=0x0, LIDList=LIDList@entry=0x0, SizeList=0x0) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:1336 
#13 0x00000000004d741b in Epetra_CrsGraph::MakeColMap_LL (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1664 
#14 0x00000000004d81c9 in Epetra_CrsGraph::MakeColMap (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1764 
#15 0x00000000004d83e7 in Epetra_CrsGraph::MakeIndicesLocal (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1784 
#16 0x0000000000462bcb in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, domain_map=..., range_map=..., 
    OptimizeDataStorage=OptimizeDataStorage@entry=true) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1159 
#17 0x0000000000462e81 in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, 
    OptimizeDataStorage=OptimizeDataStorage@entry=true) 
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1142 
#18 0x000000000045a02c in read_and_construct (part=2, total_parts=4) 
    at /home/myusername/mpi/myprogram/main.cpp:99 
#19 0x0000000000458905 in main (argc=1, argv=0x7fffffffdfe8) 

Source code of the program:

#include <Epetra_config.h> 

#ifdef HAVE_MPI 
# include <mpi.h> 
# include <Epetra_MpiComm.h> 
#else 
# include <Epetra_SerialComm.h> 
#endif // HAVE_MPI 

#include <Epetra_Map.h> 
#include <Epetra_CrsMatrix.h> 
#include <Epetra_Vector.h> 
#include <Epetra_Version.h> 
#include <Epetra_DistObject.h> 
#include <Epetra_Export.h> 
#include <Epetra_Util.h> 
#include <unistd.h> 

#include <stdexcept> 


//std libs 
#include <cstdio> 
#include <vector> 


using namespace std; 

typedef long long global_ordinal_type; 



int pid; 
int np; 

const char *path = "/home/tianxiaochen01/matrix_small.txt"; // input matrix file
typedef long long LL; 
typedef long long * T_LLP; 


#ifdef HAVE_MPI 
    Epetra_MpiComm * comm; 
#else 
    Epetra_SerialComm* comm; 
#endif 

// C style 
void read_and_construct(int part,int total_parts){ 
    FILE * matrixfile; 
    matrixfile = fopen(path,"r"); 
    int len = 0; 
    long long src; 
    vector<T_LLP> arrdst; 
    vector< double * > arrvalue; 
    vector<LL> myids; 
    vector<int> lens; 

    while (fscanf(matrixfile,"%lld %d ",&src,&len) != EOF){ 
     T_LLP dsts = new LL [ len ]; 
     double * values = new double [ len ]; 
     long long dst; 
     double value; 
     for (int i=0;i<len;i++){ 
      fscanf(matrixfile,"%lld %lf",&dst,&value); 
      dsts[i] = dst; 
      values[i] = value; 
     } 
     if (src % (LL)total_parts == (LL)part ) {//is my part 
      myids.push_back(src); 
      lens.push_back(len); 
      arrdst.push_back(dsts); 
      arrvalue.push_back(values); 
     } 
     else { 
      delete [] dsts; 
      delete [] values; 
     } 
    } 

    fclose(matrixfile); 

    T_LLP arrmap = new LL [ myids.size() ]; 
    for (int i=0;i<myids.size();i++){ 
     arrmap[i] = myids[i]; 
    } 
    Epetra_Map map((LL)-1, (int)myids.size(), arrmap ,(LL)0, *comm); 

    Epetra_Vector v1(map); 
    int avg_col_size = 1000; 


    Epetra_CrsMatrix M(Copy,map,avg_col_size); 
    //insert values into matrix 
    for (int i=0;i<myids.size();i++){ 
     // (long long GlobalRow, int NumEntries, const double *Values, const long long *Indices) 
     int e = M.InsertGlobalValues(myids[i],lens[i],arrvalue[i],arrdst[i]); 
    } 

    try 
    { 
     M.FillComplete(); 
    } catch (const Epetra_Object& ex) { 
     cout<<"ERROR"<<endl; 
     cout<<ex<<endl; 
    } 
    cout<<M<<endl; 

} 

void init(const Epetra_Comm& comm){ 
    pid = comm.MyPID(); 
    np = comm.NumProc(); 

} 


int 
main (int argc, char *argv[]) 
{ 
    using std::cout; 
    using std::endl; 

#ifdef HAVE_MPI 
    MPI_Init (&argc, &argv); 
    comm = new Epetra_MpiComm (MPI_COMM_WORLD); 
    init(*comm); 
#else 
    comm = new Epetra_SerialComm; 
    pid = 0; 
    np = 1; 
#endif // HAVE_MPI 

    read_and_construct(pid,np); 

#ifdef HAVE_MPI 
    (void) MPI_Finalize(); 
#endif // HAVE_MPI 

    return 0; 
} 

Trilinos version: 12.0. MPI: MPICH.

Answer


This sounds like two things. First, an MPI_INT is a plain C int, which is 32 bits on just about every platform, so if such giant identifiers are allowed, Trilinos has to send them with the newer MPI_INT64_T type (MPI-2, section 10.2.5).
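
A minimal sketch of that point (my own example, not code from the question, assuming an MPI implementation that provides MPI_INT64_T):

#include <mpi.h>
#include <cstdint>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // One of the large global IDs from the example file.
    int64_t gid = (rank == 0) ? 2000000000000791LL : 0;

    // MPI_INT64_T carries the full 64-bit value; describing the same buffer
    // as MPI_INT would be wrong, since a C int cannot hold this ID.
    MPI_Bcast(&gid, 1, MPI_INT64_T, 0, MPI_COMM_WORLD);

    std::printf("rank %d has gid %lld\n", rank, (long long)gid);
    MPI_Finalize();
    return 0;
}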

Second, maybe Trilinos is fine when it sends this information, but the fact that you see "Error. Not enough space for elements on each processor" suggests that Trilinos is allocating a dense array to hold these larger values and your process is running out of memory. The single-process case may work because there are no intermediate nodes to worry about.
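
For what it's worth, the numbers in frame #6 of the backtrace line up with the span of the GIDs in the example file, which is consistent with a map or array being sized over the whole ID range. A small sketch of that arithmetic (my own reading of the backtrace, not verified against the Epetra sources):

#include <climits>
#include <cstdio>

int main() {
    const long long min_gid = 1000000000000791LL;  // smallest GID in the example file
    const long long max_gid = 3000000000000792LL;  // largest GID in the example file

    // max_gid - min_gid + 1 == 2000000000000002, exactly the NumGlobal_Elements
    // passed to ConstructAutoUniform in frame #6 (and min_gid is its Index_Base).
    const long long span = max_gid - min_gid + 1;
    std::printf("GID span = %lld\n", span);

    // A span of ~2e15 elements cannot be split into per-process counts that fit
    // in an int (INT_MAX is about 2.1e9), let alone stored densely in memory.
    std::printf("span / INT_MAX = %lld\n", span / (long long)INT_MAX);
    return 0;
}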