Large IDs error in Trilinos MPI environment

I am new to Trilinos. I wrote a program that reads data from a file and constructs a sparse matrix from it. The global IDs can be very large, beyond the 32-bit integer range. When I change the IDs to smaller values, everything works. It also runs fine with a single process, i.e. mpiexec -np 1 ./myprogram, but it crashes with multiple processes.
The data looks like this: each line of the file represents one row of the matrix. Within a line, the first value is the row ID and the second is the number of entries in that row; after that come that many (column ID, value) pairs. A small parsing sketch follows the two example files below.
Example file (small IDs):
2000791 3 2000791 0.5 1000791 0.5 3000791 1.0
1000791 2 1000791 0.5 2000791 0.5
3000791 2 3000791 0.5 1000791 0.5
3000792 2 3000791 0.5 1000791 0.5
Example file (large IDs):
2000000000000791 3 2000000000000791 0.5 1000000000000791 0.5 3000000000000791 1.0
1000000000000791 2 1000000000000791 0.5 2000000000000791 0.5
3000000000000791 2 3000000000000791 0.5 1000000000000791 0.5
3000000000000792 2 3000000000000791 0.5 1000000000000791 0.5
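For reference, a minimal sketch of parsing one such row (names like FileRow and read_row are illustrative, not part of my program below; the logic mirrors the fscanf loop used there):

#include <cstdio>
#include <vector>

// One file line: "rowID n  col1 v1  col2 v2 ... coln vn".
struct FileRow {
  long long id;                // global row ID; may exceed the 32-bit range
  std::vector<long long> cols; // global column IDs
  std::vector<double> vals;    // matrix entries
};

// Reads one row from f; returns false on EOF or malformed input.
bool read_row(std::FILE* f, FileRow& row) {
  int n = 0;
  if (std::fscanf(f, "%lld %d", &row.id, &n) != 2)
    return false;
  row.cols.resize(n);
  row.vals.resize(n);
  for (int i = 0; i < n; ++i)
    if (std::fscanf(f, "%lld %lf", &row.cols[i], &row.vals[i]) != 2)
      return false;
  return true;
}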
From the gdb output and the Trilinos source code, the error appears to be raised by Epetra_BlockMap::ConstructAutoUniform: "Error. Not enough space for elements on each processor".
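Frame #6 of the backtrace below may show why. Epetra_BasicDirectory (frame #9) seems to build a uniform directory map spanning the entire global-ID range: Index_Base equals the smallest GID in the large-ID file (1000000000000791), and NumGlobal_Elements equals the span 3000000000000792 - 1000000000000791 + 1 = 2000000000000002. Splitting that many elements uniformly across processes gives per-process counts far above INT_MAX, which is my reading of the check at Epetra_BlockMap.cpp:81 (frame #6). That would also explain why a single-process run succeeds: with one process there are no remote column IDs to look up, so the directory map is never constructed. A small arithmetic sketch (the GID values are taken from the backtrace; the per-process interpretation is my assumption):

#include <cstdio>
#include <climits>

int main() {
  const long long minGID = 1000000000000791LL; // smallest GID in the large-ID file
  const long long maxGID = 3000000000000792LL; // largest GID in the large-ID file
  const long long span = maxGID - minGID + 1;  // 2000000000000002, as in frame #6
  for (int np = 1; np <= 4; np *= 2) {
    long long perProc = span / np; // elements each rank would own in a uniform map
    std::printf("np=%d: %lld elements per process (INT_MAX = %d)\n",
                np, perProc, INT_MAX);
  }
  // Even at np=4 the per-process count (~5e14) dwarfs INT_MAX (~2.1e9),
  // so the local element count cannot be represented as an int.
  return 0;
}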
The debug output and the source code of my program are attached below.
#0 0x00007ffff58b55c9 in raise() from /lib64/libc.so.6
#1 0x00007ffff58b6cd8 in abort() from /lib64/libc.so.6
#2 0x00007ffff61b99d5 in __gnu_cxx::__verbose_terminate_handler()()
from /lib64/libstdc++.so.6
#3 0x00007ffff61b7946 in ??() from /lib64/libstdc++.so.6
#4 0x00007ffff61b7973 in std::terminate()() from /lib64/libstdc++.so.6
#5 0x00007ffff61b7b9f in __cxa_throw() from /lib64/libstdc++.so.6
#6 0x00000000004c6d2a in Epetra_BlockMap::ConstructAutoUniform (
[email protected]=0x85cf00,
[email protected]=2000000000000002,
[email protected]=1,
[email protected]=1000000000000791, comm=...,
[email protected]=true)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:81
#7 0x00000000004c708e in Epetra_BlockMap::Epetra_BlockMap (this=0x85cf00,
NumGlobal_Elements=2000000000000002, Element_Size=1,
Index_Base=1000000000000791, comm=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:124
#8 0x0000000000497de9 in Epetra_Map::Epetra_Map (this=0x85cf00,
numGlobalElements=<optimized out>, indexBase=<optimized out>, comm=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_Map.cpp:68
#9 0x00000000004c008f in Epetra_BasicDirectory::Generate<long long> (
this=0x85cea0, Map=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:276
#10 0x00000000004bf4c5 in Epetra_BasicDirectory::Epetra_BasicDirectory (
this=0x85cea0, Map=..., __in_chrg=<optimized out>,
__vtt_parm=<optimized out>)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:121
#11 0x00000000004b1ea1 in Epetra_MpiComm::CreateDirectory (
this=<optimized out>, map=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_MpiComm.cpp:243
#12 0x00000000004c5fcc in Epetra_BlockMap::RemoteIDList (
[email protected]=0x7fffffffddd0, [email protected]=0, GIDList=0x0,
PIDList=0x0, [email protected]=0x0, SizeList=0x0)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:1336
#13 0x00000000004d741b in Epetra_CrsGraph::MakeColMap_LL (
[email protected]=0x7fffffffddc0, domainMap=..., rangeMap=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1664
#14 0x00000000004d81c9 in Epetra_CrsGraph::MakeColMap (
[email protected]=0x7fffffffddc0, domainMap=..., rangeMap=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1764
#15 0x00000000004d83e7 in Epetra_CrsGraph::MakeIndicesLocal (
[email protected]=0x7fffffffddc0, domainMap=..., rangeMap=...)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1784
#16 0x0000000000462bcb in Epetra_CrsMatrix::FillComplete (
[email protected]=0x7fffffffdd50, domain_map=..., range_map=...,
[email protected]=true)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1159
#17 0x0000000000462e81 in Epetra_CrsMatrix::FillComplete (
[email protected]=0x7fffffffdd50,
[email protected]=true)
at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1142
#18 0x000000000045a02c in read_and_construct (part=2, total_parts=4)
at /home/myusername/mpi/myprogram/main.cpp:99
#19 0x0000000000458905 in main (argc=1, argv=0x7fffffffdfe8)
Source code of the program:
#include <Epetra_config.h>
#ifdef HAVE_MPI
#  include <mpi.h>
#  include <Epetra_MpiComm.h>
#else
#  include <Epetra_SerialComm.h>
#endif // HAVE_MPI
#include <Epetra_Map.h>
#include <Epetra_CrsMatrix.h>
#include <Epetra_Vector.h>
#include <Epetra_Version.h>
#include <Epetra_DistObject.h>
#include <Epetra_Export.h>
#include <Epetra_Util.h>
#include <unistd.h>
#include <stdexcept>
// std libs
#include <cstdio>
#include <iostream>
#include <vector>

using namespace std;

typedef long long global_ordinal_type;

int pid;
int np;
const char *path = "/home/tianxiaochen01/matrix_small.txt";

typedef long long LL;
typedef long long * T_LLP;

#ifdef HAVE_MPI
Epetra_MpiComm * comm;
#else
Epetra_SerialComm * comm;
#endif

// C style
void read_and_construct(int part, int total_parts) {
  FILE * matrixfile = fopen(path, "r");
  int len = 0;
  long long src;
  vector<T_LLP> arrdst;      // column indices per owned row
  vector<double *> arrvalue; // values per owned row
  vector<LL> myids;          // global IDs of the rows owned by this rank
  vector<int> lens;          // number of entries per owned row

  // Each file line: row ID, entry count, then (column ID, value) pairs.
  while (fscanf(matrixfile, "%lld %d ", &src, &len) != EOF) {
    T_LLP dsts = new LL[len];
    double * values = new double[len];
    long long dst;
    double value;
    for (int i = 0; i < len; i++) {
      fscanf(matrixfile, "%lld %lf", &dst, &value);
      dsts[i] = dst;
      values[i] = value;
    }
    if (src % (LL)total_parts == (LL)part) { // this rank owns the row
      myids.push_back(src);
      lens.push_back(len);
      arrdst.push_back(dsts);
      arrvalue.push_back(values);
    } else {
      delete [] dsts;
      delete [] values;
    }
  }
  fclose(matrixfile);

  T_LLP arrmap = new LL[myids.size()];
  for (size_t i = 0; i < myids.size(); i++) {
    arrmap[i] = myids[i];
  }
  // Map with 64-bit global indices; -1 lets Epetra compute the global count.
  Epetra_Map map((LL)-1, (int)myids.size(), arrmap, (LL)0, *comm);
  delete [] arrmap; // Epetra_Map copies the GID list

  Epetra_Vector v1(map);
  int avg_col_size = 1000;
  Epetra_CrsMatrix M(Copy, map, avg_col_size);

  // Insert values into the matrix:
  // InsertGlobalValues(long long GlobalRow, int NumEntries,
  //                    const double *Values, const long long *Indices)
  for (size_t i = 0; i < myids.size(); i++) {
    M.InsertGlobalValues(myids[i], lens[i], arrvalue[i], arrdst[i]);
  }
  try {
    M.FillComplete();
  } catch (const Epetra_Object& ex) {
    // Note: Epetra reports some errors by throwing int error codes, which
    // this handler does not catch (hence std::terminate in the backtrace).
    cout << "ERROR" << endl;
    cout << ex << endl;
  }
  cout << M << endl;

  for (size_t i = 0; i < arrdst.size(); i++) {
    delete [] arrdst[i];
    delete [] arrvalue[i];
  }
}

void init(const Epetra_Comm& comm) {
  pid = comm.MyPID();
  np = comm.NumProc();
}

int main(int argc, char *argv[]) {
  using std::cout;
  using std::endl;
#ifdef HAVE_MPI
  MPI_Init(&argc, &argv);
  comm = new Epetra_MpiComm(MPI_COMM_WORLD);
  init(*comm);
#else
  comm = new Epetra_SerialComm;
  pid = 0;
  np = 1;
#endif // HAVE_MPI
  read_and_construct(pid, np);
  delete comm;
#ifdef HAVE_MPI
  (void) MPI_Finalize();
#endif // HAVE_MPI
  return 0;
}
Trilinos version: 12.0. MPI: MPICH.