我正在寫使用MPI C++結合(MPICH 2)MPI PROGRAMM。在調用MPI::COMM::Create
方法(所有正在運行的進程終止)後,在下面列出的特定數量的調用函數之後,我立即發生死鎖。
LOCAL_COMM = LOCAL_COMM.Create(localGroup)
在代碼中,需要更具體。
我的猜測是出現了某種溢出,但我是MPI編程的新手,無法找到問題的根源。預先感謝您的幫助。
代碼:
Matrix Matrix::operator* (Matrix &M)
{
//Synchronize all threads
//COMM_WORLD.Barrier();
int hisCol = M.myCol;
int commSize = COMM_WORLD.Get_size();
//Create local Group and Communicator equal to COMM_WORLD
Intracomm LOCAL_COMM = COMM_WORLD.Dup();
Group localGroup(LOCAL_COMM.Get_group());
//Change locals if required
if(commSize > myRow) {
//Make array of ranks to include
vector<int> masRanks(myRow);
for(int i = 0; i < myRow; i++)
masRanks[i] = i;
//Create new localGroup and LOCAL_COMM
localGroup = localGroup.Incl(myRow,&masRanks[0]);
LOCAL_COMM = LOCAL_COMM.Create(localGroup);
if(LOCAL_COMM == COMM_NULL) {
localGroup.Free();
return Matrix(myRow,hisCol);
}
commSize = LOCAL_COMM.Get_size();
}
//Calculate number of Rows displacement and amount to Send per Thread
vector<int> masRows(commSize,0);
vector<int> amntToSend(commSize,0), amntToRecv(commSize,0);
int tmpRows = myRow;
for(int i = commSize; i > 0; i--) {
masRows[i-1] = ((double)tmpRows/i + 0.5);
tmpRows -= masRows[i-1];
amntToSend[i-1] = masRows[i-1] * myCol;
amntToRecv[i-1] = masRows[i-1] * hisCol;
}
//Form array specifying displacement to send to each thread
vector<int> elemDisp(1,0), elemRecvDisp(1,0);
if(myRank == 0)
for(int i = 1; i < commSize; i++) {
elemDisp.push_back(amntToSend[i-1] + elemDisp[i-1]);
elemRecvDisp.push_back(amntToRecv[i-1] + elemRecvDisp[i-1]);
}
//Prepare variables and send/receive left and right matrix
int maxRow = *max_element(masRows.begin(),masRows.end());
Matr tmpData(maxRow*myCol,0);
LOCAL_COMM.Scatterv(&data[0],&amntToSend[0],&elemDisp[0],DOUBLE,&tmpData[0],masRows[myRank]*myCol,DOUBLE,0);
LOCAL_COMM.Bcast(&M.data[0],M.myRow*hisCol,DOUBLE,0);
//Start evaluation of each element
Matr resData(masRows[myRank]*hisCol,0);
for(int row = 0; row < masRows[myRank]; row++)
for(int i = 0; i < hisCol; i++)
for(int j = 0; j < myCol; j++)
resData[row*hisCol + i] += tmpData[row*myCol + j] * M.data[j*hisCol + i];
//Gather all data at 0-thread
Matr outData(myRow*hisCol);
LOCAL_COMM.Gatherv(&resData[0],masRows[myRank]*hisCol,DOUBLE,&outData[0],&amntToRecv[0],&elemRecvDisp[0],DOUBLE,0);
//Return result Matrix-object
Matrix resMatr(myRow, hisCol);
resMatr.data = outData;
//Free group and communicator
localGroup.Free();
LOCAL_COMM.Free();
return resMatr;
}
什麼是 「MATR」,而這是什麼構造呢? 'Matr tmpData(maxRow * myCol,0);'。這可能是一個問題,取決於0意味着什麼。我建議你使用valgrind運行這個應用程序,你將能夠發現緩衝區溢出,並且另一個內存容易以這種方式出現問題。 – mfontanini 2012-03-19 18:44:14
typedef vector Matr; –
Bonsav
2012-03-19 18:48:37
'myRow'從哪裏來?你能確定所有的MPI過程爲'myRow'共享相同的值嗎?否則,它們不會構建相同的本地組,並且對'MPI_Comm_create'的調用將失敗。 – Francesco 2012-03-19 20:04:46