Matrix multiplication error with Open MPI

I am trying to compute an NxN matrix product using Open MPI and C. Everything works as expected except for MPI_Bcast(). As I understand it, the MASTER has to broadcast matrix_2 to the rest of the WORKER processes, and when the workers reach MPI_Bcast() they should wait there until the designated root (MASTER in this case) performs the broadcast.
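For reference, the pattern I think I am following is the one in this minimal standalone sketch (a toy program I wrote to check my understanding, not part of my actual code): every rank calls MPI_Bcast() on a buffer it allocated itself, and after the call every rank holds the root's data.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]){
    int rank, n = 4;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int *buf = malloc(sizeof(int) * n);   /* every rank allocates its own buffer */
    if(rank == 0){                        /* only the root fills it in */
        for(int i = 0; i < n; i++) buf[i] = i + 1;
    }

    /* All ranks call MPI_Bcast with the same count, datatype and root;
       afterwards every rank holds the root's data. */
    MPI_Bcast(buf, n, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d: buf[0] = %d\n", rank, buf[0]);
    free(buf);
    MPI_Finalize();
    return 0;
}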
The error I get is a segmentation fault with "Address not mapped", so it almost certainly has something to do with the dynamic allocation of the matrices. What I do is send a block of rows of matrix_1 to each process, and each of them then multiplies its block by the previously broadcast matrix_2 and accumulates its part of the result.
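To make the splitting concrete, this is the row-range arithmetic I intend, isolated into a plain sketch with no MPI (the values are just an example): each of the numProc - 1 workers gets size / (numProc - 1) rows, and the last worker also takes the remainder.

#include <stdio.h>

int main(void){
    int size = 10, numProc = 4;              /* example values */
    int numRows = size / (numProc - 1);      /* rows per worker */
    for(int i = 1; i < numProc; i++){
        int from = (i - 1) * numRows;
        int to = (i + 1 == numProc && size % (numProc - 1) != 0)
                     ? size                  /* last worker absorbs the remainder */
                     : from + numRows;
        printf("worker %d: rows [%d, %d)\n", i, from, to);
    }
    return 0;
}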
I know the problem must be in MPI_Bcast(), because when I comment it out the program finishes cleanly (although, obviously, without computing the product). There must be something I am not seeing. The code and the error message I get are below. Thanks in advance.
CODE
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* MACROS */
#define MASTER_TO_SLAVE_TAG 1
#define SLAVE_TO_MASTER_TAG 4
#define MASTER 0
#define WORKER 1
int *matrix_1;
int *matrix_2;
int *result;
double start_time;
double end_time;
int procID;
int numProc;
int size, numRows, from, to;
int i,j,k;
MPI_Status status;
MPI_Request request;
void addressMatrixMemory(int);
int main(int argc, char *argv[]){
    size = atoi(argv[1]);

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &procID);
    MPI_Comm_size(MPI_COMM_WORLD, &numProc);

    addressMatrixMemory(size);

    /* MASTER starts. */
    if(procID == MASTER){
        start_time = MPI_Wtime();
        for(i = 1; i < numProc; i++){
            numRows = size/(numProc - 1);
            from = (i - 1) * numRows;
            if(((i + 1) == numProc) && ((size % (numProc - 1))) != 0){
                to = size;
            } else {
                to = from + numRows;
            }
            MPI_Isend(&from, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &request);
            MPI_Isend(&to, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request);
            MPI_Isend(matrix_1, (to - from) * size, MPI_INT, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request);
        }
    }

    MPI_Bcast(&matrix_2, size * size, MPI_INT, MASTER, MPI_COMM_WORLD);

    /* WORKERS task */
    if(procID >= WORKER){
        int row, col;
        int *matrix = malloc(sizeof(matrix_1[0])*size*size);

        MPI_Recv(&from, 1, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status);
        MPI_Recv(&to, 1, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status);
        MPI_Recv(matrix, (to - from) * size, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status);

        for(row = from; row < to; row++){
            for(col = 0; col < size; col++){
                result[row * size + col] = 0;
                for(k = 0; k < size; k++);
                result[row * size + col] += matrix[row * size + k] * matrix_2[k * size + col];
            }
        }

        MPI_Isend(&from, 1, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &request);
        MPI_Isend(&to, 1, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request);
        MPI_Isend(&result[from], (to - from) * size, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &request);
    }

    /* MASTER gathers WORKERS job. */
    if(procID == MASTER){
        for(i = 1; i < numProc; i++){
            MPI_Recv(&from, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status);
            MPI_Recv(&to, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status);
            MPI_Recv(&result[from], (to - from) * size, MPI_INT, i, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &status);
        }
        end_time = MPI_Wtime();
        printf("\nRunning Time = %f\n\n", end_time - start_time);
    }

    MPI_Finalize();

    free(matrix_1);
    free(matrix_2);
    free(result);

    return EXIT_SUCCESS;
}
void addressMatrixMemory(int n){
    matrix_1 = malloc(sizeof(matrix_1[0])*n*n);
    matrix_2 = malloc(sizeof(matrix_2[0])*n*n);
    result = malloc(sizeof(result[0])*n*n);

    /* Matrix init with values between 1 and 100. */
    srand(time(NULL));
    int r = rand() % 100 + 1;
    int i;
    for(i = 0; i < n*n; i++){
        matrix_1[i] = r;
        r = rand() % 100 + 1;
        matrix_2[i] = r;
        r = rand() % 100 + 1;
    }
}
ERROR MESSAGE
[tuliansPC:28270] *** Process received signal ***
[tuliansPC:28270] Signal: Segmentation fault (11)
[tuliansPC:28270] Signal code: Address not mapped (1)
[tuliansPC:28270] Failing at address: 0x603680
[tuliansPC:28270] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x10340) [0x7f0a98ce0340]
[tuliansPC:28270] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x97ffe) [0x7f0a9899fffe]
[tuliansPC:28270] [ 2] /usr/lib/libmpi.so.1(opal_convertor_pack+0x129) [0x7f0a98fef779]
[tuliansPC:28270] [ 3] /usr/lib/openmpi/lib/openmpi/mca_btl_sm.so(mca_btl_sm_prepare_src+0x1fd) [0x7f0a923c385d]
[tuliansPC:28270] [ 4] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_rndv+0x1dc) [0x7f0a93245c9c]
[tuliansPC:28270] [ 5] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_isend+0x8ec) [0x7f0a9323856c]
[tuliansPC:28270] [ 6] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_generic+0x3fc) [0x7f0a914f49fc]
[tuliansPC:28270] [ 7] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_pipeline+0xbc) [0x7f0a914f4d5c]
[tuliansPC:28270] [ 8] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_dec_fixed+0x134) [0x7f0a914ec7a4]
[tuliansPC:28270] [ 9] /usr/lib/openmpi/lib/openmpi/mca_coll_sync.so(mca_coll_sync_bcast+0x64) [0x7f0a917096a4]
[tuliansPC:28270] [10] /usr/lib/libmpi.so.1(MPI_Bcast+0x13d) [0x7f0a98f5678d]
[tuliansPC:28270] [11] ej5Exec() [0x400e8c]
[tuliansPC:28270] [12] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5) [0x7f0a98929ec5]
[tuliansPC:28270] [13] ej5Exec() [0x400ac9]
[tuliansPC:28270] *** End of error message ***
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 28270 on node tuliansPC exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
I tried adding an MPI_Wait() after each MPI_Isend(), but I get the same error. – tulians
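Roughly, the MPI_Wait() variant I tried follows the pattern in this standalone sketch (reconstructed to show the idea, not the exact code from the question); run it with at least two ranks.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]){
    int rank, value = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if(rank == 0){
        value = 42;
        MPI_Request request;
        MPI_Isend(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD, &request);
        MPI_Wait(&request, MPI_STATUS_IGNORE);   /* block until the send buffer can be reused */
    } else if(rank == 1){
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("rank 1 received %d\n", value);
    }

    MPI_Finalize();
    return 0;
}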