Matrix multiplication error with Open MPI

I am trying to compute an NxN matrix multiplication using Open MPI and C. Everything runs as expected except for MPI_Bcast(). As I understand it, MASTER must broadcast matrix_2 to the rest of the WORKER processes, and when the workers reach MPI_Bcast() they should wait there until the selected root (MASTER in this case) performs the broadcast.
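To make sure my mental model is right, this is the broadcast pattern I believe I am following, shown as a minimal standalone sketch (the buffer name and size here are only illustrative, not taken from my program): every rank calls MPI_Bcast with the same count and root, passing the heap-allocated buffer itself.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]){
    int rank, i;
    int n = 4;            /* illustrative size only */
    int *data;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Every rank allocates the buffer; only the root fills it. */
    data = malloc(sizeof(int) * n * n);
    if(rank == 0){
        for(i = 0; i < n * n; i++) data[i] = i;
    }

    /* All ranks make the same call, passing the buffer pointer itself. */
    MPI_Bcast(data, n * n, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d got data[5] = %d\n", rank, data[5]);

    free(data);
    MPI_Finalize();
    return EXIT_SUCCESS;
}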

The error I get is a segmentation fault, "address not mapped", so it surely has something to do with the dynamic allocation of the matrices. What I do is send a portion of matrix_1 to each process, and each of them then performs its part of the multiplication against the previously broadcast matrix_2 and accumulates the result.

I know the error must be in MPI_Bcast(), because when I comment it out the program finishes correctly (although obviously without computing the product). There must be something I am not seeing. I am leaving the code and the error message I get below. Thanks in advance.

CODE

#include <mpi.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <time.h> 
/* MACROS */ 
#define MASTER_TO_SLAVE_TAG 1 
#define SLAVE_TO_MASTER_TAG 4 
#define MASTER 0 
#define WORKER 1 

int *matrix_1; 
int *matrix_2; 

int *result; 

double start_time; 
double end_time; 

int procID; 
int numProc; 

int size, numRows, from, to; 
int i,j,k; 

MPI_Status status; 
MPI_Request request; 

void addressMatrixMemory(int); 

int main(int argc, char *argv[]){ 
    size = atoi(argv[1]); 
    MPI_Init (&argc, &argv); 
    MPI_Comm_rank(MPI_COMM_WORLD, &procID); 
    MPI_Comm_size(MPI_COMM_WORLD, &numProc); 

    addressMatrixMemory(size); 

    /* MASTER starts. */ 
    if(procID == MASTER){ 

     start_time = MPI_Wtime(); 
     for(i = 1; i < numProc; i++){ 
      numRows = size/(numProc - 1); 

      from = (i - 1) * numRows; 

      if(((i + 1) == numProc) && ((size % (numProc - 1))) != 0){ 
       to = size; 
      } else { 
       to = from + numRows; 
      } 
      MPI_Isend(&from, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &request); 

      MPI_Isend(&to, 1, MPI_INT, i, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &request); 

      MPI_Isend(matrix_1, (to - from) * size, MPI_INT, i, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &request); 

     } 

    } 

    MPI_Bcast(&matrix_2, size * size, MPI_INT, MASTER, MPI_COMM_WORLD); 

    /* WORKERS task */ 

    if(procID >= WORKER){ 
     int row, col; 
     int *matrix = malloc(sizeof(matrix_1[0])*size*size); 

     MPI_Recv(&from, 1, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG, MPI_COMM_WORLD, &status); 

     MPI_Recv(&to, 1, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG + 1, MPI_COMM_WORLD, &status); 

     MPI_Recv(matrix, (to - from) * size, MPI_INT, MASTER, MASTER_TO_SLAVE_TAG + 2, MPI_COMM_WORLD, &status); 
     for(row = from; row < to; row++){ 
      for(col = 0; col < size; col++){ 
       result[row * size + col] = 0; 
       for(k = 0; k < size; k++); 
        result[row * size + col] += matrix[row * size + k] * matrix_2[k * size + col]; 
      } 
     } 
     MPI_Isend(&from, 1, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &request); 
     MPI_Isend(&to, 1, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &request); 
     MPI_Isend(&result[from], (to - from) * size, MPI_INT, MASTER, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &request); 
    } 

    /* MASTER gathers WORKERS job. */ 
    if(procID == MASTER){ 
     for(i = 1; i < numProc; i++){ 
      MPI_Recv(&from, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG, MPI_COMM_WORLD, &status); 
      MPI_Recv(&to, 1, MPI_INT, i, SLAVE_TO_MASTER_TAG + 1, MPI_COMM_WORLD, &status); 
      MPI_Recv(&result[from], (to - from) * size, MPI_INT, i, SLAVE_TO_MASTER_TAG + 2, MPI_COMM_WORLD, &status); 
     } 

     end_time = MPI_Wtime(); 
     printf("\nRunning Time = %f\n\n", end_time - start_time); 
    } 

    MPI_Finalize(); 

    free(matrix_1); 
    free(matrix_2); 
    free(result); 

    return EXIT_SUCCESS; 
} 

void addressMatrixMemory(int n){ 
    matrix_1 = malloc(sizeof(matrix_1[0])*n*n); 
    matrix_2 = malloc(sizeof(matrix_2[0])*n*n); 

    result = malloc(sizeof(result[0])*n*n); 

    /* Matrix init with values between 1 y 100. */ 
    srand(time(NULL)); 
    int r = rand() % 100 + 1; 
    int i; 
    for(i = 0; i < n*n; i++){ 
     matrix_1[i] = r; 
     r = rand() % 100 + 1; 
     matrix_2[i] = r; 
     r = rand() % 100 + 1; 
    } 
} 

ERROR MESSAGE

[tuliansPC:28270] *** Process received signal *** 
[tuliansPC:28270] Signal: Segmentation fault (11) 
[tuliansPC:28270] Signal code: Address not mapped (1) 
[tuliansPC:28270] Failing at address: 0x603680 
    [tuliansPC:28270] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x10340) [0x7f0a98ce0340] 
[tuliansPC:28270] [ 1] /lib/x86_64-linux-gnu/libc.so.6(+0x97ffe) [0x7f0a9899fffe] 
[tuliansPC:28270] [ 2] /usr/lib/libmpi.so.1(opal_convertor_pack+0x129) [0x7f0a98fef779] 
[tuliansPC:28270] [ 3] /usr/lib/openmpi/lib/openmpi/mca_btl_sm.so(mca_btl_sm_prepare_src+0x1fd) [0x7f0a923c385d] 
[tuliansPC:28270] [ 4] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_rndv+0x1dc) [0x7f0a93245c9c] 
[tuliansPC:28270] [ 5] /usr/lib/openmpi/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_isend+0x8ec) [0x7f0a9323856c] 
[tuliansPC:28270] [ 6] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_generic+0x3fc) [0x7f0a914f49fc] 
[tuliansPC:28270] [ 7] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_pipeline+0xbc) [0x7f0a914f4d5c] 
[tuliansPC:28270] [ 8] /usr/lib/openmpi/lib/openmpi/mca_coll_tuned.so(ompi_coll_tuned_bcast_intra_dec_fixed+0x134) [0x7f0a914ec7a4] 
[tuliansPC:28270] [ 9] /usr/lib/openmpi/lib/openmpi/mca_coll_sync.so(mca_coll_sync_bcast+0x64) [0x7f0a917096a4] 
[tuliansPC:28270] [10] /usr/lib/libmpi.so.1(MPI_Bcast+0x13d) [0x7f0a98f5678d] 
[tuliansPC:28270] [11] ej5Exec() [0x400e8c] 
[tuliansPC:28270] [12] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf5) [0x7f0a98929ec5] 
[tuliansPC:28270] [13] ej5Exec() [0x400ac9] 
[tuliansPC:28270] *** End of error message *** 
-------------------------------------------------------------------------- 
mpirun noticed that process rank 0 with PID 28270 on node tuliansPC exited on signal 11 (Segmentation fault). 
-------------------------------------------------------------------------- 

ANSWER

Let's start with the first issue that jumps out: you are using non-blocking communication. MPI_Isend is a non-blocking send, which means that when you call MPI_Isend all you are doing is telling MPI about a message you would like to send at some point in the future. It might be sent right away, it might not. To guarantee that the data is actually sent, you need to complete the call with something like MPI_Wait. Usually, when people use non-blocking calls (MPI_Isend) they don't mix them with blocking calls (MPI_Recv); if you make all of the calls non-blocking, you can complete them all at once with a single function, MPI_Waitall.
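Here is a rough, self-contained sketch of that completion pattern (the buffer and request names are mine, not taken from your program): every non-blocking call is eventually completed, the sender with MPI_Waitall and the receivers with MPI_Wait, and each MPI_Isend gets its own request and its own buffer element so nothing is reused before the send finishes.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[]){
    int rank, nprocs, i;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    if(rank == 0){
        /* One request per outstanding non-blocking send, and a separate
           buffer element per send so nothing is reused before completion. */
        MPI_Request *reqs = malloc(sizeof(MPI_Request) * (nprocs - 1));
        int *payload = malloc(sizeof(int) * (nprocs - 1));
        for(i = 1; i < nprocs; i++){
            payload[i - 1] = 100 + i;
            MPI_Isend(&payload[i - 1], 1, MPI_INT, i, 0, MPI_COMM_WORLD, &reqs[i - 1]);
        }
        /* Complete every send before touching or freeing the buffers. */
        MPI_Waitall(nprocs - 1, reqs, MPI_STATUSES_IGNORE);
        free(payload);
        free(reqs);
    } else {
        int value;
        MPI_Request req;
        /* Non-blocking receive, completed with MPI_Wait before 'value' is read. */
        MPI_Irecv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
        printf("rank %d received %d\n", rank, value);
    }

    MPI_Finalize();
    return EXIT_SUCCESS;
}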

Try fixing those issues first and see if that resolves your problem. Just because the crash goes away when you comment out the collective does not mean the other problems aren't there. MPI programs can be hard to debug precisely because of this kind of strange behaviour.

I tried adding 'MPI_Wait()' after each 'MPI_Isend()', but I get the same error. – tulians