2015-08-17 59 views
2

我正在嘗試將CUSP整合到現有的Fortran代碼中。現在,我只是試圖基本設置推力/ CUSP,以便從Fortran陣列中提取數據,並使用它們構建CUSP矩陣(現在爲coo格式)。 我已經能夠得到像C例程的包裝編譯成庫和Fortran代碼感謝與掛靠此主題:unresolved-references-using-ifort-with-nvcc-and-cusp如何從傳遞的陣列正確構造CUSP coo矩陣

而且我可以確認的Fortran正確的數組指針得益於幫助餵養前一個線程:Generating CUSP coo_matrix from passed FORTRAN arrays

不幸的是,我仍然無法獲得CUSP使用這些來生成和打印矩陣。 代碼和輸出如下所示:

輸出

$ ./fort_cusp_test 
testing 1 2 3 
n: 3, nnz: 9 
    i, row_i, col_j,  val_v 
    0,  1,  1, 1.0000e+00 
    1,  1,  2, 2.0000e+00 
    2,  1,  3, 3.0000e+00 
    3,  2,  1, 4.0000e+00 
    4,  2,  2, 5.0000e+00 
    5,  2,  3, 6.0000e+00 
    6,  3,  1, 7.0000e+00 
    7,  3,  2, 8.0000e+00 
    8,  3,  3, 9.0000e+00 
initialized row_i into thrust 
initialized col_j into thrust 
initialized val_v into thrust 
defined CUSP integer array view for row_i and col_j 
defined CUSP float array view for val_v 
loaded row_i into a CUSP integer array view 
loaded col_j into a CUSP integer array view 
loaded val_v into a CUSP float array view 
defined CUSP coo_matrix view 
Built matrix A from CUSP device views 
sparse matrix <3, 3> with 9 entries 
libc++abi.dylib: terminating with uncaught exception of type thrust::system::system_error: invalid argument 

Program received signal SIGABRT: Process abort signal. 

Backtrace for this error: 
#0 0x10d0fdff6 
#1 0x10d0fd593 
#2 0x7fff8593ff19 
Abort trap: 6 

fort_cusp_test.f90

program fort_cuda_test 

    implicit none 

! interface 
! subroutine test_coo_mat_print_(row_i,col_j,val_v,n,nnz) bind(C) 
!  use, intrinsic :: ISO_C_BINDING, ONLY: C_INT,C_FLOAT 
!  implicit none 
!  integer(C_INT),value :: n, nnz 
!  integer(C_INT) :: row_i(:), col_j(:) 
!  real(C_FLOAT) :: val_v(:) 
! end subroutine test_coo_mat_print_ 
! end interface 

    integer*4 n 
    integer*4 nnz 

    integer*4, target :: rowI(9),colJ(9) 
    real*4, target :: valV(9) 

    integer*4, pointer :: row_i(:) 
    integer*4, pointer :: col_j(:) 
    real*4, pointer :: val_v(:) 

    n  = 3 
    nnz = 9 
    rowI = (/ 1, 1, 1, 2, 2, 2, 3, 3, 3/) 
    colJ = (/ 1, 2, 3, 1, 2, 3, 1, 2, 3/) 
    valV = (/ 1, 2, 3, 4, 5, 6, 7, 8, 9/) 

    row_i => rowI 
    col_j => colJ 
    val_v => valV 

    write(*,*) "testing 1 2 3" 

    call test_coo_mat_print (rowI,colJ,valV,n,nnz) 

end program fort_cuda_test 

cusp_runner.cu

#include <stdio.h> 
#include <cusp/coo_matrix.h> 
#include <iostream> 
// #include <cusp/krylov/cg.h> 
#include <cusp/print.h> 

#if defined(__cplusplus) 
extern "C" { 
#endif 

void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ) { 

    int n, nnz; 

    n = *N; 
    nnz = *NNZ; 

    printf("n: %d, nnz: %d\n",n,nnz); 

    printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v"); 
    for(int i=0;i<n;i++) { 
     printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]); 
    } 
    //if (false) { 
    //wrap raw input pointers with thrust::device_ptr 
    thrust::device_ptr<int> wrapped_device_I(row_i); 
    printf("initialized row_i into thrust\n"); 
    thrust::device_ptr<int> wrapped_device_J(col_j); 
    printf("initialized col_j into thrust\n"); 
    thrust::device_ptr<float> wrapped_device_V(val_v); 
    printf("initialized val_v into thrust\n"); 

    //use array1d_view to wrap individual arrays 
    typedef typename cusp::array1d_view< thrust::device_ptr<int> > DeviceIndexArrayView; 
    printf("defined CUSP integer array view for row_i and col_j\n"); 
    typedef typename cusp::array1d_view< thrust::device_ptr<float> > DeviceValueArrayView; 
    printf("defined CUSP float array view for val_v\n"); 

    DeviceIndexArrayView row_indices(wrapped_device_I, wrapped_device_I + nnz); 
    printf("loaded row_i into a CUSP integer array view\n"); 
    DeviceIndexArrayView column_indices(wrapped_device_J, wrapped_device_J + nnz); 
    printf("loaded col_j into a CUSP integer array view\n"); 
    DeviceValueArrayView values(wrapped_device_V, wrapped_device_V + nnz); 
    printf("loaded val_v into a CUSP float array view\n"); 

    //combine array1d_views into coo_matrix_view 
    typedef cusp::coo_matrix_view<DeviceIndexArrayView,DeviceIndexArrayView,DeviceValueArrayView> DeviceView; 
    printf("defined CUSP coo_matrix view\n"); 

    //construct coo_matrix_view from array1d_views 
    DeviceView A(n,n,nnz,row_indices,column_indices,values); 
    printf("Built matrix A from CUSP device views\n"); 

    cusp::print(A); 
    printf("Printed matrix A\n"); 
//} 
} 
#if defined(__cplusplus) 
} 
#endif 

的Makefile

Test: 
    nvcc -Xcompiler="-fPIC" -shared cusp_runner.cu -o cusp_runner.so -I/Developer/NVIDIA/CUDA-6.5/include/cusp 
    gfortran -c fort_cusp_test.f90 
    gfortran fort_cusp_test.o cusp_runner.so -L/Developer/NVIDIA/CUDA-6.5/lib -lcudart -o fort_cusp_test 

clean: 
    rm *.o *.so fort_cusp_test 

cusp_runner.cu的功能版本:

#include <stdio.h> 
#include <cusp/coo_matrix.h> 
#include <iostream> 
// #include <cusp/krylov/cg.h> 
#include <cusp/print.h> 

#if defined(__cplusplus) 
extern "C" { 
#endif 

void test_coo_mat_print_(int * row_i, int * col_j, float * val_v, int * N, int * NNZ) { 

    int n, nnz; 

    n = *N; 
    nnz = *NNZ; 

    printf("n: %d, nnz: %d\n",n,nnz); 

    printf("printing input (row_i, col_j, val_v)\n"); 
    printf("%6s, %6s, %6s, %12s \n","i","row_i","col_j","val_v"); 
    for(int i=0;i<nnz;i++) { 
     printf("%6d, %6d, %6d, %12.4e\n",i,row_i[i],col_j[i],val_v[i]); 
    } 

    printf("initializing thrust device vectors\n"); 
    thrust::device_vector<int> device_I(row_i,row_i+nnz); 
    printf("device_I initialized\n"); 
    thrust::device_vector<int> device_J(col_j,col_j+nnz); 
    printf("device_J initialized\n"); 
    thrust::device_vector<float> device_V(val_v,val_v+nnz); 
    printf("device_V initialized\n"); 

    cusp::coo_matrix<int, float, cusp::device_memory> A(n,n,nnz); 
    printf("initialized empty CUSP coo_matrix on device\n"); 

    A.row_indices = device_I; 
    printf("loaded device_I into A.row_indices\n"); 
    A.column_indices = device_J; 
    printf("loaded device_J into A.column_indices\n"); 
    A.values = device_V; 
    printf("loaded device_V into A.values\n"); 

    cusp::print(A); 
    printf("Printed matrix A\n"); 
//} 
} 
#if defined(__cplusplus) 
} 
#endif 
+0

我對Thrust一無所知,但對於我來說,定義DeviceIndexArrayView的行有n,nnz,nnz,而定義DeviceView的行有n,n,nnz,這看起來很奇怪。這些是正確的嗎?另外,我想附上「cuda」標籤會很有用。 – roygvib

+0

沒有多少人訂閱[tag:fortran90],如果你需要注意,可以使用[tag:fortran]。此外,您註釋的界面是Fortran 2003,絕對不是90. –

+0

用於定義DeviceView的行絕對正確。我假設一個有n行,n列和nnz非零條目的矩形矩陣。我在row_indices的定義上有一個錯誤,它應該是nnz而不是n,但修復不能解決問題。 Thx爲弗拉基米爾頭。我確實會使用fortran90,但我認爲這個問題應該與fortran總體相關。 – wmsmith

回答

3

你用來處理指針推力/ CUSP端代碼是完全不正確的。這:

thrust::device_ptr<int> wrapped_device_I(row_i); 

不會做你認爲它的確如此。你已經有效地完成了將主機地址轉換爲設備地址。除非您使用CUDA託管內存,否則這是非法的,並且在此代碼中看不到任何證據。你想要做的是分配內存並在開始之前將Fortran陣列複製到GPU。這樣做:

thrust::device_ptr<int> wrapped_device_I = thrust::device_malloc<int>(nnz); 
thrust::copy(row_i, row_i + nnz, wrapped_device_I); 

[免責聲明:沒有經過測試的,使用風險自擔]

對於每個COO載體。不過,我會建議將test_coo_mat_print_的GPU設置部分中的大部分代碼替換爲thrust::vector實例。除了易於使用之外,當它們超出範圍時,您可以釋放內存釋放,從而減少工程內存泄漏的可能性。所以像這樣:

thrust::device_vector<int> device_I(row_i, row_i + nnz); 

照顧一切在一個電話。

作爲最後一個提示,如果您正在開發多語言的代碼庫,設計它們使得每種語言的代碼是完全獨立的,有自己的本地測試代碼。如果在這種情況下你已經這樣做了,那麼你會發現C++部分不能獨立於任何你遇到的Fortran問題而工作。它會使調試更簡單。

+0

謝謝!這工作完美。在建立推力device_vector結構後,我能夠完美地構建A.這是我第一次嘗試從設備向量中構建CUSP矩陣,所以我很迷茫。我已經看到的大多數例子都是從矩陣市場文件中加載它們,或者直接初始化這些值。這更清潔。 – wmsmith