我建議你不要重新發明輪子,而是使用爲此提供的cublas fortran bindings。
「thunking」包裝器不是你想要的:每當你在Fortran中進行cublas調用時,它都會根據需要隱式地執行複製操作。
你需要的是「非thunking」包裝器,這樣你就能明確地控制複製的進行。你可以使用Fortran中與Get/SetMatrix和Get/SetVector等價的函數來回複製數據。
cublas文檔中包含一個示例代碼(示例B.2),展示了如何使用非thunking包裝器。
即使你確實想重新發明輪子,這些包裝器也能向你展示在C和Fortran之間傳遞數據所需的語法。
在標準的Linux CUDA安裝中,這些包裝器位於/usr/local/cuda/src
非thunking包裝器是/usr/local/cuda/src/fortran.c
下面是一個完整的示例:
cublasf.f:
c     Example driver for the non-thunking CUBLAS Fortran wrappers
c     (fortran.c).  Host vectors are copied to the device explicitly,
c     ddot/dnrm2/daxpy/dscal/dcopy are run on the device pointers, and
c     the results are copied back to the host and printed.
      program cublas_fortran_example
      implicit none
c     The wrappers in fortran.c take n, elemSize and incx/incy as
c     C "int", so these must be default INTEGER (not integer*8);
c     only the device pointers are 64-bit quantities.
c       n         - number of elements in each vector
c       elem_size - size in bytes of one double precision element
c       inc       - stride between consecutive vector elements
      integer n, elem_size, inc
      parameter (n = 30, elem_size = 8, inc = 1)
      integer j
c     helper functions (all return a status, 0 = success)
      integer cublas_init
      integer cublas_shutdown
      integer cublas_alloc
      integer cublas_free
      integer cublas_set_vector
      integer cublas_get_vector
      external check_status
c     selected blas functions
      double precision cublas_ddot
      double precision cublas_dnrm2
      external cublas_daxpy
      external cublas_dscal
      external cublas_dcopy
c     host data and scalars
      integer cublas_status
      double precision x(n), y(n)
      double precision alpha, beta
      double precision dresult
c     device pointers filled in by cublas_alloc
      integer*8 d_x, d_y

      write(*,*) "testing cublas fortran example"

c     initialize cublas library (CUBLAS_STATUS_SUCCESS = 0)
      cublas_status = cublas_init()
      call check_status(cublas_status,
     >     "CUBLAS Library initialization failed")

c     initialize host data
      do j = 1, n
         x(j) = 1.0d0
         y(j) = 2.0d0
      enddo
      alpha = 2.0d0
      beta  = 3.0d0

c     allocate device storage for both vectors
      cublas_status = cublas_alloc(n, elem_size, d_x)
      call check_status(cublas_status,
     >     "CUBLAS device malloc failed")
      cublas_status = cublas_alloc(n, elem_size, d_y)
      call check_status(cublas_status,
     >     "CUBLAS device malloc failed")

c     copy data from host to device
      cublas_status = cublas_set_vector(n, elem_size, x, inc,
     >                                  d_x, inc)
      call check_status(cublas_status,
     >     "CUBLAS copy to device failed")
      cublas_status = cublas_set_vector(n, elem_size, y, inc,
     >                                  d_y, inc)
      call check_status(cublas_status,
     >     "CUBLAS copy to device failed")

c     reductions return their scalar result directly to the host
      dresult = cublas_ddot(n, d_x, inc, d_y, inc)
      write(*,*) "dot product result=",dresult
      dresult = cublas_dnrm2(n, d_x, inc)
      write(*,*) "nrm2 of x result=",dresult
      dresult = cublas_dnrm2(n, d_y, inc)
      write(*,*) "nrm2 of y result=",dresult

c     y := alpha*x + y on the device, then fetch y
c     (the printed labels below assume n = 30)
      call cublas_daxpy(n, alpha, d_x, inc, d_y, inc)
      cublas_status = cublas_get_vector(n, elem_size, d_y, inc,
     >                                  y, inc)
      call check_status(cublas_status,
     >     "CUBLAS copy to host failed")
      write(*,*) "daxpy y(1) =", y(1)
      write(*,*) "daxpy y(30) =", y(n)

c     x := beta*x on the device, then fetch x
      call cublas_dscal(n, beta, d_x, inc)
      cublas_status = cublas_get_vector(n, elem_size, d_x, inc,
     >                                  x, inc)
      call check_status(cublas_status,
     >     "CUBLAS copy to host failed")
      write(*,*) "dscal x(1) =", x(1)
      write(*,*) "dscal x(30) =", x(n)

c     y := x on the device, then fetch y
      call cublas_dcopy(n, d_x, inc, d_y, inc)
      cublas_status = cublas_get_vector(n, elem_size, d_y, inc,
     >                                  y, inc)
      call check_status(cublas_status,
     >     "CUBLAS copy to host failed")
      write(*,*) "dcopy y(1) =", y(1)
      write(*,*) "dcopy y(30) =", y(n)

c     deallocate GPU memory and exit; cleanup is best-effort, so the
c     returned statuses are deliberately not checked
      cublas_status = cublas_free(d_x)
      cublas_status = cublas_free(d_y)
      cublas_status = cublas_shutdown()
      stop
      end

c     Abort the program with a non-zero exit code when a wrapper call
c     reports failure; do nothing on success.
c       status - value returned by the wrapper (0 means success)
c       msg    - text printed ahead of the status code on failure
      subroutine check_status(status, msg)
      implicit none
      integer status
      character*(*) msg
      if (status /= 0) then
         write(*,*) msg
         write(*,*) "cublas_status=",status
         stop 1
      endif
      return
      end
編譯/運行:
$ gfortran -c -o cublasf.o cublasf.f
$ gcc -c -DCUBLAS_GFORTRAN -I/usr/local/cuda/include -I/usr/local/cuda/src -o fortran.o /usr/local/cuda/src/fortran.c
$ gfortran -L/usr/local/cuda/lib64 -lcublas -o cublasf cublasf.o fortran.o
$ ./cublasf
testing cublas fortran example
dot product result= 60.0000000000000
nrm2 of x result= 5.47722557505166
nrm2 of y result= 10.9544511501033
daxpy y(1) = 4.00000000000000
daxpy y(30) = 4.00000000000000
dscal x(1) = 3.00000000000000
dscal x(30) = 3.00000000000000
dcopy y(1) = 3.00000000000000
dcopy y(30) = 3.00000000000000
$
CUDA 5.0,RHEL 5.5
謝謝你非常迅速且非常有幫助的回應。我還有幾個後續問題。 1)我正在運行Ubuntu 13.10,並且需要使用英特爾Fortran(ifort)編譯器。當我按上面的方式使用gcc和gfortran運行代碼時,我得到各種錯誤,例如: fortran.o:在函數'cublas_init_'中: fortran.c:(.text+0x5):未定義的引用'cublasInit' 這些錯誤是從哪裏來的(我是從上面複製並粘貼你的例子的)?另外,有沒有辦法改用ifort?我一直從fortran_common.h得到「未定義編譯器」的錯誤。 – Dylan
您正在使用哪個版本的CUDA?你的cuda庫安裝在哪裏?這聽起來像是你的鏈接操作沒有找到cublas庫。對於'ifort',請將'-DCUBLAS_GFORTRAN'改爲'-DCUBLAS_INTEL_FORTRAN',此外也可能需要其他更改。它應該可以與'ifort'一起使用,但是可能會需要一些語法更改。 –
我使用的是Cuda 5.5,它們安裝在/usr/local/cuda-5.5/中,我在上面的示例中進行了相應的更改。我會嘗試-DCUBLAS_INTEL_FORTRAN。再次感謝! – Dylan