2014-06-20 37 views
2

我希望像這樣的答案不用擔心,編譯器會照顧那個但我無法確定。Fortran的提領類屬性的速度

當我做一些方法在一些自定義的類型/類FORTRAN,有因引用/取消引用像this%a(i) = this%b(i) + this%c(i)對象的字段比較擊中任何性能只使用數組一樣a(i) = b(i) + c(i)

更復雜例如:

例如我有這個函數應該在3D網格上內插一個真正性能至關重要的值(它會在另一個3D數組的三重循環內被調用)。所以我在考慮如果使用類的方法更好(性能),或者更確切地說,創建一個將該數組作爲參數的普通子例程。

type grid3D           ! 3D grid maps of observables 
    real, dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z) 
    integer, dimension (3) :: N      ! dimension in x,y,z 
    real, dimension (3,:, :, :), allocatable :: f  ! array storing values of othe observable 
    contains 
    procedure :: interpolate => grid3D_interpolate 
end type grid3D 

function grid3D_interpolate(this, R) result(ff) 
implicit none 
    ! variables 
    class (grid3D) :: this 
    real, dimension (3), intent (in) :: R 
    real :: ff 
    integer ix0,iy0,iz0 
    integer ix1,iy1,iz1 
    real dx,dy,dz 
    real mx,my,mz 
    ! function body 
    ix0 = int((R(1)/this%step(1)) + fastFloorOffset) - fastFloorOffset 
    iy0 = int((R(2)/this%step(2)) + fastFloorOffset) - fastFloorOffset 
    iz0 = int((R(3)/this%step(3)) + fastFloorOffset) - fastFloorOffset 
    dx = R(1) - x0*this%step(1) 
    dy = R(2) - y0*this%step(2) 
    dz = R(3) - z0*this%step(3) 
    ix0 = modulo(x0 , this%N(1))+1 
    iy0 = modulo(y0 , this%N(2))+1 
    iz0 = modulo(z0 , this%N(3))+1 
    ix1 = modulo(x0+1 , this%N(1))+1 
    iy1 = modulo(y0+1 , this%N(2))+1 
    iz1 = modulo(z0+1 , this%N(3))+1 
    mx=1.0-dx 
    my=1.0-dy 
    mz=1.0-dz 
    ff = mz*(my*(mx*this%f(ix0,iy0,iz0)  & 
       +dx*this%f(ix1,iy0,iz0)) & 
      +dy*(mx*this%f(ix0,iy1,iz0)  & 
       +dx*this%f(ix1,iy1,iz0))) & 
     +dz*(my*(mx*this%f(ix0,iy0,iz1)  & 
       +dx*this%f(ix1,iy0,iz1)) & 
      +dy*(mx*this%f(ix0,iy1,iz1)  & 
       +dx*this%f(ix1,iy1,iz1))) 
    end if 
end function grid3D_interpolate 

end module T_grid3Dvec 
+4

如果你關心你爲什麼不做出一些實驗,收集一些數據,這些問題?如果你想確定得到一些證據,不要依賴陌生人的斷言。 –

+0

部分你是對的,但通過問我可以得到一些更廣泛的理解或建議,而不僅僅是觀察。 –

+0

我懷疑可能有區別,你只是取消引用一個指針。 –

回答

2

不是。

  • 只要你的代碼結構非常清楚(對編譯器),它可以很容易地優化它。
  • 一旦您的OOP結構變得太複雜,或者解引用的級別變得太大,您可能會從手動解引用方案中獲得一些改進。 (我使用了很多,儘管通常我的代碼是可讀的,但我在這裏稍微改進了一次,但是代碼使用了> 5級的解引用。)

這是一些示例:

module vec_mod 
    implicit none 

    type t_vector 
    real :: x = 0. 
    real :: y = 0. 
    real :: z = 0. 
    end type 

    type t_group 
    type(t_vector),allocatable :: vecs(:) 
    end type 

contains 

    subroutine sum_vec(vecs, res) 
    implicit none 
    type(t_vector),intent(in) :: vecs(:) 
    type(t_vector),intent(out) :: res 
    integer      :: i 

    res%x = 0. ; res%y = 0. ; res%z = 0. 

    do i=1,size(vecs) 
     res%x = res%x + vecs(i)%x 
     res%y = res%y + vecs(i)%y 
     res%z = res%z + vecs(i)%z 
    enddo 
    end subroutine 

    subroutine sum_vec_ptr(vecs, res) 
    implicit none 
    type(t_vector),intent(in),target :: vecs(:) 
    type(t_vector),intent(out)   :: res 
    integer       :: i 
    type(t_vector),pointer    :: curVec 

    res%x = 0. ; res%y = 0. ; res%z = 0. 

    do i=1,size(vecs) 
     curVec => vecs(i) 
     res%x = res%x + curVec%x 
     res%y = res%y + curVec%y 
     res%z = res%z + curVec%z 
    enddo 
    end subroutine 

    subroutine sum_vecGrp(vecGrp, res) 
    implicit none 
    type(t_group),intent(in) :: vecGrp 
    type(t_vector),intent(out) :: res 
    integer      :: i 

    res%x = 0. ; res%y = 0. ; res%z = 0. 

    do i=1,size(vecGrp%vecs) 
     res%x = res%x + vecGrp%vecs(i)%x 
     res%y = res%y + vecGrp%vecs(i)%y 
     res%z = res%z + vecGrp%vecs(i)%z 
    enddo 
    end subroutine 

    subroutine sum_vecGrp_ptr(vecGrp, res) 
    implicit none 
    type(t_group),intent(in),target :: vecGrp 
    type(t_vector),intent(out)   :: res 
    integer       :: i 
    type(t_vector),pointer    :: curVec, vecs(:) 

    res%x = 0. ; res%y = 0. ; res%z = 0. 

    vecs => vecGrp%vecs 
    do i=1,size(vecs) 
     curVec => vecs(i) 
     res%x = res%x + curVec%x 
     res%y = res%y + curVec%y 
     res%z = res%z + curVec%z 
    enddo 
    end subroutine 
end module 

program test 
    use omp_lib 
    use vec_mod 
    use,intrinsic :: ISO_Fortran_env 
    implicit none 
    type(t_vector),allocatable :: vecs(:) 
    type(t_vector)    :: res 
    type(t_group)    :: vecGrp 
    integer,parameter   :: N=100000000 
    integer     :: i, stat 
    real(REAL64)    :: t1, t2 

    allocate(vecs(N), vecGrp%vecs(N), stat=stat) 
    if (stat /= 0) stop 'Cannot allocate memory' 

    do i=1,N 
    call random_number(vecs(i)%x) 
    call random_number(vecs(i)%y) 
    call random_number(vecs(i)%z) 
    enddo 

    print *,'' 
    print *,'1 Level' 
    t1 = omp_get_wtime() 
    call sum_vec(vecs, res) 
    print *,res 
    t2 = omp_get_wtime() 
    print *,'Normal [s]:', t2-t1 

    t1 = omp_get_wtime() 
    call sum_vec_ptr(vecs, res) 
    print *,res 
    t2 = omp_get_wtime() 
    print *,'Pointer [s]:', t2-t1 

    print *,'' 
    print *,'2 Levels' 
    vecGrp%vecs = vecs 

    t1 = omp_get_wtime() 
    call sum_vecGrp(vecGrp, res) 
    print *,res 
    t2 = omp_get_wtime() 
    print *,'Normal [s]:', t2-t1 

    t1 = omp_get_wtime() 
    call sum_vecGrp_ptr(vecGrp, res) 
    print *,res 
    t2 = omp_get_wtime() 
    print *,'Pointer [s]:', t2-t1 

end program 

使用默認選項(gfortran test.F90 -fopenmp)編譯,三是從手動解引用輕微的好處,特別是對兩級提領的:

OMP_NUM_THREADS=1 ./a.out 

1 Level 
    16777216.0  16777216.0  16777216.0  
Normal [s]: 0.69216769299237058  
    16777216.0  16777216.0  16777216.0  
Pointer [s]: 0.67321390099823475  

2 Levels 
    16777216.0  16777216.0  16777216.0  
Normal [s]: 0.84902219301147852  
    16777216.0  16777216.0  16777216.0  
Pointer [s]: 0.71247501399193425 

一旦你打開優化(gfortran test.F90 -fopenmp -O3),你可以看到,編譯器實際上自動執行一個更好的工作:

OMP_NUM_THREADS=1 ./a.out 

1 Level 
    16777216.0  16777216.0  16777216.0  
Normal [s]: 0.13888958499592263  
    16777216.0  16777216.0  16777216.0  
Pointer [s]: 0.19099253200693056  

2 Levels 
    16777216.0  16777216.0  16777216.0  
Normal [s]: 0.13436777899914887  
    16777216.0  16777216.0  16777216.0  
Pointer [s]: 0.21104205500159878 
+0

哦,interestion。你有沒有想過爲什麼在使用-O3優化標誌時,「指針」版本比「正常」要慢? –

+0

只是猜測:(a)循環包含很少的操作,所以關聯可能會有更強的影響(b)也許編譯器不能使用指針來執行所有的優化,它可以處理常規變量(c)也許'target'指定符抑制一些優化。很難說沒有考慮優化實際上如何執行...... –