2012-12-27 30 views
7

這是我在這裏發佈的第一個問題,希望我沒有犯任何錯誤。問題主題:C++11 現代風格循環與舊式循環的性能比較。

我的問題涉及現代風格的 C++11 循環(基於範圍的 for、std::for_each)與舊式 C++ 循環(for (...; ...; ...))的性能比較。根據我的理解,現代 C++ 的座右銘是「富有表現力,同時對性能毫不妥協」:現代 C++ 風格能寫出安全、乾淨且快速的代碼,幾乎沒有性能損失,甚至性能可能優於舊式 C++。

現在我做了一個小測試,想評估就循環而言這種收益有多大。首先,我寫了下面三個函數:

using namespace std; 

// Old-style baseline: divides every element of `v` by 42 in place, walking
// the vector with an explicit index that is re-checked against v.size() on
// every pass.
void foo(vector<double>& v)
{
    size_t pos = 0;
    while (pos < v.size())
    {
        v[pos] = v[pos] / 42;
        ++pos;
    }
}

// Range-based variant: divides every element of `v` by 42 in place using a
// C++11 range-based for loop over mutable references.
void bar(vector<double>& v)
{
    for (double& element : v)
    {
        element = element / 42;
    }
}

// Algorithm variant: divides every element of `v` by 42 in place by handing
// a lambda to for_each over the whole vector.
void wee(vector<double>& v)
{
    const auto divide_by_42 = [] (double& element) { element /= 42; };
    for_each(v.begin(), v.end(), divide_by_42);
}

然後,我像下面這樣調用它們(通過適當地註釋/取消註釋 main() 循環中的三行來比較它們的性能):

// Builds the benchmark input: a vector holding 0.0, 1.0, ..., 29999.0.
// The capacity is reserved up front so that filling the vector does not
// trigger repeated reallocate-and-copy cycles while it grows.
vector<double> make_vector()
{
    const int element_count = 30000;

    vector<double> v;
    v.reserve(element_count);
    for (int i = 0; i < element_count; ++i)
    {
        v.push_back(i);
    }
    return v;
}

// Benchmark driver: applies one of foo/bar/wee to the same vector repeatedly
// and prints the processor time consumed by those calls, in clock ticks.
int main()
{
    // Build the input before starting the clock so that only the loops under
    // test are measured, not the construction of the vector.
    auto v = make_vector();

    // clock() returns clock_t (processor time in ticks), not time_t.
    const clock_t start = clock();

    // NOTE(review): `<=` makes this 50001 passes rather than 50000; kept
    // unchanged so timings stay comparable across the three variants.
    for (int i = 0; i <= 50000; i++)
    {
        // UNCOMMENT THE FUNCTION CALL TO BE TESTED, COMMENT THE OTHERS

        foo(v);
        // bar(v);
        // wee(v);
    }

    const clock_t finish = clock();

    // Raw tick count; divide by CLOCKS_PER_SEC to convert to seconds.
    cout << (finish - start) << endl;

    return 0;
}

對每個版本的程序(通過註釋/取消註釋 main() 循環中的行得到)各執行 10 次並取平均,以舊式循環作爲基準:基於範圍的 for 循環慢約 1.9 倍,而基於 std::for_each 和 lambda 表達式的循環慢約 2.3 倍。

我用Clang 3.2來編譯這個,我還沒有嘗試MS VC11(我正在使用WinXP)。

考慮到我原本期待得到相近的執行時間,我的問題是:

  1. 我是否做了什麼明顯錯誤的事情?
  2. 如果不是,那麼 2 倍的性能損失難道不是一個「不」接受現代風格循環的好理由嗎?

我想補充說明一點:我確實相信,以現代 C++ 風格編寫的代碼所帶來的清晰度和安全性,足以補償可能的性能損失;但我並不同意「一方面的清晰度/安全性與另一方面的性能之間完全不存在權衡」這一說法。

我錯過了什麼嗎?

+4

我剛剛運行了你的代碼(用 Clang 編譯),三個版本的運行時間大致相同。你的編譯器是否啓用了優化? – Mankarse

+0

你的編譯器的clock()的分辨率是多少?你的結果是否在誤差範圍內? – Ferruccio

+0

整數除法通常是大多數體系結構中的高延遲指令,因此您的循環開銷可能不重要 - 請嘗試使用加法。 –

回答

9

看起來只有當您不在編譯器中啓用優化時才顯示差異。

使用Clang,您可以使用-O[0-3]標誌啓用優化。

1

Mankarse是正確的 - 很可能您還沒有啓用優化。

實際上,在 Clang 中,它們的主循環生成的 ASM 代碼幾乎相同,只在循環之前/之後的代碼中有少量差異。

我已經測試了四個版本:hand_loop_index、hand_loop_iterator、range_based_for、for_each_algorithm。

hand_loop_iterator、range_based_for、for_each_algorithm —— 這三個版本整個函數體生成的 ASM 完全相同,唯一的區別是標籤的名稱。

也就是說,手寫的迭代器循環與基於範圍的 for 以及 std::for_each 生成完全相同的 ASM 代碼。

循環與索引和循環與迭代器版本之間有一些差異。

兩種情況下的主循環幾乎相同。唯一的不同是迭代器版本使用 rdx 寄存器而不是 rsi。

索引版本:

.LBB0_7:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rsi), %xmm1 
    movupd -32(%rsi), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rsi) 
    movupd %xmm2, -32(%rsi) 
    movupd -16(%rsi), %xmm1 
    movupd (%rsi), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rsi) 
    movupd %xmm2, (%rsi) 
    addq $64, %rsi 
    addq $-8, %rdi 
    jne .LBB0_7 

迭代器版本:

.LBB1_6:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rdx), %xmm1 
    movupd -32(%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rdx) 
    movupd %xmm2, -32(%rdx) 
    movupd -16(%rdx), %xmm1 
    movupd (%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rdx) 
    movupd %xmm2, (%rdx) 
    addq $64, %rdx 
    addq $-8, %rsi 
    jne .LBB1_6 

索引版本與迭代器版本在主循環之前/之後的代碼有不少差異,但對於足夠大的數組,這不應對總體耗時產生明顯影響。

LIVE DEMO on Coliru with ASM output

#include <algorithm> 
#include <iterator> 
#include <vector> 

using namespace std; 

// Hand-written index loop: divides every element of `v` by 42 in place,
// re-reading v.size() as the bound on each pass.
void hand_loop_index(vector<double> &v)
{
    for (size_t idx = 0; idx != v.size(); idx += 1)
    {
        v[idx] = v[idx] / 42;
    }
}

// Hand-written iterator loop: divides every element of `v` by 42 in place,
// advancing an iterator from the beginning until it reaches the end
// iterator captured before the loop starts.
void hand_loop_iterator(vector<double> &v)
{
    auto cursor = v.begin();
    const auto stop = v.end();
    while (cursor != stop)
    {
        *cursor = *cursor / 42;
        ++cursor;
    }
}

// Range-based for variant: divides every element of `v` by 42 in place,
// binding each element by mutable reference.
void range_based_for(vector<double> &v)
{
    for (double &element : v)
    {
        element = element / 42;
    }
}

// Standard-algorithm variant: divides every element of `v` by 42 in place
// by passing a named lambda to for_each over the full range of the vector.
void for_each_algorithm(vector<double> &v)
{
    const auto divide_by_42 = [] (double &element) { element /= 42; };
    for_each(v.begin(), v.end(), divide_by_42);
}

結果ASM:

# clang++ -std=c++1z -O3 -Wall -pedantic -pthread main.cpp -S 
    .text 
    .file "main.cpp" 
    .section .rodata.cst16,"aM",@progbits,16 
    .align 16 
.LCPI0_0: 
    .quad 4631107791820423168  # double 4.200000e+01 
    .quad 4631107791820423168  # double 4.200000e+01 
    .section .rodata.cst8,"aM",@progbits,8 
    .align 8 
.LCPI0_1: 
    .quad 4631107791820423168  # double 42 
    .text 
    .globl _Z15hand_loop_indexRSt6vectorIdSaIdEE 
    .align 16, 0x90 
    .type _Z15hand_loop_indexRSt6vectorIdSaIdEE,@function 
_Z15hand_loop_indexRSt6vectorIdSaIdEE: # @_Z15hand_loop_indexRSt6vectorIdSaIdEE 
    .cfi_startproc 
# BB#0: 
    movq (%rdi), %rax 
    movq 8(%rdi), %rcx 
    subq %rax, %rcx 
    je .LBB0_11 
# BB#1:         # %.lr.ph 
    sarq $3, %rcx 
    cmpq $1, %rcx 
    movl $1, %edx 
    cmovaq %rcx, %rdx 
    xorl %edi, %edi 
    testq %rdx, %rdx 
    je .LBB0_10 
# BB#2:         # %overflow.checked 
    xorl %edi, %edi 
    movq %rdx, %r8 
    andq $-4, %r8 
    je .LBB0_9 
# BB#3:         # %vector.body.preheader 
    cmpq $1, %rcx 
    movl $1, %edi 
    cmovaq %rcx, %rdi 
    addq $-4, %rdi 
    movq %rdi, %rsi 
    shrq $2, %rsi 
    xorl %r9d, %r9d 
    btq $2, %rdi 
    jb .LBB0_5 
# BB#4:         # %vector.body.prol 
    movupd (%rax), %xmm0 
    movupd 16(%rax), %xmm1 
    movapd .LCPI0_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01] 
    divpd %xmm2, %xmm0 
    divpd %xmm2, %xmm1 
    movupd %xmm0, (%rax) 
    movupd %xmm1, 16(%rax) 
    movl $4, %r9d 
.LBB0_5:        # %vector.body.preheader.split 
    testq %rsi, %rsi 
    je .LBB0_8 
# BB#6:         # %vector.body.preheader.split.split 
    cmpq $1, %rcx 
    movl $1, %edi 
    cmovaq %rcx, %rdi 
    andq $-4, %rdi 
    subq %r9, %rdi 
    leaq 48(%rax,%r9,8), %rsi 
    movapd .LCPI0_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01] 
    .align 16, 0x90 
.LBB0_7:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rsi), %xmm1 
    movupd -32(%rsi), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rsi) 
    movupd %xmm2, -32(%rsi) 
    movupd -16(%rsi), %xmm1 
    movupd (%rsi), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rsi) 
    movupd %xmm2, (%rsi) 
    addq $64, %rsi 
    addq $-8, %rdi 
    jne .LBB0_7 
.LBB0_8: 
    movq %r8, %rdi 
.LBB0_9:        # %middle.block 
    cmpq %rdi, %rdx 
    je .LBB0_11 
    .align 16, 0x90 
.LBB0_10:        # %scalar.ph 
             # =>This Inner Loop Header: Depth=1 
    movsd (%rax,%rdi,8), %xmm0 # xmm0 = mem[0],zero 
    divsd .LCPI0_1(%rip), %xmm0 
    movsd %xmm0, (%rax,%rdi,8) 
    incq %rdi 
    cmpq %rcx, %rdi 
    jb .LBB0_10 
.LBB0_11:        # %._crit_edge 
    retq 
.Lfunc_end0: 
    .size _Z15hand_loop_indexRSt6vectorIdSaIdEE, .Lfunc_end0-_Z15hand_loop_indexRSt6vectorIdSaIdEE 
    .cfi_endproc 

.section .rodata.cst16,"aM",@progbits,16 
    .align 16 
.LCPI1_0: 
    .quad 4631107791820423168  # double 4.200000e+01 
    .quad 4631107791820423168  # double 4.200000e+01 
    .section .rodata.cst8,"aM",@progbits,8 
    .align 8 
.LCPI1_1: 
    .quad 4631107791820423168  # double 42 
    .text 
    .globl _Z18hand_loop_iteratorRSt6vectorIdSaIdEE 
    .align 16, 0x90 
    .type _Z18hand_loop_iteratorRSt6vectorIdSaIdEE,@function 
_Z18hand_loop_iteratorRSt6vectorIdSaIdEE: # @_Z18hand_loop_iteratorRSt6vectorIdSaIdEE 
    .cfi_startproc 
# BB#0: 
    movq (%rdi), %rdx 
    movq 8(%rdi), %rax 
    cmpq %rax, %rdx 
    je .LBB1_11 
# BB#1:         # %.lr.ph.preheader 
    movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC 
    leaq -8(%rax), %rcx 
    subq %rdx, %rcx 
    shrq $3, %rcx 
    incq %rcx 
    xorl %edi, %edi 
    movq %rcx, %r9 
    andq %rsi, %r9 
    je .LBB1_8 
# BB#2:         # %vector.body.preheader 
    andq %rcx, %rsi 
    leaq -4(%rsi), %rdi 
    movq %rdi, %r11 
    shrq $2, %r11 
    xorl %r10d, %r10d 
    btq $2, %rdi 
    jb .LBB1_4 
# BB#3:         # %vector.body.prol 
    movupd (%rdx), %xmm0 
    movupd 16(%rdx), %xmm1 
    movapd .LCPI1_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01] 
    divpd %xmm2, %xmm0 
    divpd %xmm2, %xmm1 
    movupd %xmm0, (%rdx) 
    movupd %xmm1, 16(%rdx) 
    movl $4, %r10d 
.LBB1_4:        # %vector.body.preheader.split 
    leaq (%rdx,%r9,8), %r8 
    testq %r11, %r11 
    je .LBB1_7 
# BB#5:         # %vector.body.preheader.split.split 
    subq %r10, %rsi 
    leaq 48(%rdx,%r10,8), %rdx 
    movapd .LCPI1_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01] 
    .align 16, 0x90 
.LBB1_6:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rdx), %xmm1 
    movupd -32(%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rdx) 
    movupd %xmm2, -32(%rdx) 
    movupd -16(%rdx), %xmm1 
    movupd (%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rdx) 
    movupd %xmm2, (%rdx) 
    addq $64, %rdx 
    addq $-8, %rsi 
    jne .LBB1_6 
.LBB1_7: 
    movq %r8, %rdx 
    movq %r9, %rdi 
.LBB1_8:        # %middle.block 
    cmpq %rdi, %rcx 
    je .LBB1_11 
# BB#9: 
    movsd .LCPI1_1(%rip), %xmm0 # xmm0 = mem[0],zero 
    .align 16, 0x90 
.LBB1_10:        # %.lr.ph 
             # =>This Inner Loop Header: Depth=1 
    movsd (%rdx), %xmm1   # xmm1 = mem[0],zero 
    divsd %xmm0, %xmm1 
    movsd %xmm1, (%rdx) 
    addq $8, %rdx 
    cmpq %rdx, %rax 
    jne .LBB1_10 
.LBB1_11:        # %._crit_edge 
    retq 
.Lfunc_end1: 
    .size _Z18hand_loop_iteratorRSt6vectorIdSaIdEE, .Lfunc_end1-_Z18hand_loop_iteratorRSt6vectorIdSaIdEE 
    .cfi_endproc 

.section .rodata.cst16,"aM",@progbits,16 
    .align 16 
.LCPI2_0: 
    .quad 4631107791820423168  # double 4.200000e+01 
    .quad 4631107791820423168  # double 4.200000e+01 
    .section .rodata.cst8,"aM",@progbits,8 
    .align 8 
.LCPI2_1: 
    .quad 4631107791820423168  # double 42 
    .text 
    .globl _Z15range_based_forRSt6vectorIdSaIdEE 
    .align 16, 0x90 
    .type _Z15range_based_forRSt6vectorIdSaIdEE,@function 
_Z15range_based_forRSt6vectorIdSaIdEE: # @_Z15range_based_forRSt6vectorIdSaIdEE 
    .cfi_startproc 
# BB#0: 
    movq (%rdi), %rdx 
    movq 8(%rdi), %rax 
    cmpq %rax, %rdx 
    je .LBB2_11 
# BB#1:         # %.lr.ph.preheader 
    movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC 
    leaq -8(%rax), %rcx 
    subq %rdx, %rcx 
    shrq $3, %rcx 
    incq %rcx 
    xorl %edi, %edi 
    movq %rcx, %r9 
    andq %rsi, %r9 
    je .LBB2_8 
# BB#2:         # %vector.body.preheader 
    andq %rcx, %rsi 
    leaq -4(%rsi), %rdi 
    movq %rdi, %r11 
    shrq $2, %r11 
    xorl %r10d, %r10d 
    btq $2, %rdi 
    jb .LBB2_4 
# BB#3:         # %vector.body.prol 
    movupd (%rdx), %xmm0 
    movupd 16(%rdx), %xmm1 
    movapd .LCPI2_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01] 
    divpd %xmm2, %xmm0 
    divpd %xmm2, %xmm1 
    movupd %xmm0, (%rdx) 
    movupd %xmm1, 16(%rdx) 
    movl $4, %r10d 
.LBB2_4:        # %vector.body.preheader.split 
    leaq (%rdx,%r9,8), %r8 
    testq %r11, %r11 
    je .LBB2_7 
# BB#5:         # %vector.body.preheader.split.split 
    subq %r10, %rsi 
    leaq 48(%rdx,%r10,8), %rdx 
    movapd .LCPI2_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01] 
    .align 16, 0x90 
.LBB2_6:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rdx), %xmm1 
    movupd -32(%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rdx) 
    movupd %xmm2, -32(%rdx) 
    movupd -16(%rdx), %xmm1 
    movupd (%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rdx) 
    movupd %xmm2, (%rdx) 
    addq $64, %rdx 
    addq $-8, %rsi 
    jne .LBB2_6 
.LBB2_7: 
    movq %r8, %rdx 
    movq %r9, %rdi 
.LBB2_8:        # %middle.block 
    cmpq %rdi, %rcx 
    je .LBB2_11 
# BB#9: 
    movsd .LCPI2_1(%rip), %xmm0 # xmm0 = mem[0],zero 
    .align 16, 0x90 
.LBB2_10:        # %.lr.ph 
             # =>This Inner Loop Header: Depth=1 
    movsd (%rdx), %xmm1   # xmm1 = mem[0],zero 
    divsd %xmm0, %xmm1 
    movsd %xmm1, (%rdx) 
    addq $8, %rdx 
    cmpq %rdx, %rax 
    jne .LBB2_10 
.LBB2_11:        # %._crit_edge 
    retq 
.Lfunc_end2: 
    .size _Z15range_based_forRSt6vectorIdSaIdEE, .Lfunc_end2-_Z15range_based_forRSt6vectorIdSaIdEE 
    .cfi_endproc 

.section .rodata.cst16,"aM",@progbits,16 
    .align 16 
.LCPI3_0: 
    .quad 4631107791820423168  # double 4.200000e+01 
    .quad 4631107791820423168  # double 4.200000e+01 
    .section .rodata.cst8,"aM",@progbits,8 
    .align 8 
.LCPI3_1: 
    .quad 4631107791820423168  # double 42 
    .text 
    .globl _Z18for_each_algorithmRSt6vectorIdSaIdEE 
    .align 16, 0x90 
    .type _Z18for_each_algorithmRSt6vectorIdSaIdEE,@function 
_Z18for_each_algorithmRSt6vectorIdSaIdEE: # @_Z18for_each_algorithmRSt6vectorIdSaIdEE 
    .cfi_startproc 
# BB#0: 
    movq (%rdi), %rdx 
    movq 8(%rdi), %rax 
    cmpq %rax, %rdx 
    je .LBB3_11 
# BB#1:         # %.lr.ph.i.preheader 
    movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC 
    leaq -8(%rax), %rcx 
    subq %rdx, %rcx 
    shrq $3, %rcx 
    incq %rcx 
    xorl %edi, %edi 
    movq %rcx, %r9 
    andq %rsi, %r9 
    je .LBB3_8 
# BB#2:         # %vector.body.preheader 
    andq %rcx, %rsi 
    leaq -4(%rsi), %rdi 
    movq %rdi, %r11 
    shrq $2, %r11 
    xorl %r10d, %r10d 
    btq $2, %rdi 
    jb .LBB3_4 
# BB#3:         # %vector.body.prol 
    movupd (%rdx), %xmm0 
    movupd 16(%rdx), %xmm1 
    movapd .LCPI3_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01] 
    divpd %xmm2, %xmm0 
    divpd %xmm2, %xmm1 
    movupd %xmm0, (%rdx) 
    movupd %xmm1, 16(%rdx) 
    movl $4, %r10d 
.LBB3_4:        # %vector.body.preheader.split 
    leaq (%rdx,%r9,8), %r8 
    testq %r11, %r11 
    je .LBB3_7 
# BB#5:         # %vector.body.preheader.split.split 
    subq %r10, %rsi 
    leaq 48(%rdx,%r10,8), %rdx 
    movapd .LCPI3_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01] 
    .align 16, 0x90 
.LBB3_6:        # %vector.body 
             # =>This Inner Loop Header: Depth=1 
    movupd -48(%rdx), %xmm1 
    movupd -32(%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -48(%rdx) 
    movupd %xmm2, -32(%rdx) 
    movupd -16(%rdx), %xmm1 
    movupd (%rdx), %xmm2 
    divpd %xmm0, %xmm1 
    divpd %xmm0, %xmm2 
    movupd %xmm1, -16(%rdx) 
    movupd %xmm2, (%rdx) 
    addq $64, %rdx 
    addq $-8, %rsi 
    jne .LBB3_6 
.LBB3_7: 
    movq %r8, %rdx 
    movq %r9, %rdi 
.LBB3_8:        # %middle.block 
    cmpq %rdi, %rcx 
    je .LBB3_11 
# BB#9: 
    movsd .LCPI3_1(%rip), %xmm0 # xmm0 = mem[0],zero 
    .align 16, 0x90 
.LBB3_10:        # %.lr.ph.i 
             # =>This Inner Loop Header: Depth=1 
    movsd (%rdx), %xmm1   # xmm1 = mem[0],zero 
    divsd %xmm0, %xmm1 
    movsd %xmm1, (%rdx) 
    addq $8, %rdx 
    cmpq %rdx, %rax 
    jne .LBB3_10 
.LBB3_11:        # %_ZSt8for_eachIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEZ18for_each_algorithmR5_E3$_0ET0_T_SA_S9_.exit 
    retq 
.Lfunc_end3: 
    .size _Z18for_each_algorithmRSt6vectorIdSaIdEE, .Lfunc_end3-_Z18for_each_algorithmRSt6vectorIdSaIdEE 
    .cfi_endproc 

    .ident "clang version 3.7.0 (tags/RELEASE_370/final 246979)" 
    .section ".note.GNU-stack","",@progbits