流式存儲 intrinsic（_mm_stream_ps）導致性能下降

我正在試用 _mm_stream_ps 這個 intrinsic，但在理解它的性能表現時遇到了一些困難。

這裏是我工作的代碼片段... 流版本：

#include <stdio.h> 
#include <stdint.h> 
#include <stdlib.h> 
#include <omp.h> 

#include <immintrin.h> 

#define NUM_ELEMENTS 10000000L 

/*
 * Copy from x to y with regular (temporal) SSE stores.
 *
 * The loop runs NUM_ELEMENTS/2 times and moves two 4-float vectors
 * (8 floats) per iteration. Both buffers are accessed with
 * _mm_load_ps/_mm_store_ps, which require 16-byte aligned addresses,
 * and the restrict qualifiers promise that the buffers do not overlap.
 */
static void copy_temporal(float* restrict x, float* restrict y)
{
    const uint64_t pair_count = NUM_ELEMENTS/2;

    for (uint64_t pair = 0; pair < pair_count; ++pair) {
        float *src = x + 8 * pair;
        float *dst = y + 8 * pair;

        /* Same load/store order as a straight two-vector copy. */
        __m128 low = _mm_load_ps(src);
        _mm_store_ps(dst, low);

        __m128 high = _mm_load_ps(src + 4);
        _mm_store_ps(dst + 4, high);
    }
}
/*
 * Copy from x to y with non-temporal (streaming) SSE stores.
 *
 * The loop runs NUM_ELEMENTS/2 times and moves two 4-float vectors
 * (8 floats) per iteration. x is read with _mm_load_ps and y is written
 * with _mm_stream_ps; both require 16-byte aligned addresses. The restrict
 * qualifiers promise that the buffers do not overlap.
 *
 * Streaming stores are weakly ordered, so the function ends with a store
 * fence: it orders them before any later store and keeps stores that are
 * still in flight from being left out of the caller's timing.
 */
static void copy_nontemporal(float* restrict x, float* restrict y)
{
    for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        x+=8;
        y+=8;
    }

    /* Make the streamed data globally visible before returning. */
    _mm_sfence();
}

/*
 * Benchmark driver.
 *
 * Allocates two 32-byte aligned buffers of 4 * NUM_ELEMENTS floats each,
 * fills the source with random values and the destination with zeros
 * (which also touches every page before timing starts), then times one
 * non-temporal copy followed by one temporal copy and prints a bandwidth
 * figure for each.
 *
 * Both bandwidth figures use a factor of 3 * sizeX (read x, read y,
 * write y) so that the two numbers are directly comparable.
 *
 * Returns 0 on success, EXIT_FAILURE if either buffer cannot be allocated.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    uint64_t sizeX = sizeof(float) * 4 * NUM_ELEMENTS;
    float *x = (float*) _mm_malloc(sizeX,32);
    float *y = (float*) _mm_malloc(sizeX,32);

    /* The buffers are large; bail out cleanly instead of writing through NULL. */
    if (x == NULL || y == NULL) {
        fprintf(stderr, "failed to allocate 2 x %g MB\n", sizeX/1024.0/1024.0);
        if (x != NULL) {
            _mm_free(x);
        }
        if (y != NULL) {
            _mm_free(y);
        }
        return EXIT_FAILURE;
    }

    //initialization
    for(uint64_t i = 0 ; i < 4 * NUM_ELEMENTS; ++i){
        x[i] = (float)rand()/RAND_MAX;
        y[i] = 0;
    }

    printf("%g MB allocated\n",(2 * sizeX)/1024.0/1024.0);

    double start = omp_get_wtime();
    copy_nontemporal(x, y);
    double elapsed = omp_get_wtime() - start;
    printf("Bandwidth (non-temporal): %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/elapsed);

    start = omp_get_wtime();
    copy_temporal(x, y);
    elapsed = omp_get_wtime() - start;
    printf("Bandwidth: %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/elapsed);

    _mm_free(x);
    _mm_free(y);

    return 0;
}

性能測試結果：

2.3 GHz Core i7 (I7-3615QM) (Laptop): 
    305.176 MB allocated 
    Bandwidth (non-temporal): 24.2242 GB/s 
    Bandwidth: 21.4136 GB/s 

Xeon(R) CPU E5-2650 0 @ 2.00GHz (cluster (exclusive job)): 
    305.176 MB allocated 
    Bandwidth (non-temporal): 8.33133 GB/s 
    Bandwidth: 8.20684 GB/s

真正困擾我的是，我看到更好的性能 - - 在Xeon CPU上（不在我的筆記本電腦上） - 如果我使用非對齊的加載和存儲（即storeu_ps/loadu_ps）：

305.176 MB allocated 
Bandwidth (non-temporal): 8.30105 GB/s 
Bandwidth: 12.7056 GB/s

由於非流版本會對 y 產生冗餘的加載（load），我原本預期流版本會比非流版本更快。但是，測量結果顯示，流版本實際上比非流版本慢兩倍。

你對此有任何解釋嗎？使用的編譯器：Intel 14.0.1；編譯器標誌：-O3 -restrict -xAVX；使用的 CPU：Intel Xeon E5-2650。

謝謝。

來源

2014-01-18 user1829358

沒有必要展開循環。循環展開只在存在依賴鏈時纔有用，而這裏並沒有依賴鏈；CPU 會替你處理好這一點。不過我有個問題：在你的帶寬計算中，係數 3 代表什麼？ –

兩次讀取 + 一次寫入。即使非時間性（non-temporal）版本只做一次讀取，我仍然保留係數 3，以便於比較。 – user1829358

正如 ScottD 指出的那樣，問題的答案在於生成的彙編代碼。顯然，英特爾編譯器足夠智能，可以檢測到訪問模式，即使對於時間性（temporal）版本也會自動生成非時間性（non-temporal）的存儲指令（即下方列表中的 vmovntps）。

下面是編譯器爲時間性版本生成的彙編代碼：

..___tag_value___Z13copy_temporalPfS_.35:      # 
     xor  edx, edx          #22.4 
     xor  eax, eax          # 
..B2.2:       # Preds ..B2.2 ..B2.1 
     vmovups xmm0, XMMWORD PTR [rax+rdi]     #23.34 
     inc  rdx           #22.4 
     vmovntps XMMWORD PTR [rax+rsi], xmm0     #23.20 
     vmovups xmm1, XMMWORD PTR [16+rax+rdi]    #24.36 
     vmovntps XMMWORD PTR [16+rax+rsi], xmm1    #24.20 
     vmovups xmm2, XMMWORD PTR [32+rax+rdi]    #23.34 
     vmovntps XMMWORD PTR [32+rax+rsi], xmm2    #23.20 
     vmovups xmm3, XMMWORD PTR [48+rax+rdi]    #24.36 
     vmovntps XMMWORD PTR [48+rax+rsi], xmm3    #24.20 
     add  rax, 64          #22.4 
     cmp  rdx, 5000000         #22.4 
     jb  ..B2.2  # Prob 99%      #22.4

這留下了以下尚未解答的問題：

爲什麼在 E5-2650 CPU 上，使用非對齊訪問的時間性版本比非時間性版本性能更好（請參閱上文）？我已經查看了生成的彙編代碼，編譯器確實生成了 vmovups 指令（因爲無法保證對齊）。

來源

2014-01-20 23:32:53 user1829358

如果 ICC 生成的代碼與你寫的不一致，那就令人失望了。我更希望它按照你所寫的 intrinsic 原樣實現。 –

流式（stream）存儲會直接向 DRAM 發起流水線化的突發寫入，速度應該大致與 DRAM 的速度相當。標準存儲會寫入緩存（但是，如果數據尚不在緩存中，則會先把它讀入緩存）。如果數據已經在緩存中，標準存儲就以緩存寫入的速度運行。一般來說，當寫入的數據量遠大於最後一級緩存時，使用流式方法要快得多；而小規模寫入使用標準存儲通常更快。請嘗試用幾 GB 大小的緩衝區運行測試，此時流式方法應該更快。

下面用一個基準測試來演示（結果來自 Intel Core i7-2600K）：

#define __USE_MINGW_ANSI_STDIO 1 
#include <stdlib.h> 
#include <intrin.h> 
#include <windows.h> 
#include <stdio.h> 
#include <stdint.h> 

//----------------------------------------------------------------------------- 
// 
// queryPerformanceCounter - similar to QueryPerformanceCounter, but returns 
//       count directly. 

uint64_t queryPerformanceCounter (void)
    {
    // Win32 reports the tick count through an out-parameter; unwrap it
    // so callers get the raw 64-bit value as the return value.
    LARGE_INTEGER ticks;

    QueryPerformanceCounter (&ticks);
    return ticks.QuadPart;
    }

//----------------------------------------------------------------------------- 
// 
// queryPerformanceFrequency - same as QueryPerformanceFrequency, but returns count directly. 

uint64_t queryPerformanceFrequency (void)
    {
    // Win32 reports the counter frequency through an out-parameter;
    // unwrap it so callers get the raw 64-bit value as the return value.
    LARGE_INTEGER ticksPerSecond;

    QueryPerformanceFrequency (&ticksPerSecond);
    return ticksPerSecond.QuadPart;
    }

//--------------------------------------------------------------------------- 

// testNontemporal - copy numberOfVectors 4-float vectors from x to y using
//       non-temporal (streaming) SSE stores.
//
// x and y are accessed with _mm_load_ps/_mm_stream_ps, which require
// 16-byte aligned addresses. Vectors are copied two at a time; an odd
// count is finished with a single extra vector so that no tail data is
// dropped. A store fence is issued before returning because streaming
// stores are weakly ordered.

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    uint64_t numberOfPairs = numberOfVectors/2;

    for(i = 0; i < numberOfPairs; ++i)
        {
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // The paired loop cannot reach the last vector of an odd count.
    if (numberOfVectors & 1)
        _mm_stream_ps(y,_mm_load_ps(x));

    // Order the streamed data before any later store and before the
    // caller reads its timer.
    _mm_sfence();
    }

//--------------------------------------------------------------------------- 

// testTemporal - copy numberOfVectors 4-float vectors from x to y using
//       regular (temporal) SSE stores.
//
// x and y are accessed with _mm_load_ps/_mm_store_ps, which require
// 16-byte aligned addresses. Vectors are copied two at a time; an odd
// count is finished with a single extra vector so that no tail data is
// dropped.

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;
    uint64_t numberOfPairs = numberOfVectors/2;

    for(i = 0; i < numberOfPairs; ++i)
        {
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // The paired loop cannot reach the last vector of an odd count.
    if (numberOfVectors & 1)
        _mm_store_ps(y,_mm_load_ps(x));
    }

//--------------------------------------------------------------------------- 

// runtests - time a single 400 MiB buffer-to-buffer copy and print the
//       achieved rate in GB/s.
//
// nonTemporal selects the copy routine: non-zero runs testNontemporal,
// zero runs testTemporal. Both buffers are 32-byte aligned and are fully
// written before the timed region starts. The reported figure counts
// numberOfBytes once (the size of one buffer). Exits with status 1 if
// either allocation fails.

static void runtests (int nonTemporal)
    {
    const uint64_t numberOfBytes = 400 * 0x100000ull;
    const uint64_t numberOfFloats = numberOfBytes/sizeof (float);
    const uint64_t numberOfVectors = numberOfFloats/4;
    float *x = _mm_malloc (numberOfBytes, 32);
    float *y = _mm_malloc (numberOfBytes, 32);
    uint64_t index, startCount, elapsed;
    double gibCopied, gbPerSecond;

    if (x == NULL || y == NULL)
        exit (1);

    // Fill the source with ordinary float values (the original author
    // notes this avoids a performance penalty) and clear the destination.
    for (index = 0; index < numberOfFloats; index++)
        {
        x [index] = (float) index;
        y [index] = 0;
        }

    // Timed region: exactly one call to the selected copy routine.
    startCount = queryPerformanceCounter();
    if (!nonTemporal)
        testTemporal (x, y, numberOfVectors);
    else
        testNontemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter() - startCount;

    // bytes / 2^30, scaled by ticks-per-second over elapsed ticks.
    gibCopied = (double) numberOfBytes/0x40000000;
    gbPerSecond = gibCopied * queryPerformanceFrequency()/elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);

    _mm_free (x);
    _mm_free (y);
    }

//--------------------------------------------------------------------------- 

int main (void)
    {
    // Banner printed before each run, indexed by the flag handed to
    // runtests (0 = temporal stores, 1 = non-temporal stores).
    static const char *const banner [2] =
        {
        "using temporal stores\n",
        "using non-temporal stores\n"
        };
    int nonTemporal;

    // Ask for the highest scheduling class to reduce timing noise.
    SetPriorityClass (GetCurrentProcess(), REALTIME_PRIORITY_CLASS);

    for (nonTemporal = 0; nonTemporal <= 1; nonTemporal++)
        {
        printf ("%s", banner [nonTemporal]);
        runtests (nonTemporal);
        }
    return 0;
    }

//---------------------------------------------------------------------------

輸出：

using temporal stores 
5.57 GB/s 
using non-temporal stores 
8.35 GB/s

來源

2014-01-18 17:49:32 ScottD

感謝您的回覆。我已經在使用大小爲 400MB 的緩衝區（比我系統中的任何緩存都大得多）。此外，爲了讀取一些硬件計數器，我已經對代碼做了插樁（instrumentation），結果是確鑿的（即，使用 stream_ps 會導致 L2 寫未命中）。但是，我仍然無法解釋這兩個版本之間的巨大性能差異。 – user1829358

我將添加一個示例基準測試，用來展示大緩衝區下非時間性（流式）存儲的優勢。它寫得比較倉促、粗糙（quick and dirty），但我認爲是正確的。其中使用了不可移植的（Windows）計時函數。 – ScottD

我已經更新了原始文章，但是我無法重現您的結果（即使我將您的代碼移植到Linux中，結果也保持不變）。你知道爲什麼會出現這種情況嗎？此外，你有解釋爲什麼不對齊版本更快？這實際上可能指向真正的問題，因爲流確實需要對齊。 – user1829358

據我所知，非時間性（non-temporal）存儲會把對應的目標緩存行從所有緩存中逐出。如果該緩存行在被自然淘汰之前又被再次訪問，那麼這樣做反而會損害性能。

來源

2014-01-18 17:49:41 gsg

但是這不是這個代碼片段的情況，是嗎？ – user1829358

我想說這取決於相關緩衝區的大小，但是您對 ScottD 的評論表明它們非常大。此時我不確定發生了什麼事。你可以嘗試各種做法，比如註釋掉 '#pragma'、用 '-xAVX' 編譯等，觀察常規存儲與非時間性存儲之間性能比例的變化。 – gsg

流內在降低性能

回答

相關問題