2012-02-13 37 views
1

我試圖利用英特爾編譯器的自動矢量化以及 SSE 來加速一些代碼。所有計算都是把某個 struct node_t 轉換爲另一個 struct w_t(函數 tr() 和 gen_tr())。當我嘗試對函數 gen_tr() 進行矢量化時,沒有產生任何效果。(問題標題:使用自動矢量化和 SSE 得到的加速效果取決於數據大小)

如果更改數據存儲格式,當每個struct組件存儲在不同的float數組中時,則自動向量化運行良好,請參閱函數genv_tr()。

使用 SSE 的函數叫做 ssev_tr(N 應該能被 4 整除)。

transform.c:

#include <stdio.h> 
#include <stdlib.h> 
#include <malloc.h> 
#include <xmmintrin.h> 

/*
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the low 32 bits of the counter in EAX and the high 32 bits
 * in EDX; the two halves are combined into one value here.
 *
 * Returns the counter value. Where unsigned long is narrower than 64 bits,
 * only the low-order bits survive the final conversion.
 */
static __inline__ unsigned long getCC(void)
{
    unsigned lo, hi;

    /* __asm__/__volatile__ rather than plain asm, so the file also builds
       in strict ISO modes (e.g. -std=c11) where asm is not a keyword. */
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    /* Combine in 64 bits: shifting a 32-bit unsigned long by 32 would be
       undefined behaviour. */
    return (unsigned long)((((unsigned long long)hi) << 32) | lo);
}

/* Input record of the transform: five single-precision components.
   tr() reads x1..x4; x5 is not read by it. */
typedef struct {
    float x1;
    float x2;
    float x3;
    float x4;
    float x5;
} node_t;

/* Output record of the transform: four single-precision results. */
typedef struct {
    float w1;
    float w2;
    float w3;
    float w4;
} w_t;

/*
 * Transform a single input record into a single output record:
 *
 *   w1 = x1 - x3*c1        w2 = x1 + x3*c1
 *   w3 = x2 - x4*c2        w4 = x2 + x4*c2
 *
 * n   input record (x5 is not used)
 * c1  factor applied to x3
 * c2  factor applied to x4
 * w   output record; all four fields are overwritten
 */
void tr(node_t *n, float c1, float c2, w_t *w)
{
    /* All inputs are read before any output is written. */
    const float base_a = n->x1;
    const float base_b = n->x2;
    const float delta_a = n->x3 * c1;
    const float delta_b = n->x4 * c2;

    w->w1 = base_a - delta_a;
    w->w2 = base_a + delta_a;
    w->w3 = base_b - delta_b;
    w->w4 = base_b + delta_b;
}

/*
 * Array-of-structs form of the transform: applies tr() to each of the N
 * (n[i], w[i]) pairs in index order.
 *
 * n       input records, at least N elements
 * w       output records, at least N elements
 * N       number of records to process (nothing is done when N <= 0)
 * c1, c2  factors forwarded unchanged to tr()
 *
 * The function is marked noinline, presumably so that each benchmarked
 * variant is measured as a real call.
 */
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
    /*
     * Deliberately no "#pragma vector aligned" here: node_t holds five floats
     * (20 bytes with 4-byte float), so consecutive elements of n cannot all
     * sit on 16-byte boundaries and the alignment promise would be false.
     */
    #pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}

/*
 * Structure-of-arrays form of the transform. For every k in [0, N):
 *
 *   w1[k] = x1[k] - x3[k]*c1        w2[k] = x1[k] + x3[k]*c1
 *   w3[k] = x2[k] - x4[k]*c2        w4[k] = x2[k] + x4[k]*c2
 *
 * x1..x4  input component arrays, at least N floats each
 * x5      accepted for signature symmetry but never accessed
 * w1..w4  output component arrays, at least N floats each
 * N       element count (nothing is done when N <= 0)
 *
 * The pragmas are hints for the Intel compiler: "vector aligned" asserts
 * that the arrays are suitably aligned for aligned vector accesses and
 * "ivdep" asserts that there are no loop-carried dependencies, so callers
 * should pass aligned, non-overlapping arrays.
 */
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    int k;
    #pragma vector aligned
    #pragma ivdep
    for (k = 0; k < N; ++k) {
        const float scaled3 = x3[k] * c1;
        const float scaled4 = x4[k] * c2;

        w1[k] = x1[k] - scaled3;
        w2[k] = x1[k] + scaled3;
        w3[k] = x2[k] - scaled4;
        w4[k] = x2[k] + scaled4;
    }
}

/*
 * Hand-written SSE form of the structure-of-arrays transform. For every
 * i in [0, N):
 *
 *   w1[i] = x1[i] - x3[i]*c1        w2[i] = x1[i] + x3[i]*c1
 *   w3[i] = x2[i] - x4[i]*c2        w4[i] = x2[i] + x4[i]*c2
 *
 * x1..x4  input component arrays, at least N floats each
 * x5      accepted for signature symmetry but never accessed
 * w1..w4  output component arrays, at least N floats each
 * N       element count (nothing is done when N <= 0)
 *
 * The arrays are accessed through __m128 pointers, i.e. four floats at a
 * time with aligned loads/stores, so every array must start on a 16-byte
 * boundary. Elements beyond the last full group of four are handled by a
 * scalar tail loop, so N does not have to be a multiple of 4.
 */
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;

    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;

    /* Broadcast each scalar factor into all four lanes. */
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);

    /* Number of complete 4-float groups. */
    const int groups = N / 4;

    int i;
    #pragma vector aligned
    #pragma ivdep
    for (i = 0; i < groups; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);

        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }

    /* Scalar tail: the 0..3 elements left over when N is not a multiple
       of 4 (previously these were silently skipped). */
    for (i = groups * 4; i < N; i++) {
        const float n00t = x3[i] * c1;
        const float n01t = x4[i] * c2;

        w1[i] = x1[i] - n00t;
        w2[i] = x1[i] + n00t;
        w3[i] = x2[i] - n01t;
        w4[i] = x2[i] + n01t;
    }
}

/*
 * test(func): time an array-of-structs kernel.
 *
 * Fills x[0..n-1] with fixed values, calls func(x, w, n, c1, c2) rep times
 * and prints the elapsed getCC() ticks per element per repetition, preceded
 * by a tab.
 *
 * Uses these names from the invoking scope: i, n, rep, x, w, c1, c2, t1, t2.
 * Wrapped in do { } while (0) so that "test(f);" is exactly one statement.
 */
#define test(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x[i].x1 = 1.0; \
            x[i].x2 = 2.0; \
            x[i].x3 = 2.0; \
            x[i].x4 = 2.0; \
            x[i].x5 = 2.0; \
        } \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x, w, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1))/n/rep); \
    } while (0)

/*
 * test1(func): time a structure-of-arrays kernel.
 *
 * Fills x1..x5[0..n-1] with fixed values, calls
 * func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2) rep times and prints
 * the elapsed getCC() ticks per element per repetition, preceded by a tab.
 *
 * Uses these names from the invoking scope: i, n, rep, x1..x5, w1..w4,
 * c1, c2, t1, t2.
 * Wrapped in do { } while (0) so that "test1(f);" is exactly one statement.
 */
#define test1(func) \
    do { \
        for (i = 0; i < n; i++) { \
            x1[i] = 1.0; \
            x2[i] = 2.0; \
            x3[i] = 2.0; \
            x4[i] = 2.0; \
            x5[i] = 2.0; \
        } \
        t1 = getCC(); \
        for (i = 0; i < rep; i++) { \
            func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
        } \
        t2 = getCC(); \
        printf("\t%f", ((double)(t2 - t1))/n/rep); \
    } while (0)

/*
 * Benchmark driver.
 *
 * Usage: <program> vector_size
 *
 * Prints vector_size followed by three tab-separated timings (gen_tr,
 * genv_tr, ssev_tr) in getCC() ticks per element, then a newline.
 * The repetition count is chosen so that n * rep is about 100000000.
 *
 * Returns 0 on success, 1 on a missing/invalid argument or when memory
 * cannot be allocated.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        /* Without an argument argv[1] is NULL, so do not fall through. */
        return 1;
    }

    /* Accept only a whole positive number that still yields rep >= 1;
       this also rules out the division by zero for n == 0. strtol clamps
       out-of-range input to LONG_MIN/LONG_MAX, which the range check
       rejects as well. */
    char *end;
    const long requested = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || requested <= 0 || requested > 100000000L) {
        fprintf(stderr, "vector_size must be an integer in [1, 100000000]\n");
        return 1;
    }

    int n = (int)requested;
    int rep = 100000000/n;
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;
    int status = 1;

    /* NOTE(review): genv_tr/ssev_tr assume 16-byte aligned arrays. This
       relies on malloc returning such blocks, which is believed to hold
       for glibc on x86_64 -- confirm before porting elsewhere. */
    node_t *x = malloc(n * sizeof *x);
    w_t *w = malloc(n * sizeof *w);

    float *x1 = malloc(n * sizeof *x1);
    float *x2 = malloc(n * sizeof *x2);
    float *x3 = malloc(n * sizeof *x3);
    float *x4 = malloc(n * sizeof *x4);
    float *x5 = malloc(n * sizeof *x5);

    float *w1 = malloc(n * sizeof *w1);
    float *w2 = malloc(n * sizeof *w2);
    float *w3 = malloc(n * sizeof *w3);
    float *w4 = malloc(n * sizeof *w4);

    if (!x || !w || !x1 || !x2 || !x3 || !x4 || !x5 || !w1 || !w2 || !w3 || !w4) {
        fprintf(stderr, "out of memory\n");
    } else {
        printf("%d", n);

        test(gen_tr);
        test1(genv_tr);
        test1(ssev_tr);

        printf("\n");
        status = 0;
    }

    /* free(NULL) is a no-op, so this is safe after a partial failure. */
    free(w4);
    free(w3);
    free(w2);
    free(w1);
    free(x5);
    free(x4);
    free(x3);
    free(x2);
    free(x1);
    free(w);
    free(x);

    return status;
}

編譯選項:icc -O3 -Wall -W -vec-report6 transform.c -o transform

icc 的版本 - 12.1.2,OS - Fedora 16 x86_64,CPU - Intel Core2 Quad CPU Q8200。

然後我用不同的大小運行它,大小從 16 到 3000,步長爲 64,腳本如下:

#!/bin/bash
# Benchmark driver: runs ./transform for vector sizes 16, 80, 144, ...
# (step 64, all sizes below 3000), echoing each result line to the terminal
# and appending it to the log file.

log=run.log

# Start a fresh log; echo "" leaves a single empty line at the top.
echo "" > "$log"

size=16
while (( size < 3000 )); do
    ./transform "$size" | tee -a "$log"
    (( size += 64 ))
done

下面是這個腳本運行得到的部分結果(各列依次爲:大小、gen_tr、genv_tr、ssev_tr),所有時間都是按每個數組元素給出的:

16  7.710743  3.168577  3.253829 
272  7.166493  1.983918  2.618569 
528  7.121866  1.920195  2.567109 
784  7.115007  1.899451  2.549645 
1040 8.104026  2.481062  2.944317 
1296 8.137537  5.105032  5.104614 
1552 8.118534  5.068812  5.064211 
1808 8.138309  5.077831  5.085015 
2064 8.149699  5.107503  5.069958 
2320 8.164556  5.080981  5.099313 
2576 8.151524  5.086056  5.089294 
2832 8.212946  5.061927  5.072261 

爲什麼在使用矢量化版本的函數時,性能對數據大小的依賴如此顯著?這是因爲緩存未命中嗎?是否可以在整個數據範圍內保持相同的速度?

回答

1

你有8個浮點數組。當它們的大小爲1000時,您的測試正在操縱大約32kB的數據。即使您的L1緩存可能有點大(64kB),由於關聯性,L1緩存可能無法同時保存所有32kB數據。

您的測試迭代,一遍又一遍地處理相同的數據。考慮兩種情況:

  • 大小 = 528:8 個數組可以輕鬆放入 L1 緩存。每次測試迭代(第一次除外)都可以快速訪問數據。
  • 大小 = 1268:8 個數組不能同時放入 L1 緩存。每次測試迭代都會把數據從 L1 中逐出,因此所有讀取和寫入操作實際上都落到 L2。

所以輸入大小 1000 附近的性能跳變在一定程度上是您的測試方法造成的假象,但並不完全是。在現實世界中,如果所需的數據已經全部在 L1 緩存中,genv_tr 將會非常快。但是對於大於 1000 的輸入,所有數據無法全部放入 L1 緩存,所以一部分訪問必然會落到 L2。