I am trying to speed up some code using the Intel compiler's auto-vectorization and SSE. All of the work consists of transforming one struct, node_t, into another struct, w_t (functions tr() and gen_tr()). When I try to vectorize the function gen_tr(), it has no effect.
If I change the data storage format, so that each struct component is stored in its own float array, auto-vectorization works well; see the function genv_tr(). The function that uses SSE explicitly is ssev_tr() (N must be divisible by 4). In short, the speed-up from auto-vectorization and SSE seems to depend on the data size.
transform.c:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
/* Read the CPU time-stamp counter (cycle count). */
static __inline__ unsigned long getCC(void)
{
    unsigned a, d;
    asm volatile("rdtsc" : "=a" (a), "=d" (d));
    return ((unsigned long)a) | (((unsigned long)d) << 32);
}
typedef struct {
    float x1, x2, x3, x4, x5;
} node_t;

typedef struct {
    float w1, w2, w3, w4;
} w_t;
void tr(node_t *n, float c1, float c2, w_t *w)
{
    const float nv   = n->x1;
    const float N00T = n->x3 * c1;
    const float n1v  = n->x2;
    const float N01T = n->x4 * c2;

    w->w1 = nv - N00T;
    w->w2 = nv + N00T;
    w->w3 = n1v - N01T;
    w->w4 = n1v + N01T;
}
__attribute__ ((noinline))
void gen_tr(node_t *n, w_t *w, const int N, float c1, float c2)
{
    int i;
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N; i++) {
        tr(n + i, c1, c2, w + i);
    }
}
__attribute__ ((noinline))
void genv_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    int i;
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N; i++) {
        const float N00T = x3[i] * c1;
        const float N01T = x4[i] * c2;
        w1[i] = x1[i] - N00T;
        w2[i] = x1[i] + N00T;
        w3[i] = x2[i] - N01T;
        w4[i] = x2[i] + N01T;
    }
}
__attribute__ ((noinline))
void ssev_tr(float *x1, float *x2, float *x3, float *x4, float *x5, float *w1, float *w2, float *w3, float *w4, const int N, float c1, float c2)
{
    __m128 *ws1 = (__m128*)w1;
    __m128 *ws2 = (__m128*)w2;
    __m128 *ws3 = (__m128*)w3;
    __m128 *ws4 = (__m128*)w4;
    __m128 *xs1 = (__m128*)x1;
    __m128 *xs2 = (__m128*)x2;
    __m128 *xs3 = (__m128*)x3;
    __m128 *xs4 = (__m128*)x4;
    const __m128 cs1 = _mm_set1_ps(c1);
    const __m128 cs2 = _mm_set1_ps(c2);
    int i;
#pragma vector aligned
#pragma ivdep
    for (i = 0; i < N/4; i++) {
        const __m128 N00T = _mm_mul_ps(xs3[i], cs1);
        const __m128 N01T = _mm_mul_ps(xs4[i], cs2);
        ws1[i] = _mm_sub_ps(xs1[i], N00T);
        ws2[i] = _mm_add_ps(xs1[i], N00T);
        ws3[i] = _mm_sub_ps(xs2[i], N01T);
        ws4[i] = _mm_add_ps(xs2[i], N01T);
    }
}
#define test(func) \
    for (i = 0; i < n; i++) { \
        x[i].x1 = 1.0; \
        x[i].x2 = 2.0; \
        x[i].x3 = 2.0; \
        x[i].x4 = 2.0; \
        x[i].x5 = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x, w, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1))/n/rep);
#define test1(func) \
    for (i = 0; i < n; i++) { \
        x1[i] = 1.0; \
        x2[i] = 2.0; \
        x3[i] = 2.0; \
        x4[i] = 2.0; \
        x5[i] = 2.0; \
    } \
    \
    t1 = getCC(); \
    for (i = 0; i < rep; i++) { \
        func(x1, x2, x3, x4, x5, w1, w2, w3, w4, n, c1, c2); \
    } \
    t2 = getCC(); \
    printf("\t%f", ((double)(t2 - t1))/n/rep);
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage %s vector_size\n", argv[0]);
        return 1;
    }
    int n = atoi(argv[1]);
    printf("%d", n);
    int rep = 100000000/n;
    int i;
    float c1 = 2.0, c2 = 1.0;
    unsigned long t1, t2;
    node_t *x = (node_t*)malloc(n * sizeof(node_t));
    w_t *w = (w_t*)malloc(n * sizeof(w_t));
    float *x1 = (float*)malloc(n * sizeof(float));
    float *x2 = (float*)malloc(n * sizeof(float));
    float *x3 = (float*)malloc(n * sizeof(float));
    float *x4 = (float*)malloc(n * sizeof(float));
    float *x5 = (float*)malloc(n * sizeof(float));
    float *w1 = (float*)malloc(n * sizeof(float));
    float *w2 = (float*)malloc(n * sizeof(float));
    float *w3 = (float*)malloc(n * sizeof(float));
    float *w4 = (float*)malloc(n * sizeof(float));
    test(gen_tr);
    test1(genv_tr);
    test1(ssev_tr);
    printf("\n");
    return 0;
}
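One remark about my own code: #pragma vector aligned and the __m128 casts in ssev_tr() assume the buffers are 16-byte aligned. I rely on glibc malloc returning 16-byte-aligned blocks on x86_64; a minimal sketch of how the allocation could be made explicitly aligned instead (a hypothetical change, not used in the timings below) would be:

    /* _mm_malloc/_mm_free come with <xmmintrin.h> and guarantee the requested alignment. */
    float *x1 = (float*)_mm_malloc(n * sizeof(float), 16);
    /* ... same for the other buffers ... */
    _mm_free(x1);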
Compile options: icc -O3 -Wall -W -vec-report6 transform.c -o transform
icc version: 12.1.2, OS: Fedora 16 x86_64, CPU: Intel Core2 Quad Q8200.
Then I run it with different sizes, from 16 to 3000 in steps of 64, using this script:
#!/bin/bash
echo "" > run.log
for ((c=16;c<3000;c+=64))
do
./transform $c | tee -a run.log
done
Here are some results from that script (columns: size, gen_tr, genv_tr, ssev_tr); all times are shown per array element:
16 7.710743 3.168577 3.253829
272 7.166493 1.983918 2.618569
528 7.121866 1.920195 2.567109
784 7.115007 1.899451 2.549645
1040 8.104026 2.481062 2.944317
1296 8.137537 5.105032 5.104614
1552 8.118534 5.068812 5.064211
1808 8.138309 5.077831 5.085015
2064 8.149699 5.107503 5.069958
2320 8.164556 5.080981 5.099313
2576 8.151524 5.086056 5.089294
2832 8.212946 5.061927 5.072261
Why does the data size matter so much for the vectorized versions of the function? Is it because of cache misses? Is it possible to keep the same speed over the whole range of data sizes?
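My own back-of-the-envelope estimate (a rough sketch, assuming the Q8200 has a 32 KB L1 data cache per core): the timed loop in genv_tr()/ssev_tr() streams through 8 arrays of n floats, i.e. 8 * 4 * n = 32n bytes per call. That is about 24.5 KB at n = 784, 32.5 KB at n = 1040 and 40.5 KB at n = 1296, which is just where the per-element time climbs from about 1.9 to about 5.1 cycles. For gen_tr() each element occupies sizeof(node_t) + sizeof(w_t) = 20 + 16 = 36 bytes, so its working set crosses 32 KB around n ≈ 900. Is this the right way to reason about it, or is something else going on?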