我寫了一個程序,用來比較AVX2的vpsadbw指令和SSE2的psadbw指令。
以下代碼是SSE2版本的程序。
SAD指令:這種奇怪行爲的原因是什麼?
/* Matrix dimensions; all three are equal, so every matrix is MAX1 x MAX1 bytes. */
#define MAX1 4096
#define MAX2 MAX1
#define MAX3 MAX1
/* Upper bound on the number of benchmark repetitions. */
#define NUM_LOOP 1000000000
/*
 * pTime: total measured time accumulated over all repetitions, in seconds.
 * mTime: time budget in seconds; repetitions stop once pTime reaches it.
 */
double pTime = 0, mTime = 5;
// global data for sequential matrix operations
/* 16-byte aligned so rows can be read with aligned 128-bit loads. */
unsigned char a_char[MAX1][MAX2] __attribute__((aligned(16)));
unsigned char b_char[MAX2][MAX3] __attribute__((aligned(16)));
unsigned char c_char[MAX1][MAX3] __attribute__((aligned(16)));
/*
 * 16-byte scratch buffer. It is written with an aligned 128-bit store, which
 * requires 16-byte alignment, so the alignment is requested explicitly here
 * instead of relying on where the compiler happens to place the array.
 */
unsigned short int temp[8] __attribute__((aligned(16)));
/*
 * Benchmark of the SSE2 sum-of-absolute-differences intrinsic (_mm_sad_epu8)
 * over the global byte matrices a_char and b_char.
 *
 * One pass walks both matrices 16 bytes at a time and accumulates the SAD.
 * Passes are repeated until NUM_LOOP repetitions have run or the accumulated
 * time pTime reaches mTime seconds. The fastest pass, the repetition count,
 * MAX1 and the SAD of the last pass are printed.
 *
 * Returns 0.
 */
int main()
{
    int i, j, w = 0;
    unsigned long long sad = 0;
    /* Aligned landing area for the two 64-bit lanes of the accumulator. */
    unsigned long long lanes[2] __attribute__((aligned(16)));
    struct timespec tStart, tEnd;
    double tTotal, tBest = 10000;
    __m128i vec1, vec2, vecT, sad_total;

    do {
        /*
         * Restart the accumulator for every pass so the reported SAD is the
         * SAD of the matrices, not that value multiplied by however many
         * repetitions fitted into the time budget.
         */
        sad_total = _mm_setzero_si128();

        clock_gettime(CLOCK_MONOTONIC, &tStart);
        for (i = 0; i < MAX1; i++) {
            for (j = 0; j < MAX2; j += 16) {
                vec1 = _mm_load_si128((__m128i *)&a_char[i][j]);
                vec2 = _mm_load_si128((__m128i *)&b_char[i][j]);
                vecT = _mm_sad_epu8(vec1, vec2);
                sad_total = _mm_add_epi64(vecT, sad_total);
            }
        }
        /*
         * _mm_sad_epu8 produces one partial sum per 64-bit lane and
         * _mm_add_epi64 accumulates them as 64-bit integers, so the
         * horizontal reduction is the sum of the two 64-bit lanes. Adding
         * 16-bit slices of the register would drop the carried-out bits.
         */
        _mm_store_si128((__m128i *)lanes, sad_total);
        sad = lanes[0] + lanes[1];
        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        /* Elapsed wall time of this pass in seconds. */
        tTotal = (tEnd.tv_sec - tStart.tv_sec);
        tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
        if (tTotal < tBest) {
            tBest = tTotal;
        }
        pTime += tTotal;
    } while (w++ < NUM_LOOP && pTime < mTime);

    /* sad is unsigned long long: a full pass can exceed INT_MAX. */
    printf(" The best time: %lf sec in %d repetition for %dX result is %llu matrix\n", tBest, w, MAX1, sad);
    return 0;
}
我使用gcc、Skylake處理器和Linux Mint。
當我生成彙編代碼時,SSE2版本的內層循環包含一些不必要的移動(move)操作,如下:
# Inner loop of the 128-bit (xmm) version: 16 bytes per iteration.
# The running sum is not held in a register across iterations: it is read
# from [rsp] by vpaddq and written back to [rsp] by vmovaps every time.
.L26:
vmovdqa xmm1, XMMWORD PTR a_char[rcx+rax]        # load 16 bytes of a_char
vpsadbw xmm1, xmm1, XMMWORD PTR b_char[rcx+rax]  # SAD against 16 bytes of b_char
add rax, 16                                      # advance byte offset
vpaddq xmm3, xmm1, XMMWORD PTR [rsp]             # new sum = SAD + sum reloaded from the stack
cmp rax, 4096                                    # 4096 bytes processed?
vmovaps XMMWORD PTR [rsp], xmm3                  # write the sum back to the stack
jne .L26
而AVX2版本生成的彙編代碼如下:
# Inner loop of the 256-bit (ymm) version: 32 bytes per iteration.
# The running sum stays in ymm2; the loop has no load or store for it.
.L26:
vmovdqa ymm1, YMMWORD PTR a_char[rcx+rax]        # load 32 bytes of a_char
vpsadbw ymm1, ymm1, YMMWORD PTR b_char[rcx+rax]  # SAD against 32 bytes of b_char
add rax, 32                                      # advance byte offset
vpaddq ymm2, ymm2, ymm1                          # accumulate in register
cmp rax, 4096                                    # 4096 bytes processed?
jne .L26
我不知道爲什麼會出現那2條移動指令(move instruction),它們嚴重損害了性能。
好的,我不知道它不可訪問。 – Martin