3
#define Size 50000
void main()
{
unsigned char *arry1 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned char *arry2 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned int *result = (unsigned int*)malloc(sizeof(unsigned int)* Size);
for (int i = 0; i < 16; i++)
{
arry1[i] = i;
arry2[i] = i;
}
__m128i Z = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
//__m128i dummy = _mm_setzero_si128();
for (int j = 0; j < 16; j += 16)
{
//printf("%d\n\n", j);
__m128i test1 = _mm_setzero_si128();
test1 = _mm_loadu_si128((__m128i*)&arry1[j]);
__m128i test2 = _mm_setzero_si128();
test2 = _mm_loadu_si128((__m128i*)&arry2[j]);
__m128i s16L = _mm_unpacklo_epi8(test1, Z);
__m128i s16H = _mm_unpackhi_epi8(test1, Z);
__m128i s32LL = _mm_unpacklo_epi16(s16L, Z);
__m128i s32LH = _mm_unpackhi_epi16(s16L, Z);
__m128i s32HL = _mm_unpacklo_epi16(s16H, Z);
__m128i s32HH = _mm_unpackhi_epi16(s16H, Z);
__m128i t16L = _mm_unpacklo_epi8(test2, Z);
__m128i t16H = _mm_unpackhi_epi8(test2, Z);
__m128i t32LL = _mm_unpacklo_epi16(t16L, Z);
__m128i t32LH = _mm_unpackhi_epi16(t16L, Z);
__m128i t32HL = _mm_unpacklo_epi16(t16H, Z);
__m128i t32HH = _mm_unpackhi_epi16(t16H, Z);
__m128 s1 = _mm_cvtepi32_ps(s32LL);
__m128 s2 = _mm_cvtepi32_ps(s32LH);
__m128 s3 = _mm_cvtepi32_ps(s32HL);
__m128 s4 = _mm_cvtepi32_ps(s32HH);
__m128 t1 = _mm_cvtepi32_ps(t32LL);
__m128 t2 = _mm_cvtepi32_ps(t32LH);
__m128 t3 = _mm_cvtepi32_ps(t32HL);
__m128 t4 = _mm_cvtepi32_ps(t32HH);
s1 = _mm_mul_ps(s1, t1);
s2 = _mm_mul_ps(s2, t2);
s3 = _mm_mul_ps(s3, t3);
s4 = _mm_mul_ps(s4, t4);
s1 = _mm_hadd_ps(s1, s2);//41,13
s3 = _mm_hadd_ps(s3, s4); //313,221
vsum = _mm_cvtps_epi32(s3);
for (int k = 0; k < 16; k++)
{
printf("%u\n", (unsigned char)vsum.m128i_i8[k]);
}
s1 = _mm_hadd_ps(s1, s3); //734, 14
s1 = _mm_hadd_ps(s1, s1); //1100,140
s1 = _mm_hadd_ps(s1, s1); //1240
}
}
我使用sse進行點生成。我正在使用_mm_mul_ps
和_mm_hadd_ps
指令不是_mm_dp_ps
。 如果_mm_hadd_ps
函數超過255後的值將顯示錯誤的值。例如s3
的正確值是{0,0,0,421,0,0,0,313,0,0,0,221,0,0,0,145}
。 但是打印{0,0,1,165,0,0,1,57,0,0,0,221,0,0,0,145}
。這是我宣佈的結果arry1
,arry2
爲unsigned char
?我知道255是8位的最大值。使用sse的點生成
's3'爲4×32位整數,但正在顯示它作爲16×8位無符號字符。 –