2017-09-03 84 views
8

如何用 AVX2 高效地將 __m256i 的前 N 個比特或最後 N 個比特設置爲 1,其餘設置爲 0?

  1. 前 N 個比特
  2. 最後 N 個比特

將 __m256i 中的上述比特設置爲 1,其餘設置爲 0。

這是兩個獨立的操作,分別用於處理位範圍的尾部和頭部,因爲範圍可能在某個 __m256i 值的中間開始或結束。範圍中佔滿整個 __m256i 值的部分則用全 0 或全 1 掩碼處理。

回答

9

AVX2 移位指令 vpsllvd 和 vpsrlvd 有一個很好的特性:當移位計數大於或等於 32 時,ymm 寄存器中對應的 32 位整數結果爲零。換句話說,與 x86 標量移位指令不同,這裏的移位計數不會被屏蔽(取模)。

因此該代碼是相當簡單:

/* 
gcc -O3 -m64 -Wall -mavx2 -march=broadwell avx2_bit_mask.c 
*/ 
#include <immintrin.h> 
#include <stdio.h> 

/*
 * Build a 256-bit mask whose n most significant bits are 1 and whose
 * remaining bits are 0.
 *
 * n = 0 gives an all-zero mask; 256 <= n <= 65535 gives an all-ones mask.
 * Values of n above 65535 are outside the supported range, because the
 * subtraction below saturates on 16-bit halves rather than on 32-bit lanes.
 */
__m256i bit_mask_avx2_msb(unsigned int n)
{
    /* Lane 0 (lowest) .. lane 7 (highest): number of bits from the top of
       the register down to the bottom of that lane. */
    const __m256i lane_depth = _mm256_setr_epi32(256, 224, 192, 160, 128, 96, 64, 32);
    const __m256i requested  = _mm256_set1_epi32((int)n);

    /* Zero bits to leave at the bottom of each lane: max(lane_depth - n, 0).
       Both operands fit in the low 16 bits of each lane for the supported
       range, so the unsigned 16-bit saturating subtract is sufficient. */
    const __m256i zero_bits = _mm256_subs_epu16(lane_depth, requested);

    /* vpsllvd does not mask the count: a count >= 32 clears the whole lane. */
    return _mm256_sllv_epi32(_mm256_set1_epi32(-1), zero_bits);
}


/*
 * Build a 256-bit mask whose n least significant bits are 1 and whose
 * remaining bits are 0.
 *
 * n = 0 gives an all-zero mask; 256 <= n <= 65535 gives an all-ones mask.
 * Values of n above 65535 are outside the supported range, because the
 * subtraction below saturates on 16-bit halves rather than on 32-bit lanes.
 */
__m256i bit_mask_avx2_lsb(unsigned int n)
{
    /* Lane 0 (lowest) .. lane 7 (highest): number of bits from the bottom
       of the register up to the top of that lane. */
    const __m256i lane_top  = _mm256_setr_epi32(32, 64, 96, 128, 160, 192, 224, 256);
    const __m256i requested = _mm256_set1_epi32((int)n);

    /* Zero bits to leave at the top of each lane: max(lane_top - n, 0).
       Both operands fit in the low 16 bits of each lane for the supported
       range, so the unsigned 16-bit saturating subtract is sufficient. */
    const __m256i zero_bits = _mm256_subs_epu16(lane_top, requested);

    /* vpsrlvd does not mask the count: a count >= 32 clears the whole lane. */
    return _mm256_srlv_epi32(_mm256_set1_epi32(-1), zero_bits);
}


/*
 * Print a 256-bit vector to stdout as four 64-bit hexadecimal words,
 * most significant word first, followed by a newline.
 *
 * Always returns 0.
 */
int print_avx2_hex(__m256i ymm)
{
    /* unsigned long long is at least 64 bits on every ABI, so the buffer
       always holds the full 32 bytes. `unsigned long` is only 32 bits on
       LLP64 targets and would make the store below overflow the array. */
    unsigned long long x[4];

    _mm256_storeu_si256((__m256i *)x, ymm);
    printf("%016llX %016llX %016llX %016llX\n", x[3], x[2], x[1], x[0]);

    return 0;
}


/*
 * Demo driver: prints the LSB mask and then the MSB mask for every
 * n in 0..258, one line per value.
 */
int main(void)
{
    unsigned int i;

    /* Run slightly past 256 so the output also shows the all-ones results
       for n > 256. */
    for (i = 0; i < 259; i++) {
        /* %u matches the unsigned loop counter. */
        printf("bit_mask_avx2_lsb(%3u) ", i);
        print_avx2_hex(bit_mask_avx2_lsb(i));
    }
    printf("\n");

    for (i = 0; i < 259; i++) {
        printf("bit_mask_avx2_msb(%3u) ", i);
        print_avx2_hex(bit_mask_avx2_msb(i));
    }
    printf("\n");

    return 0;
}

輸出結果是:

$ ./a.out 
bit_mask_avx2_lsb( 0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_lsb( 1) 0000000000000000 0000000000000000 0000000000000000 0000000000000001 
bit_mask_avx2_lsb( 2) 0000000000000000 0000000000000000 0000000000000000 0000000000000003 
bit_mask_avx2_lsb( 3) 0000000000000000 0000000000000000 0000000000000000 0000000000000007 
bit_mask_avx2_lsb( 4) 0000000000000000 0000000000000000 0000000000000000 000000000000000F 
bit_mask_avx2_lsb( 5) 0000000000000000 0000000000000000 0000000000000000 000000000000001F 
bit_mask_avx2_lsb( 6) 0000000000000000 0000000000000000 0000000000000000 000000000000003F 
bit_mask_avx2_lsb( 7) 0000000000000000 0000000000000000 0000000000000000 000000000000007F 
bit_mask_avx2_lsb( 8) 0000000000000000 0000000000000000 0000000000000000 00000000000000FF 
bit_mask_avx2_lsb( 9) 0000000000000000 0000000000000000 0000000000000000 00000000000001FF 
bit_mask_avx2_lsb(10) 0000000000000000 0000000000000000 0000000000000000 00000000000003FF 
bit_mask_avx2_lsb(11) 0000000000000000 0000000000000000 0000000000000000 00000000000007FF 
... 
bit_mask_avx2_lsb(124) 0000000000000000 0000000000000000 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(125) 0000000000000000 0000000000000000 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(126) 0000000000000000 0000000000000000 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(127) 0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(128) 0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(129) 0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(130) 0000000000000000 0000000000000003 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(131) 0000000000000000 0000000000000007 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(132) 0000000000000000 000000000000000F FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
... 
bit_mask_avx2_lsb(248) 00FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(249) 01FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(250) 03FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(251) 07FFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(252) 0FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(253) 1FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(254) 3FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(255) 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_lsb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 



bit_mask_avx2_msb( 0) 0000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 1) 8000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 2) C000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 3) E000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 4) F000000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 5) F800000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 6) FC00000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 7) FE00000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 8) FF00000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb( 9) FF80000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb(10) FFC0000000000000 0000000000000000 0000000000000000 0000000000000000 
bit_mask_avx2_msb(11) FFE0000000000000 0000000000000000 0000000000000000 0000000000000000 
... 
bit_mask_avx2_msb(124) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0 0000000000000000 0000000000000000 
bit_mask_avx2_msb(125) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8 0000000000000000 0000000000000000 
bit_mask_avx2_msb(126) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC 0000000000000000 0000000000000000 
bit_mask_avx2_msb(127) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE 0000000000000000 0000000000000000 
bit_mask_avx2_msb(128) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 0000000000000000 0000000000000000 
bit_mask_avx2_msb(129) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 8000000000000000 0000000000000000 
bit_mask_avx2_msb(130) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF C000000000000000 0000000000000000 
bit_mask_avx2_msb(131) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF E000000000000000 0000000000000000 
bit_mask_avx2_msb(132) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF F000000000000000 0000000000000000 
... 
bit_mask_avx2_msb(248) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF00 
bit_mask_avx2_msb(249) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFF80 
bit_mask_avx2_msb(250) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFC0 
bit_mask_avx2_msb(251) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFE0 
bit_mask_avx2_msb(252) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF0 
bit_mask_avx2_msb(253) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFF8 
bit_mask_avx2_msb(254) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFC 
bit_mask_avx2_msb(255) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFE 
bit_mask_avx2_msb(256) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_msb(257) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 
bit_mask_avx2_msb(258) FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF 

對於 256 <= n <= 65535 的 n 值,所有位都被設置爲 1,符合預期。上限 65535 源於 _mm256_subs_epu16() 的 16 位飽和運算:當 n = 65536 時,位掩碼(輸出值)爲零。可以修改代碼,使 256 <= n <= INT_MAX 範圍內的所有位都被設置爲 1。做法是把 shift = _mm256_subs_epu16(cnst32_256,shift); 替換爲

/* 32-bit "subtract and clamp at zero", used in place of the
   _mm256_subs_epu16 line. */
/* mask lane = all-ones where cnst32_256 > shift (signed 32-bit compare) */
__m256i mask  = _mm256_cmpgt_epi32(cnst32_256,shift); 
      /* plain (wrapping) 32-bit subtraction */
      shift  = _mm256_sub_epi32(cnst32_256,shift); 
      /* force the result to 0 in lanes where cnst32_256 <= shift */
      shift  = _mm256_and_si256(shift,mask); 

來實現。這三個內建函數(intrinsics)大致模擬了並不存在的 _mm256_subs_epu32(cnst32_256,shift)。

+1

非常酷。這只比滑動窗口未對齊加載技術稍微貴一點,而後者適用於字節掩碼(而不是位掩碼)。在某些情況下,這種方法甚至可能更可取。 –

相關問題