2016-01-13 42 views
1

我正在嘗試對位寬不是整字節倍數的數據進行打包。這是我想要做的:在未對齊的字節邊界上高效地打包10位數據。

我有一個512位的數據數組(8個64位整數)。數組中的每個10位數據都按2個字節對齊。我需要做的是把這512位壓縮成只包含10位數據的320位(5個64位整數)。

我可以想到一種手動方式:逐一處理512位數組中的每個2字節部分,屏蔽出10位,在考慮字節邊界的情況下把它們按位或在一起,生成輸出的64位整數。類似這樣:

// Packs the 10-bit payloads of a 512-bit buffer into 320 contiguous bits.
//
// Each input word is treated as four 16-bit lanes; only the low 10 bits of a
// lane are kept. The 32 resulting values are stored back to back, lowest bits
// first, across the five output words.
//
// A value that straddles two output words is handled by plain unsigned shifts:
// the left shift drops the bits that do not fit into the current word, and the
// matching right shift in the next word drops the bits already stored.
void pack512to320bits(uint64 (&array512bits)[8], uint64 (&array320bits)[5])
{
    // Selects the 10 payload bits of lane 0; shift left by 16/32/48 for lanes 1..3.
    const uint64 maskFor10bits = 0x3FF;

    // Output word 0: values 0-5, then the low 4 bits of value 6.
    array320bits[0] = (array512bits[0] & maskFor10bits) |
        ((array512bits[0] & (maskFor10bits << 16)) >> 6) |
        ((array512bits[0] & (maskFor10bits << 32)) >> 12) |
        ((array512bits[0] & (maskFor10bits << 48)) >> 18) |
        ((array512bits[1] & maskFor10bits) << 40) |
        ((array512bits[1] & (maskFor10bits << 16)) << 34) |
        ((array512bits[1] & (maskFor10bits << 32)) << 28);

    // Output word 1: high 6 bits of value 6, values 7-11, low 8 bits of value 12.
    array320bits[1] = ((array512bits[1] & (maskFor10bits << 32)) >> 36) |
        ((array512bits[1] & (maskFor10bits << 48)) >> 42) |
        ((array512bits[2] & maskFor10bits) << 16) |
        ((array512bits[2] & (maskFor10bits << 16)) << 10) |
        ((array512bits[2] & (maskFor10bits << 32)) << 4) |
        ((array512bits[2] & (maskFor10bits << 48)) >> 2) |
        ((array512bits[3] & maskFor10bits) << 56);

    // Output word 2: high 2 bits of value 12, values 13-18, low 2 bits of value 19.
    array320bits[2] = ((array512bits[3] & maskFor10bits) >> 8) |
        ((array512bits[3] & (maskFor10bits << 16)) >> 14) |
        ((array512bits[3] & (maskFor10bits << 32)) >> 20) |
        ((array512bits[3] & (maskFor10bits << 48)) >> 26) |
        ((array512bits[4] & maskFor10bits) << 32) |
        ((array512bits[4] & (maskFor10bits << 16)) << 26) |
        ((array512bits[4] & (maskFor10bits << 32)) << 20) |
        ((array512bits[4] & (maskFor10bits << 48)) << 14);

    // Output word 3: high 8 bits of value 19, values 20-24, low 6 bits of value 25.
    array320bits[3] = ((array512bits[4] & (maskFor10bits << 48)) >> 50) |
        ((array512bits[5] & maskFor10bits) << 8) |
        ((array512bits[5] & (maskFor10bits << 16)) << 2) |
        ((array512bits[5] & (maskFor10bits << 32)) >> 4) |
        ((array512bits[5] & (maskFor10bits << 48)) >> 10) |
        ((array512bits[6] & maskFor10bits) << 48) |
        ((array512bits[6] & (maskFor10bits << 16)) << 42);

    // Output word 4: high 4 bits of value 25, values 26-31.
    array320bits[4] = ((array512bits[6] & (maskFor10bits << 16)) >> 22) |
        ((array512bits[6] & (maskFor10bits << 32)) >> 28) |
        ((array512bits[6] & (maskFor10bits << 48)) >> 34) |
        ((array512bits[7] & maskFor10bits) << 24) |
        ((array512bits[7] & (maskFor10bits << 16)) << 18) |
        ((array512bits[7] & (maskFor10bits << 32)) << 12) |
        ((array512bits[7] & (maskFor10bits << 48)) << 6);
}

我知道這樣做可行,但它似乎容易出錯,而且不容易擴展到更大的字節序列。

或者,我可以遍歷輸入數組,把所有10位值提取到一個向量中,最後再把它們拼接起來,同樣要確保處理好字節邊界。類似這樣:

// Packs the 10-bit payloads of a 512-bit buffer into 320 contiguous bits.
//
// Two passes: first every 16-bit lane of the input is reduced to its low
// 10 bits and stored in a scratch table, then the 32 values are concatenated,
// lowest bits first, into the five output words.
void pack512to320bits(uint64 (&array512bits)[8], uint64 (&array320bits)[5])
{
    const uint64 maskFor10bits = 0x3FF;
    const unsigned int pixelCount = 8 * 4;

    // Scratch values are kept 64 bits wide so the later shifts (up to 63) are
    // performed in the output word's type rather than in a promoted int.
    uint64 maskedPixelBytes[8 * 4];

    for (unsigned int qword = 0; qword < 8; ++qword)
    {
        for (unsigned int pixelBytes = 0; pixelBytes < 4; ++pixelBytes)
        {
            // Shift the lane down to bit 0 before masking so the stored value
            // is the 10-bit payload itself, not the payload at its lane offset.
            maskedPixelBytes[qword * 4 + pixelBytes] =
                (array512bits[qword] >> (16 * pixelBytes)) & maskFor10bits;
        }
    }

    for (unsigned int word = 0; word < 5; ++word)
    {
        array320bits[word] = 0;
    }

    for (unsigned int pixel = 0; pixel < pixelCount; ++pixel)
    {
        const unsigned int bit = pixel * 10;    // absolute position in the output
        const unsigned int word = bit / 64;
        const unsigned int offset = bit % 64;

        // Bits that do not fit into this word fall off the top of the shift.
        array320bits[word] |= maskedPixelBytes[pixel] << offset;

        // offset > 54 means the value straddles into the next word; the last
        // value starts at offset 54 of word 4, so word + 1 never exceeds 4.
        if (offset > 64 - 10)
        {
            array320bits[word + 1] |= maskedPixelBytes[pixel] >> (64 - offset);
        }
    }
}

這種方式更容易調試和閱讀,但效率低下,也不能擴展到更大的字節序列。我想知道是否有一種更簡單、更算法化的方法來做這種位打包。

+1

你想提取任意長度,還是總是固定在16位嵌入10位?如果是這樣的話,你可以處理4個批次,每個10位打包成40位(5個8位字節)(然後迭代接下來的40位)。 –

回答

2

你想做的事情是可行的,但這取決於某些條件以及你對「高效」的定義。首先,如果這兩個數組始終是一個512位數組和一個320位數組,也就是說,傳入的數組始終是 uint64 (&array512bits)[8] 和 uint64 (&array320bits)[5],那麼硬編碼這些移位實際上會高效幾個數量級。

如果你想處理更大的字節序列,可以寫一個算法:考慮填充位並把相應的位移走,然後遍歷較大位數組中的各個 uint64 值。然而,這種方法會在生成的彙編中引入分支(例如 if (total_shifted < bit_size) 等),從而增加計算時間。即使經過優化,生成的彙編仍然比手動移位更復雜;而且這樣的代碼還需要考慮每個數組的大小,以確保它們彼此匹配,這又會增加計算時間(或代碼的整體複雜度)。

作爲示例,請看下面這段手動移位的代碼:

// Packs the 10-bit payloads of a 512-bit buffer into 320 contiguous bits.
//
// Every source word is read as four 16-bit lanes whose low 10 bits carry a
// value; the upper 6 bits of each lane are discarded. The 32 values are written
// back to back, least-significant bits first, into the five output words.
// Values 6, 12, 19 and 25 straddle an output-word boundary and are split.
// Taking both arrays by reference-to-array fixes their lengths at compile time.
static void pack512to320_manual(uint64 (&a512)[8], uint64 (&a320)[5])
{
    // Payload masks for lanes 0..3 of a source word.
    const uint64 lane0 = 0x00000000000003FF;
    const uint64 lane1 = 0x0000000003FF0000;
    const uint64 lane2 = 0x000003FF00000000;
    const uint64 lane3 = 0x03FF000000000000;

    uint64 word;

    // Output word 0: values 0-5, then the low 4 bits of value 6.
    word  =  a512[0] & lane0;
    word |= (a512[0] & lane1) >> 6;
    word |= (a512[0] & lane2) >> 12;
    word |= (a512[0] & lane3) >> 18;
    word |= (a512[1] & lane0) << 40;
    word |= (a512[1] & lane1) << 34;
    word |= (a512[1] & 0x0000000F00000000) << 28;
    a320[0] = word;

    // Output word 1: high 6 bits of value 6, values 7-11, low 8 bits of value 12.
    word  = (a512[1] & 0x000003F000000000) >> 36;
    word |= (a512[1] & lane3) >> 42;
    word |= (a512[2] & lane0) << 16;
    word |= (a512[2] & lane1) << 10;
    word |= (a512[2] & lane2) << 4;
    word |= (a512[2] & lane3) >> 2;
    word |= (a512[3] & 0x00000000000000FF) << 56;
    a320[1] = word;

    // Output word 2: high 2 bits of value 12, values 13-18, low 2 bits of value 19.
    word  = (a512[3] & 0x0000000000000300) >> 8;
    word |= (a512[3] & lane1) >> 14;
    word |= (a512[3] & lane2) >> 20;
    word |= (a512[3] & lane3) >> 26;
    word |= (a512[4] & lane0) << 32;
    word |= (a512[4] & lane1) << 26;
    word |= (a512[4] & lane2) << 20;
    word |= (a512[4] & 0x0003000000000000) << 14;
    a320[2] = word;

    // Output word 3: high 8 bits of value 19, values 20-24, low 6 bits of value 25.
    word  = (a512[4] & 0x03FC000000000000) >> 50;
    word |= (a512[5] & lane0) << 8;
    word |= (a512[5] & lane1) << 2;
    word |= (a512[5] & lane2) >> 4;
    word |= (a512[5] & lane3) >> 10;
    word |= (a512[6] & lane0) << 48;
    word |= (a512[6] & 0x00000000003F0000) << 42;
    a320[3] = word;

    // Output word 4: high 4 bits of value 25, values 26-31.
    word  = (a512[6] & 0x0000000003C00000) >> 22;
    word |= (a512[6] & lane2) >> 28;
    word |= (a512[6] & lane3) >> 34;
    word |= (a512[7] & lane0) << 24;
    word |= (a512[7] & lane1) << 18;
    word |= (a512[7] & lane2) << 12;
    word |= (a512[7] & lane3) << 6;
    a320[4] = word;
}

該代碼只接受特定長度的 uint64 數組,它們在考慮10位邊界的情況下彼此匹配;代碼按相應的位數移位,把512位數組打包到320位數組中。因此,像 uint64* a512p = a512; pack512to320_manual(a512p, a320); 這樣的寫法會在編譯時失敗,因爲 a512p 不是 uint64 (&)[8](即它是類型安全的)。請注意,此代碼完全展開是爲了展示位移序列,你可以使用 #define 或 enum 來避免「幻數」,使代碼更清晰。

如果你想把它擴展到更大的字節序列,可以這樣做:

// Packs the low 10 bits of every 16-bit lane of `array512bits` into
// `array320bits` as one contiguous, least-significant-first bit stream.
//
// X and Y are the element counts of the input and output arrays. The output is
// cleared first, then filled until either the input is exhausted or the output
// is full, so no element outside either array is ever read or written.
// For the intended 8:5 ratio (e.g. X = 8, Y = 5) every output bit is used.
template < std::size_t X, std::size_t Y >
static void pack512to320_loop(const uint64 (&array512bits)[X], uint64 (&array320bits)[Y])
{
    const unsigned int word_bits  = static_cast<unsigned int>(sizeof(uint64) * 8);
    const unsigned int lane_bits  = 16; // stride between values in the input
    const unsigned int value_bits = 10; // payload width of each lane
    const uint64 value_mask = 0x3FF;

    // The packing below only ORs bits in, so the destination must start at zero.
    for (std::size_t k = 0; k < Y; ++k) { array320bits[k] = 0; }

    std::size_t out_word = 0;   // output element currently being filled
    unsigned int out_bit = 0;   // next free bit inside that element

    for (std::size_t in_word = 0; in_word < X; ++in_word) {
        for (unsigned int lane = 0; lane < word_bits; lane += lane_bits) {
            if (out_word >= Y) { return; } // output full: drop remaining input

            const uint64 value = (array512bits[in_word] >> lane) & value_mask;

            // Bits that do not fit into this element fall off the top of the shift.
            array320bits[out_word] |= value << out_bit;
            out_bit += value_bits;

            if (out_bit >= word_bits) {
                out_bit -= word_bits; // bits of `value` that spilled over
                ++out_word;
                if ((out_bit != 0) && (out_word < Y)) {
                    // Store the spilled high bits at the bottom of the next element.
                    array320bits[out_word] |= value >> (value_bits - out_bit);
                }
            }
        }
    }
}

此代碼同樣考慮了字節邊界,並把數據打包到320位數組中。然而,這段代碼沒有做任何錯誤檢查來確保大小正確匹配,所以如果 X % 8 != 0 或 Y % 5 != 0(其中 X 和 Y > 0),你可能會得到無效的結果!此外,由於涉及循環、臨時變量和移位,它比手動版本慢得多;而且與手動位移版本相比,第一次閱讀這個函數的人可能需要更多時間才能弄清循環代碼的完整意圖和上下文。

如果你想要介於兩者之間的方案,可以使用手動打包函數,並以8個和5個爲一組遍歷較大的數組,以確保字節正確對齊,類似於下面這樣:

// Packs the 10-bit payloads of `array512bits` into `array320bits`, handling the
// input in groups of 8 words (512 bits) that each produce 5 output words.
//
// Within a group, every source word is read as four 16-bit lanes whose low
// 10 bits are kept; the 32 values are written back to back, lowest bits first.
//
// Size rules: X must be a multiple of 8, Y a multiple of 5, X >= Y, and the
// output must have room for every group ((X / 8) * 5 <= Y). On violation the
// function prints "Invalid sizes!" to std::cerr and returns without touching
// the output. Output words beyond (X / 8) * 5 are left unchanged.
template < std::size_t X, std::size_t Y >
static void pack512to320_manual_loop(const uint64 (&array512bits)[X], uint64 (&array320bits)[Y])
{
    // The last test is what prevents writing past the end of the output when
    // the input holds more groups than the output can take (e.g. X=16, Y=5).
    if ((X % 8 != 0) || (Y % 5 != 0) || (X < Y) || ((X / 8) * 5 > Y)) {
        // handle invalid sizes how you need here
        std::cerr << "Invalid sizes!" << std::endl;
        return;
    }

    // Payload masks for lanes 0..3 of a source word.
    const uint64 lane0 = 0x00000000000003FF;
    const uint64 lane1 = 0x0000000003FF0000;
    const uint64 lane2 = 0x000003FF00000000;
    const uint64 lane3 = 0x03FF000000000000;

    const uint64* a512 = array512bits;
    uint64* a320 = array320bits;
    for (std::size_t group = 0; group < X / 8; ++group, a512 += 8, a320 += 5) {
        // Output word 0: values 0-5, then the low 4 bits of value 6.
        a320[0] = (
            (a512[0] & lane0) |
            ((a512[0] & lane1) >> 6) |
            ((a512[0] & lane2) >> 12) |
            ((a512[0] & lane3) >> 18) |
            ((a512[1] & lane0) << 40) |
            ((a512[1] & lane1) << 34) |
            ((a512[1] & 0x0000000F00000000) << 28));

        // Output word 1: high 6 bits of value 6, values 7-11, low 8 bits of value 12.
        a320[1] = (
            ((a512[1] & 0x000003F000000000) >> 36) |
            ((a512[1] & lane3) >> 42) |
            ((a512[2] & lane0) << 16) |
            ((a512[2] & lane1) << 10) |
            ((a512[2] & lane2) << 4) |
            ((a512[2] & lane3) >> 2) |
            ((a512[3] & 0x00000000000000FF) << 56));

        // Output word 2: high 2 bits of value 12, values 13-18, low 2 bits of value 19.
        a320[2] = (
            ((a512[3] & 0x0000000000000300) >> 8) |
            ((a512[3] & lane1) >> 14) |
            ((a512[3] & lane2) >> 20) |
            ((a512[3] & lane3) >> 26) |
            ((a512[4] & lane0) << 32) |
            ((a512[4] & lane1) << 26) |
            ((a512[4] & lane2) << 20) |
            ((a512[4] & 0x0003000000000000) << 14));

        // Output word 3: high 8 bits of value 19, values 20-24, low 6 bits of value 25.
        a320[3] = (
            ((a512[4] & 0x03FC000000000000) >> 50) |
            ((a512[5] & lane0) << 8) |
            ((a512[5] & lane1) << 2) |
            ((a512[5] & lane2) >> 4) |
            ((a512[5] & lane3) >> 10) |
            ((a512[6] & lane0) << 48) |
            ((a512[6] & 0x00000000003F0000) << 42));

        // Output word 4: high 4 bits of value 25, values 26-31.
        a320[4] = (
            ((a512[6] & 0x0000000003C00000) >> 22) |
            ((a512[6] & lane2) >> 28) |
            ((a512[6] & lane3) >> 34) |
            ((a512[7] & lane0) << 24) |
            ((a512[7] & lane1) << 18) |
            ((a512[7] & lane2) << 12) |
            ((a512[7] & lane3) << 6));
    }
}

這與手動打包函數類似,只增加了微不足道的檢查時間,但可以處理更大的、能夠乾淨地彼此打包的數組(同樣,爲了展示移位序列而展開)。

在[email protected]上使用 g++ 4.2.1 和 -O3 對上述示例計時,得到以下平均時間:

pack512to320_loop:0.135 us

pack512to320_manual:0.0017 us

pack512to320_manual_loop:0.0020 us

下面是用於測試輸入/輸出以及一般計時的測試代碼:

#include <stdint.h>

#include <cstddef>
#include <ctime>
#include <iostream>
// Platform shims for the benchmark timer: `timesruct` is the timestamp type and
// `dotick(v)` stores the current timestamp into `v`.
#if defined(_MSC_VER)
    #include <cstdint>
    #include <windows.h>
    #define timesruct LARGE_INTEGER
    #define dotick(v) QueryPerformanceCounter(&v)
    // Performance-counter frequency; zero until elapsed_us() queries it.
    timesruct freq;
#else
    #define timesruct struct timespec
    #define dotick(v) clock_gettime(CLOCK_MONOTONIC, &v)
#endif

// 64-bit unsigned word type used by every packing routine in this program.
// NOTE(review): nothing visible here declares `uint64`, so it is defined from
// <stdint.h>; drop this typedef if the build already supplies one.
typedef uint64_t uint64;

// Number of bits in one uint64 element.
static const std::size_t bit_size = sizeof(uint64) * 8;

// Packs the low 10 bits of every 16-bit lane of `array512bits` into
// `array320bits` as one contiguous, least-significant-first bit stream.
//
// X and Y are the element counts of the input and output arrays. The output is
// cleared first, then filled until either the input is exhausted or the output
// is full, so no element outside either array is ever read or written.
// For the intended 8:5 ratio (e.g. X = 8, Y = 5) every output bit is used.
template < std::size_t X, std::size_t Y >
static void pack512to320_loop(const uint64 (&array512bits)[X], uint64 (&array320bits)[Y])
{
    const unsigned int word_bits  = static_cast<unsigned int>(sizeof(uint64) * 8);
    const unsigned int lane_bits  = 16; // stride between values in the input
    const unsigned int value_bits = 10; // payload width of each lane
    const uint64 value_mask = 0x3FF;

    // Required for correctness, not just tidiness: the packing below only ORs
    // bits in, so skipping this makes the result depend on the old contents.
    for (std::size_t k = 0; k < Y; ++k) { array320bits[k] = 0; }

    std::size_t out_word = 0;   // output element currently being filled
    unsigned int out_bit = 0;   // next free bit inside that element

    for (std::size_t in_word = 0; in_word < X; ++in_word) {
        for (unsigned int lane = 0; lane < word_bits; lane += lane_bits) {
            if (out_word >= Y) { return; } // output full: drop remaining input

            const uint64 value = (array512bits[in_word] >> lane) & value_mask;

            // Bits that do not fit into this element fall off the top of the shift.
            array320bits[out_word] |= value << out_bit;
            out_bit += value_bits;

            if (out_bit >= word_bits) {
                out_bit -= word_bits; // bits of `value` that spilled over
                ++out_word;
                if ((out_bit != 0) && (out_word < Y)) {
                    // Store the spilled high bits at the bottom of the next element.
                    array320bits[out_word] |= value >> (value_bits - out_bit);
                }
            }
        }
    }
}

// Packs the 10-bit payloads of `array512bits` into `array320bits`, handling the
// input in groups of 8 words (512 bits) that each produce 5 output words.
//
// Within a group, every source word is read as four 16-bit lanes whose low
// 10 bits are kept; the 32 values are written back to back, lowest bits first.
//
// Size rules: X must be a multiple of 8, Y a multiple of 5, X >= Y, and the
// output must have room for every group ((X / 8) * 5 <= Y). On violation the
// function prints "Invalid sizes!" to std::cerr and returns without touching
// the output. Output words beyond (X / 8) * 5 are left unchanged.
template < std::size_t X, std::size_t Y >
static void pack512to320_manual_loop(const uint64 (&array512bits)[X], uint64 (&array320bits)[Y])
{
    // The last test is what prevents writing past the end of the output when
    // the input holds more groups than the output can take (e.g. X=16, Y=5).
    if ((X % 8 != 0) || (Y % 5 != 0) || (X < Y) || ((X / 8) * 5 > Y)) {
        // handle invalid sizes how you need here
        std::cerr << "Invalid sizes!" << std::endl;
        return;
    }

    // Payload masks for lanes 0..3 of a source word.
    const uint64 lane0 = 0x00000000000003FF;
    const uint64 lane1 = 0x0000000003FF0000;
    const uint64 lane2 = 0x000003FF00000000;
    const uint64 lane3 = 0x03FF000000000000;

    const uint64* a512 = array512bits;
    uint64* a320 = array320bits;
    for (std::size_t group = 0; group < X / 8; ++group, a512 += 8, a320 += 5) {
        // Output word 0: values 0-5, then the low 4 bits of value 6.
        a320[0] = (
            (a512[0] & lane0) |
            ((a512[0] & lane1) >> 6) |
            ((a512[0] & lane2) >> 12) |
            ((a512[0] & lane3) >> 18) |
            ((a512[1] & lane0) << 40) |
            ((a512[1] & lane1) << 34) |
            ((a512[1] & 0x0000000F00000000) << 28));

        // Output word 1: high 6 bits of value 6, values 7-11, low 8 bits of value 12.
        a320[1] = (
            ((a512[1] & 0x000003F000000000) >> 36) |
            ((a512[1] & lane3) >> 42) |
            ((a512[2] & lane0) << 16) |
            ((a512[2] & lane1) << 10) |
            ((a512[2] & lane2) << 4) |
            ((a512[2] & lane3) >> 2) |
            ((a512[3] & 0x00000000000000FF) << 56));

        // Output word 2: high 2 bits of value 12, values 13-18, low 2 bits of value 19.
        a320[2] = (
            ((a512[3] & 0x0000000000000300) >> 8) |
            ((a512[3] & lane1) >> 14) |
            ((a512[3] & lane2) >> 20) |
            ((a512[3] & lane3) >> 26) |
            ((a512[4] & lane0) << 32) |
            ((a512[4] & lane1) << 26) |
            ((a512[4] & lane2) << 20) |
            ((a512[4] & 0x0003000000000000) << 14));

        // Output word 3: high 8 bits of value 19, values 20-24, low 6 bits of value 25.
        a320[3] = (
            ((a512[4] & 0x03FC000000000000) >> 50) |
            ((a512[5] & lane0) << 8) |
            ((a512[5] & lane1) << 2) |
            ((a512[5] & lane2) >> 4) |
            ((a512[5] & lane3) >> 10) |
            ((a512[6] & lane0) << 48) |
            ((a512[6] & 0x00000000003F0000) << 42));

        // Output word 4: high 4 bits of value 25, values 26-31.
        a320[4] = (
            ((a512[6] & 0x0000000003C00000) >> 22) |
            ((a512[6] & lane2) >> 28) |
            ((a512[6] & lane3) >> 34) |
            ((a512[7] & lane0) << 24) |
            ((a512[7] & lane1) << 18) |
            ((a512[7] & lane2) << 12) |
            ((a512[7] & lane3) << 6));
    }
}

// Packs the 10-bit payloads of a 512-bit buffer into 320 contiguous bits.
//
// Every source word is read as four 16-bit lanes whose low 10 bits carry a
// value; the upper 6 bits of each lane are discarded. The 32 values are written
// back to back, least-significant bits first, into the five output words.
// Values 6, 12, 19 and 25 straddle an output-word boundary and are split.
// Taking both arrays by reference-to-array fixes their lengths at compile time.
static void pack512to320_manual(uint64 (&a512)[8], uint64 (&a320)[5])
{
    // Payload masks for lanes 0..3 of a source word.
    const uint64 lane0 = 0x00000000000003FF;
    const uint64 lane1 = 0x0000000003FF0000;
    const uint64 lane2 = 0x000003FF00000000;
    const uint64 lane3 = 0x03FF000000000000;

    uint64 word;

    // Output word 0: values 0-5, then the low 4 bits of value 6.
    word  =  a512[0] & lane0;
    word |= (a512[0] & lane1) >> 6;
    word |= (a512[0] & lane2) >> 12;
    word |= (a512[0] & lane3) >> 18;
    word |= (a512[1] & lane0) << 40;
    word |= (a512[1] & lane1) << 34;
    word |= (a512[1] & 0x0000000F00000000) << 28;
    a320[0] = word;

    // Output word 1: high 6 bits of value 6, values 7-11, low 8 bits of value 12.
    word  = (a512[1] & 0x000003F000000000) >> 36;
    word |= (a512[1] & lane3) >> 42;
    word |= (a512[2] & lane0) << 16;
    word |= (a512[2] & lane1) << 10;
    word |= (a512[2] & lane2) << 4;
    word |= (a512[2] & lane3) >> 2;
    word |= (a512[3] & 0x00000000000000FF) << 56;
    a320[1] = word;

    // Output word 2: high 2 bits of value 12, values 13-18, low 2 bits of value 19.
    word  = (a512[3] & 0x0000000000000300) >> 8;
    word |= (a512[3] & lane1) >> 14;
    word |= (a512[3] & lane2) >> 20;
    word |= (a512[3] & lane3) >> 26;
    word |= (a512[4] & lane0) << 32;
    word |= (a512[4] & lane1) << 26;
    word |= (a512[4] & lane2) << 20;
    word |= (a512[4] & 0x0003000000000000) << 14;
    a320[2] = word;

    // Output word 3: high 8 bits of value 19, values 20-24, low 6 bits of value 25.
    word  = (a512[4] & 0x03FC000000000000) >> 50;
    word |= (a512[5] & lane0) << 8;
    word |= (a512[5] & lane1) << 2;
    word |= (a512[5] & lane2) >> 4;
    word |= (a512[5] & lane3) >> 10;
    word |= (a512[6] & lane0) << 48;
    word |= (a512[6] & 0x00000000003F0000) << 42;
    a320[3] = word;

    // Output word 4: high 4 bits of value 25, values 26-31.
    word  = (a512[6] & 0x0000000003C00000) >> 22;
    word |= (a512[6] & lane2) >> 28;
    word |= (a512[6] & lane3) >> 34;
    word |= (a512[7] & lane0) << 24;
    word |= (a512[7] & lane1) << 18;
    word |= (a512[7] & lane2) << 12;
    word |= (a512[7] & lane3) << 6;
    a320[4] = word;
}

// Writes each element of `arr` to std::cout on its own line, formatted as
// "arr[<index>] = <value>", using whatever number base the stream is set to.
template < std::size_t N >
static void printit(uint64 (&arr)[N])
{
    std::size_t index = 0;
    while (index < N) {
        std::cout << "arr[" << index;
        std::cout << "] = " << arr[index];
        std::cout << std::endl;
        ++index;
    }
}

// Returns the time from `init` to `end` in microseconds.
// On MSVC the performance-counter frequency is fetched on first use and the
// tick difference is scaled by it; elsewhere the seconds and nanoseconds
// fields of the two timestamps are combined.
static double elapsed_us(timesruct init, timesruct end)
{
    #if defined(_MSC_VER)
     if (freq.LowPart == 0) { QueryPerformanceFrequency(&freq); }
     const double scaled_ticks = static_cast<double>(((end.QuadPart - init.QuadPart) * 1000000));
     const double ticks_per_second = static_cast<double>(freq.QuadPart);
     return (scaled_ticks / ticks_per_second);
    #else
     // Nanosecond remainder expressed in microseconds (may be negative).
     const double sub_second_us = static_cast<double>((end.tv_nsec - init.tv_nsec)) / 1000;
     return ((end.tv_sec - init.tv_sec) * 1000000) + sub_second_us;
    #endif
}

int main(int argc, char* argv[]) 
{ 
    uint64 val = 0x039F039F039F039F; 
    uint64 a512[] = { val, val, val, val, val, val, val, val }; 
    uint64 a320[] = { 0, 0, 0, 0, 0 }; 
    int max_cnt = 1000000; 
    timesruct init, end; 
    std::cout << std::hex; 

    dotick(init); 
    for (int i = 0; i < max_cnt; ++i) { 
     pack512to320_loop(a512, a320); 
    } 
    dotick(end); 
    printit(a320); 
    // rough estimate of timing/divide by iterations 
    std::cout << "avg. us = " << (elapsed_us(init, end)/max_cnt) << " us" << std::endl; 

    dotick(init); 
    for (int i = 0; i < max_cnt; ++i) { 
     pack512to320_manual(a512, a320); 
    } 
    dotick(end); 
    printit(a320); 
    // rough estimate of timing/divide by iterations 
    std::cout << "avg. us = " << (elapsed_us(init, end)/max_cnt) << " us" << std::endl; 

    dotick(init); 
    for (int i = 0; i < max_cnt; ++i) { 
     pack512to320_manual_loop(a512, a320); 
    } 
    dotick(end); 
    printit(a320); 
    // rough estimate of timing/divide by iterations 
    std::cout << "avg. us = " << (elapsed_us(init, end)/max_cnt) << " us" << std::endl; 

    return 0; 
} 

再次說明,這只是一般性的測試代碼,你的結果可能會有所不同。

希望能有所幫助。

相關問題