2017-08-06 62 views
3

我正在嘗試使用內部函數(intrinsics)在ARMv8上實現AES。我已經有一個C++實現,也有一個英特爾內部函數的實現。ARM和Intel內部函數對AES使用相同的子密鑰計劃嗎?

這些實現應該是等效的,所以我嘗試以英特爾的實現作爲ARMv8的藍本。兩者之間有一些差異,但都已經考慮在內。問題是,我得到了不同的結果。

// Encrypts one 16-byte block from `in` into `out` with caller-supplied round
// keys, using ARMv8 Crypto intrinsics or Intel AES-NI intrinsics depending on
// which feature macro is defined. If neither macro is defined the body is empty.
//   in     - 16-byte input block
//   out    - receives the 16-byte result
//   rdkeys - round-key array; the Intel path reads indices 0..rounds
//   rounds - number of AES rounds
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds) 
{ 
#if defined(__ARM_FEATURE_CRYPTO) 

    uint8x16_t data = vld1q_u8(in); 

    // AES encryption with ARM intrinsics: 
    // rnds-1 (9 for AES128) cycles of AES: 
    // (Add, Shift, Sub) plus Mix Columns 
    // NOTE(review): the loop below iterates `rounds` times, not rounds-1 as the
    // comment above states, so it leaves i == rounds. The two statements after
    // the loop then read rdkeys[rounds] and rdkeys[rounds+1]; the latter is one
    // element past a key array of rounds+1 entries.
    unsigned int i; 
    for (i=0; i<rounds; ++i) 
    { 
     // AES single round encryption 
     data = vaeseq_u8(data, rdkeys[i]); 
     // AES mix columns 
     data = vaesmcq_u8(data); 
    } 
    // One round of encryption: AES, no Mix Columns 
    data = vaeseq_u8(data, rdkeys[i++]); 
    // Final Add (bitwise Xor) 
    data = veorq_u8(data, rdkeys[i]); 
    vst1q_u8(out, data); 

#elif defined(__AES__) 

    // Unaligned load, then XOR the block with the first round key.
    __m128i data = _mm_loadu_si128((const __m128i*)in); 
    data = _mm_xor_si128(data, rdkeys[0]); 
    // Keys 1..rounds-2 go through aesenc in the loop; key rounds-1 is applied
    // by the separate aesenc call after it.
    for (unsigned int i=1; i<rounds-1; ++i) 
    { 
     data = _mm_aesenc_si128(data, rdkeys[i]); 
    } 
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]); 
    // The last key, rdkeys[rounds], is applied with aesenclast.
    data = _mm_aesenclast_si128(data, rdkeys[rounds]); 
    _mm_storeu_si128((__m128i*)out, data); 

#endif 
} 

目前,我先繞過子密鑰的計算,在這兩種實現中使用同一組輪密鑰:

// RoundKey is the 128-bit vector type of whichever intrinsic set is enabled;
// Byte is a plain 8-bit unsigned value. Neither is defined if no feature
// macro is set.
#if defined(__ARM_FEATURE_CRYPTO) 
typedef uint8x16_t RoundKey; 
typedef uint8_t Byte; 
#elif defined(__AES__) 
typedef __m128i RoundKey; 
typedef uint8_t Byte; 
#endif 

// Avoid subkey scheduling at this point 
// Every byte of round key i is set to (i<<4)|i, i.e. the index repeated in
// both nibbles (0x00, 0x11, ..., 0xAA for i = 0..10), so both platforms get
// the same synthetic key material.
RoundKey rdkeys[ROUNDS+1]; 
for (size_t i=0; i<COUNTOF(rdkeys); ++i) 
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey)); 

但是,我得到了不同的結果。下面是程序輸出的轉儲內容:

英特爾AES-NI

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 
... 

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 

ARMv8 AES

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 
... 
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 

我對上面的結果百思不得其解。添加更多的printf也無助於找出問題。我開始懷疑英特爾和ARM內部函數使用不同的子密鑰計劃。

ARM和Intel內部函數對AES使用相同的子密鑰計劃嗎?


下面的圖片出自Cynthia Crutchfield的論文,其中檢查了英特爾內部函數與ARM內部函數之間的映射關係。

enter image description here


下面是完整的程序,並列出了各平台的編譯命令行。

英特爾

g++ -Wall -maes aes-test.cxx -o aes-test.exe 

ARMv8

g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe 

程序

#include <stdio.h> 
#include <stdlib.h> 
#include <stdint.h> 
#include <string.h> 

#if defined(__ARM_FEATURE_CRYPTO) 
# include <arm_neon.h> 
# include <arm_acle.h> 
#elif defined(__AES__) 
# include <wmmintrin.h> 
# include <emmintrin.h> 
#endif 

// Platform-dependent aliases used by the test program:
//   RoundKey - the 128-bit vector type of the enabled intrinsic set
//              (uint8x16_t for ARMv8 Crypto, __m128i for AES-NI)
//   Byte     - an 8-bit unsigned value
// Neither alias is defined when neither feature macro is set.
#if defined(__ARM_FEATURE_CRYPTO) 
typedef uint8x16_t RoundKey; 
typedef uint8_t Byte; 
#elif defined(__AES__) 
typedef __m128i RoundKey; 
typedef uint8_t Byte; 
#endif 

// Number of elements in the array `x`: total size divided by the size of its
// first element. Only meaningful for true arrays, not for pointers.
#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))

// Round count passed to AES_encrypt by main(); the key array there holds
// ROUNDS+1 entries.
static const unsigned int ROUNDS=10; 
// Single-block encrypt; defined after main().
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds); 
// Declared with the same signature as AES_encrypt; no definition appears in this listing.
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds); 

void Print(const char* label, const Byte *in, size_t len, bool lf=false) 
{ 
    if (label) 
     printf("%s: ", label); 

    for (size_t i=0; in && i<len; ++i) 
     printf("%02X ", in[i]);  
    printf("\n"); 

    if (lf) 
     printf("\n"); 
} 

// Test driver: encrypts one all-0xFF block with synthetic round keys so that
// the traces printed by AES_encrypt can be compared across platforms.
int main(int argc, char* argv[])
{
    const Byte plain[16] = {
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
    };
    // `recover` is not used by this driver; AES_decrypt is declared but never called here.
    Byte cipher[16], recover[16];

    // Skip the real key schedule: every byte of key k is the index k repeated
    // in both nibbles (0x00, 0x11, ..., 0xAA).
    RoundKey rdkeys[ROUNDS+1];
    const size_t keyCount = COUNTOF(rdkeys);
    for (size_t k = 0; k != keyCount; ++k)
    {
        const int fill = (int)((k << 4) | k);
        memset(&rdkeys[k], fill, sizeof(RoundKey));
    }

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Traced single-block encrypt: same structure as the short listing, but every
// key and intermediate state is dumped with Print() so the ARMv8 and AES-NI
// paths can be compared step by step.
//   in     - 16-byte input block
//   out    - receives the 16-byte result
//   rdkeys - round-key array; the Intel path reads indices 0..rounds
//   rounds - number of AES rounds (main() passes ROUNDS)
// If neither feature macro is defined, only the "In" and "Out" dumps run and
// `out` is printed without having been written.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds) 
{ 
    Print("In", in, 16); 

#if defined(__ARM_FEATURE_CRYPTO) 

    // Load the block 
    uint8x16_t data = vld1q_u8(in); 

    Print("Data (in)", (Byte*)&data, 16, true); 

    // AES encryption with ARM intrinsics: 
    // rnds-1 (9 for AES128) cycles of AES: 
    // (Add, Shift, Sub) plus Mix Columns 
    // NOTE(review): the loop below iterates `rounds` times, not rounds-1 as the
    // comment above states, so it leaves i == rounds. The statements after the
    // loop then use rdkeys[rounds] and rdkeys[rounds+1]; main() allocates only
    // ROUNDS+1 keys, so the final XOR reads one element past the array.
    unsigned int i; 
    for (i=0; i<rounds; ++i) 
    { 
     // AES single round encryption 
     data = vaeseq_u8(data, rdkeys[i]); 
     // AES mix columns 
     data = vaesmcq_u8(data); 

     Print("Key", (Byte*)&rdkeys[i], 16); 
     Print("Data", (Byte*)&data, 16, true); 
    } 

    Print("Key", (Byte*)&rdkeys[i], 16); 

    // One round of encryption: AES, no Mix Columns 
    data = vaeseq_u8(data, rdkeys[i++]); 

    Print("Data", (Byte*)&data, 16, true); 

    // Final Add (bitwise Xor) 
    // The key used here (index i after the post-increment) is not printed.
    data = veorq_u8(data, rdkeys[i]); 

    Print("Data (xor)", (Byte*)&data, 16); 

    // Store the output data 
    vst1q_u8(out, data); 

#elif defined(__AES__) 

    // Unaligned load of the input block.
    __m128i data = _mm_loadu_si128((const __m128i*)in); 

    Print("Data (in)", (Byte*)&data, 16); 

    // XOR the block with the first round key.
    data = _mm_xor_si128(data, rdkeys[0]); 

    Print("Key", (Byte*)&rdkeys[0], 16); 
    Print("Data (xor)", (Byte*)&data, 16, true); 

    // Keys 1..rounds-2 are applied with aesenc inside the loop.
    for (unsigned int i=1; i<rounds-1; ++i) 
    { 
     data = _mm_aesenc_si128(data, rdkeys[i]); 

     Print("Key", (Byte*)&rdkeys[i], 16); 
     Print("Data", (Byte*)&data, 16, true); 
    } 
    // Key rounds-1 is applied with one more aesenc outside the loop.
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]); 

    Print("Key", (Byte*)&rdkeys[rounds-1], 16); 
    Print("Data", (Byte*)&data, 16, true); 

    // The last key, rdkeys[rounds], is applied with aesenclast.
    data = _mm_aesenclast_si128(data, rdkeys[rounds]); 

    Print("Key", (Byte*)&rdkeys[rounds], 16); 
    Print("Data", (Byte*)&data, 16, true); 

    _mm_storeu_si128((__m128i*)out, data); 

#endif 

    Print("Out", out, 16); 
} 
+1

我注意到數據值之間的差異似乎總是0x90(144)或0xBE(190)……(例如0xF9 - 0x69、0xC3 - 0x33等等),也許這有助於縮小問題的範圍……ARM的輸出(Out)看起來也有點奇怪。 –

回答

2

ARM和Intel內部函數對AES使用相同的子密鑰計劃嗎?

看來答案是肯定的。我仍然需要對真正的密鑰調度進行測試,但是我能夠使用相同的密鑰計劃對英特爾和ARMv8內在函數產生相同的結果。

看起來Crutchfield的參考實現中有一個差一(off-by-one)錯誤:循環控制應該使用rounds-1,而不是rounds。這意味着我在ARMv8上測試時跑了11輪,而不是10輪。當ARMv8代碼生成F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9而不是F9 F9 ... F9 F9時,我就應該起疑了。

下面是更新後的代碼:

// Encrypts one 16-byte block from `in` into `out` with caller-supplied round
// keys, using ARMv8 Crypto or Intel AES-NI intrinsics depending on which
// feature macro is defined. If neither is defined the body is empty.
//   in     - 16-byte input block
//   out    - receives the 16-byte result
//   rdkeys - round-key array; indices 0..rounds are read
//   rounds - number of AES rounds
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    // The loop stops one key short of `rounds`: that key is applied without
    // MixColumns, and the key after it is applied by a plain XOR.
    const unsigned int loopEnd = rounds - 1;
    uint8x16_t state = vld1q_u8(in);

    unsigned int k = 0;
    while (k < loopEnd)
    {
        // AES single-round step followed by MixColumns.
        state = vaesmcq_u8(vaeseq_u8(state, rdkeys[k]));
        ++k;
    }

    // AES step without MixColumns, then the final key XOR.
    state = vaeseq_u8(state, rdkeys[k]);
    state = veorq_u8(state, rdkeys[k + 1]);

    vst1q_u8(out, state);

#elif defined(__AES__)

    const unsigned int loopEnd = rounds - 1;

    // Unaligned load, then XOR the block with the first round key.
    __m128i state = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rdkeys[0]);

    unsigned int k = 1;
    for (; k < loopEnd; ++k)
    {
        state = _mm_aesenc_si128(state, rdkeys[k]);
    }

    // One more aesenc with the key where the loop stopped, then aesenclast
    // with the key after it.
    state = _mm_aesenc_si128(state, rdkeys[k]);
    state = _mm_aesenclast_si128(state, rdkeys[k + 1]);
    _mm_storeu_si128((__m128i*)out, state);

#endif
}
+0

因此,ARM代碼現在正在運行? –

+0

@MaartenBodewes - 是的,代碼現在可以運行了。我已在測試分支中提交了[修復ARMv8 AES加密](https://github.com/noloader/cryptopp/commit/701ec3aa1f45a754)和[修復ARMv8 AES解密](https://github.com/noloader/cryptopp/commit/14590423249d)。一旦通過測試,我會將其推送到Master。它的運行速度約爲2.8至3.0 cpb,比其他表現最佳的實現慢約1.0 cpb,但比C++實現的30-45 cpb要好得多。 – jww

相關問題