Gprof 顯示,我的計算密集型程序把大部分時間(36%)花在用 AP-Hash 做散列上。如何使用 SHA 內部函數(intrinsics)?
我無法減少調用次數,但仍希望讓每次調用更快。我可以從 C 程序中調用 SHA 內部函數嗎?
我需要英特爾編譯器還是可以堅持使用gcc?
Gprof 顯示,我的計算密集型程序把大部分時間(36%)花在用 AP-Hash 做散列上。如何使用 SHA 內部函數(intrinsics)?
我無法減少調用次數,但仍希望讓每次調用更快。我可以從 C 程序中調用 SHA 內部函數嗎?
我需要英特爾編譯器還是可以堅持使用gcc?
SHA 指令(SHA instructions)現在已可在 Goldmont 架構(Goldmont architecture)上使用,該架構於 2016 年 9 月左右發佈。根據 Intel Intrinsics Guide,相關的內部函數如下:
__m128i _mm_sha1msg1_epu32 (__m128i a, __m128i b)
__m128i _mm_sha1msg2_epu32 (__m128i a, __m128i b)
__m128i _mm_sha1nexte_epu32 (__m128i a, __m128i b)
__m128i _mm_sha1rnds4_epu32 (__m128i a, __m128i b, const int func)
__m128i _mm_sha256msg1_epu32 (__m128i a, __m128i b)
__m128i _mm_sha256msg2_epu32 (__m128i a, __m128i b)
__m128i _mm_sha256rnds2_epu32 (__m128i a, __m128i b, __m128i k)
GCC 5.0 及以上版本通過 Function Specific Option Pragmas 使這些內部函數始終可用。不過,您需要 Binutils 2.24。測試還表明 Clang 3.7 和 3.8 支持這些內部函數;Visual Studio 2015 也可以使用它們,但 VS2013 無法編譯它們。
在 Linux 上,您可以在預處理器中通過檢查宏 __SHA__
來檢測 SHA 是否可用。如果本機處理器支持 SHA,使用 -march=native 時該宏會被定義;否則,您可以使用 -msha
啓用它。
$ gcc -march=native -dM -E - </dev/null | egrep -i '(aes|rdrnd|rdseed|sha)'
#define __RDRND__ 1
#define __SHA__ 1
#define __RDSEED__ 1
#define __AES__ 1
使用 SHA1 的代碼如下所示。它基於英特爾一篇題爲 Intel® SHA Extensions 的博客文章。另一個參考實現可從 miTLS project 獲得。
以下代碼基於 Intel® SHA Extensions 博客。該代碼處理一個完整的 SHA1 塊,因此 const uint32_t *data
指向 64 個字節。您需要自行爲最後一個塊添加填充並寫入位長度。
它在 Celeron J3455 上的運行速度約爲每字節 1.7 個週期(cpb)。我相信 Andy Polyakov 爲 OpenSSL 編寫的 SHA1 運行速度約爲 1.5 cpb。作爲參考,經過優化的 C/C++ 實現的運行速度大約爲 9 到 10 cpb。
/*
 * SHA1_SHAEXT_Transform - SHA-1 compression of one message block using the
 * x86 SHA extensions (SHA-NI).
 *
 * state: the five 32-bit chaining words A, B, C, D, E (state[0..4]). Read on
 *        entry and overwritten with the updated chaining value on return.
 * data:  exactly one complete 64-byte block, consumed as raw message bytes.
 *        Padding and the trailing bit length are the caller's job.
 *
 * The 80 rounds are fully unrolled, four rounds per group. E0 and E1
 * alternate as the "E" operand so that each group can capture ABCD before
 * the round instruction overwrites it; the statement order inside every
 * group is therefore significant and must not be rearranged.
 *
 * The target attribute lets GCC/Clang compile this one function with the
 * SHA, SSSE3 and SSE4.1 instruction sets even when the rest of the file is
 * not built with -msha. It does NOT make the code safe on CPUs without the
 * SHA extensions: check for support (e.g. the sha_ni flag) before calling.
 */
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
__attribute__((target("sha,sse4.1")))
#endif
static void SHA1_SHAEXT_Transform(uint32_t *state, const uint32_t *data)
{
    /* Byte-reversal mask: turns the 16 loaded bytes completely around, which
     * both converts each word to big-endian and puts the first message word
     * in the highest lane, as the SHA instructions expect. */
    const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
    /* View the block as four 16-byte vectors without dropping const. */
    const __m128i *block = (const __m128i *) data;
    __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
    __m128i MSG0, MSG1, MSG2, MSG3;

    /* Load initial values: E goes in the highest lane, and the 0x1B shuffle
     * reverses the lane order so that A ends up in the highest lane. */
    ABCD = _mm_loadu_si128((const __m128i *) state);
    E0 = _mm_set_epi32((int) state[4], 0, 0, 0);
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);

    /* Save current hash for the final feed-forward addition. */
    ABCD_SAVE = ABCD;
    E0_SAVE = E0;

    /* Rounds 0-3 */
    MSG0 = _mm_loadu_si128(block + 0);
    MSG0 = _mm_shuffle_epi8(MSG0, MASK);
    E0 = _mm_add_epi32(E0, MSG0);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);

    /* Rounds 4-7 */
    MSG1 = _mm_loadu_si128(block + 1);
    MSG1 = _mm_shuffle_epi8(MSG1, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);

    /* Rounds 8-11 */
    MSG2 = _mm_loadu_si128(block + 2);
    MSG2 = _mm_shuffle_epi8(MSG2, MASK);
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    /* Rounds 12-15 */
    MSG3 = _mm_loadu_si128(block + 3);
    MSG3 = _mm_shuffle_epi8(MSG3, MASK);
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    /* Rounds 16-19 (last group using round function 0) */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    /* Rounds 20-23 (round function 1 from here through round 39) */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    /* Rounds 24-27 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    /* Rounds 28-31 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    /* Rounds 32-35 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    /* Rounds 36-39 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    /* Rounds 40-43 (round function 2 from here through round 59) */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    /* Rounds 44-47 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    /* Rounds 48-51 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    /* Rounds 52-55 */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
    MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    /* Rounds 56-59 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
    MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
    MSG0 = _mm_xor_si128(MSG0, MSG2);

    /* Rounds 60-63 (round function 3 from here through round 79) */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
    MSG1 = _mm_xor_si128(MSG1, MSG3);

    /* Rounds 64-67 */
    E0 = _mm_sha1nexte_epu32(E0, MSG0);
    E1 = ABCD;
    MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
    MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
    MSG2 = _mm_xor_si128(MSG2, MSG0);

    /* Rounds 68-71 (schedule is winding down: no further msg1 step) */
    E1 = _mm_sha1nexte_epu32(E1, MSG1);
    E0 = ABCD;
    MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
    MSG3 = _mm_xor_si128(MSG3, MSG1);

    /* Rounds 72-75 */
    E0 = _mm_sha1nexte_epu32(E0, MSG2);
    E1 = ABCD;
    MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
    ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);

    /* Rounds 76-79 */
    E1 = _mm_sha1nexte_epu32(E1, MSG3);
    E0 = ABCD;
    ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);

    /* Add values back to state (feed-forward with the saved input hash). */
    E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
    ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);

    /* Save state: undo the lane reversal, then store A..D and E. */
    ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
    _mm_storeu_si128((__m128i *) state, ABCD);
    state[4] = (uint32_t) _mm_extract_epi32(E0, 3);
}
在 Linux 下,您可以通過查找 sha_ni
標誌來判斷您的處理器是否支持 SHA 擴展:
$ cat /proc/cpuinfo
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 92
model name : Intel(R) Celeron(R) CPU J3455 @ 1.50GHz
stepping : 9
microcode : 0x1a
cpu MHz : 799.987
cache size : 1024 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 4
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 21
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36
clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc
art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni
pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg cx16 xtpr pdcm sse4_1 sse4_2 x2apic
movbe popcnt tsc_deadline_timer aes xsave rdrand lahf_lm 3dnowprefetch intel_pt tpr_shadow
vnmi flexpriority ept vpid fsgsbase tsc_adjust smep erms mpx rdseed smap clflushopt sha_ni
xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln pts
bugs : monitor
bogomips : 2995.20
clflush size : 64
cache_alignment : 64
address sizes : 39 bits physical, 48 bits virtual
power management:
...
另見Are there in x86 any instructions to accelerate SHA (SHA1/2/256/512) encoding?
您可以在 Noloader GitHub | SHA-Intrinsics 找到英特爾 SHA 內部函數和 ARMv8 SHA 內部函數的源代碼。它們是 C 源文件,爲 SHA-1、SHA-224 和 SHA-256 提供壓縮函數。基於內部函數的實現使 SHA-1 的吞吐量提高約 3 到 4 倍,使 SHA-224 和 SHA-256 的吞吐量提高約 6 到 12 倍。
除非您在英特爾工作,否則您還不能。 SHA擴展尚未包含在任何發佈的CPU上;預計它們將包含在英特爾的Skylake microarchitecture(預計到2015年或2016年)。
此外,AP散列函數可能已經快於加速SHA。您可能需要考慮其他方法,例如優化散列函數或緩存熱值的結果。
[SHA指令](https://en.wikipedia.org/wiki/Intel_SHA_extensions)現已發佈在[Goldmont架構](https://en.wikipedia.org/wiki/Goldmont)中。另請參閱[x86中是否有加速SHA(SHA1/2/256/512)編碼的指令?](http://stackoverflow.com/q/20692386)。 – jww
你真的需要密碼安全的哈希(如SHA-1)嗎?如果不是的話,那麼有一堆非常好的非加密散列函數,比如[xxhash](http://code.google.com/p/xxhash/)等等。 – claj
我認爲這主要是爲了商業展覽,但它可以更快,當然:) – claj