我一直在使用AltiVec實現基本的數學運算,以此爲即將開始的項目學習SIMD。此外,爲了瞭解它的性能優勢,我記錄了執行這些操作所需的時間,但遇到了一些奇怪的現象。SIMD與AltiVec:爲什麼兩個向量相乘比兩個向量相加更快?
我做的第一件事是將兩個向量相加以及將兩個向量相減。這工作正常。接下來我做的是將兩個向量相乘。然而,乘法比加法更快,儘管根據我這款CPU的數據表對所用指令的說明,加法所需的時鐘週期比乘法更少。
我有兩個數組,每個大小爲10MB,並分別用下面這兩個函數處理它們:
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
int iterations = size/(sizeof(__vector int32_t)/sizeof(int32_t));
__vector int32_t* tempA = (__vector int32_t *) intArrayA;
__vector int32_t* tempB = (__vector int32_t *) intArrayB;
__vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
for(int i = 0; i < iterations; i++)
{
__vector int32_t sum = vec_add(*tempA, *tempB);
vec_st(sum, 0, tempOut);
tempA++;
tempB++;
tempOut++;
}
}
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
int iterations = size/(sizeof(__vector int16_t)/sizeof(int16_t));
__vector int16_t* tempA = (__vector int16_t *) intArrayA;
__vector int16_t* tempB = (__vector int16_t *) intArrayB;
__vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
for(int i = 0; i < iterations; i++)
{
__vector int32_t productEven = vec_mule(*tempA, *tempB);
__vector int32_t productOdd = vec_mulo(*tempA, *tempB);
__vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
__vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);
vec_st(mergedProductHigh, 0, tempOut);
tempOut++;
vec_st(mergedProductLow, 0, tempOut);
tempA++;
tempB++;
tempOut++;
}
}
在我的平臺上,av_AddValues需要81ms來處理,而av_MultiplyValues需要48ms。(時間使用std::chrono::high_resolution_clock記錄)
爲什麼乘法的處理時間比加法更短?
我不認爲32位值相加與16位值相乘會產生差異,因爲__vector類型始終處理16個字節的數據。
我的第一個想法是,因爲將數字相加是一件很簡單的任務,所以CPU完成運算的速度比從內存中取數據要快。而對於乘法運算,由於CPU忙於計算,這種取數延遲被掩蓋了,因此不需要等待那麼久。
這是一個正確的假設嗎?
完整代碼:
#include <chrono>
#include <random>
#include <limits>
#include <iostream>
#include <cassert>
#include <cstring>
#include <cstdint>
#include <malloc.h>
#include <altivec.h>
#undef vector
// Forward declarations; the definitions follow main().

// Input generators: fill both input arrays with random values and zero the output array.
void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size);
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size);
// Benchmark drivers: allocate buffers, time the kernel 20 times and verify its output.
void TestAdd();
void TestMultiply();
// AltiVec kernels under test; `size` is an element count, not a byte count.
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size);
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size);
/// Program entry point: runs the addition benchmark, then the multiplication
/// benchmark. Each benchmark prints its own timings.
int main()
{
    TestAdd();
    TestMultiply();
    return 0;
}
/// Fills the two 16-bit input arrays with random values and zeroes the output.
///
/// Values are drawn uniformly from the full int16_t range by a std::mt19937
/// engine seeded from std::random_device.
///
/// @param inputABuffer first array to fill, `size` elements
/// @param inputBBuffer second array to fill, `size` elements
/// @param outputBuffer array to clear, `size` elements
/// @param size         number of elements in each array
void GenerateRandom16bitValues(int16_t* inputABuffer, int16_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    using Limits = std::numeric_limits<int16_t>;

    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_int_distribution<> valueRange(Limits::min(), Limits::max());

    int index = 0;
    while (index < size)
    {
        // Draw for A first, then B, one pair per element.
        inputABuffer[index] = valueRange(engine);
        inputBBuffer[index] = valueRange(engine);
        outputBuffer[index] = 0;
        ++index;
    }
}
/// Fills the two 32-bit input arrays with random values and zeroes the output.
///
/// Values are drawn uniformly from the full int32_t range by a std::mt19937
/// engine seeded from std::random_device.
///
/// @param inputABuffer first array to fill, `size` elements
/// @param inputBBuffer second array to fill, `size` elements
/// @param outputBuffer array to clear, `size` elements
/// @param size         number of elements in each array
void GenerateRandom32bitValues(int32_t* inputABuffer, int32_t* inputBBuffer, int32_t* outputBuffer, int size)
{
    using Limits = std::numeric_limits<int32_t>;

    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_int_distribution<> valueRange(Limits::min(), Limits::max());

    int index = 0;
    while (index < size)
    {
        // Draw for A first, then B, one pair per element.
        inputABuffer[index] = valueRange(engine);
        inputBBuffer[index] = valueRange(engine);
        outputBuffer[index] = 0;
        ++index;
    }
}
/// Benchmarks av_AddValues on 10'485'760 random int32_t pairs.
///
/// Allocates three 64-byte-aligned buffers, fills the inputs with random data,
/// then runs the kernel 20 times. Each run is timed with
/// std::chrono::high_resolution_clock, its output is verified element by
/// element, the elapsed milliseconds are printed, and the output buffer is
/// cleared before the next run.
///
/// @note All checks use assert, so they vanish when compiled with NDEBUG.
void TestAdd()
{
    const int size = 10'485'760;
    const std::size_t bytes = static_cast<std::size_t>(size) * sizeof(int32_t);

    int32_t* inputABuffer = static_cast<int32_t*>(memalign(64, bytes));
    int32_t* inputBBuffer = static_cast<int32_t*>(memalign(64, bytes));
    int32_t* outputBuffer = static_cast<int32_t*>(memalign(64, bytes));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom32bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for (int run = 0; run < 20; ++run)
    {
        const auto start = std::chrono::high_resolution_clock::now();
        av_AddValues(inputABuffer, inputBBuffer, outputBuffer, size);
        const auto end = std::chrono::high_resolution_clock::now();
        const auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for (int k = 0; k < size; ++k)
        {
            // The inputs span the full int32_t range, so a signed reference
            // sum could overflow (undefined behaviour). Compare in unsigned
            // arithmetic, which wraps the same way the vector add does.
            assert(static_cast<uint32_t>(outputBuffer[k]) ==
                   static_cast<uint32_t>(inputABuffer[k]) + static_cast<uint32_t>(inputBBuffer[k]));
        }

        std::cout << "Vector Sum - " << diff.count() << "ms\n";

        // Clear the whole buffer: memset takes a byte count, not an element count.
        memset(outputBuffer, 0, bytes);
    }

    free(inputABuffer);
    free(inputBBuffer);
    free(outputBuffer);
}
/// Benchmarks av_MultiplyValues on 10'485'760 random int16_t pairs.
///
/// Allocates two 64-byte-aligned int16_t input buffers and one int32_t output
/// buffer, fills the inputs with random data, then runs the kernel 20 times.
/// Each run is timed with std::chrono::high_resolution_clock, its output is
/// verified element by element, the elapsed milliseconds are printed, and the
/// output buffer is cleared before the next run.
///
/// @note All checks use assert, so they vanish when compiled with NDEBUG.
void TestMultiply()
{
    const int size = 10'485'760;
    const std::size_t inputBytes = static_cast<std::size_t>(size) * sizeof(int16_t);
    const std::size_t outputBytes = static_cast<std::size_t>(size) * sizeof(int32_t);

    int16_t* inputABuffer = static_cast<int16_t*>(memalign(64, inputBytes));
    int16_t* inputBBuffer = static_cast<int16_t*>(memalign(64, inputBytes));
    int32_t* outputBuffer = static_cast<int32_t*>(memalign(64, outputBytes));
    assert(inputABuffer != nullptr);
    assert(inputBBuffer != nullptr);
    assert(outputBuffer != nullptr);

    GenerateRandom16bitValues(inputABuffer, inputBBuffer, outputBuffer, size);

    for (int run = 0; run < 20; ++run)
    {
        const auto start = std::chrono::high_resolution_clock::now();
        av_MultiplyValues(inputABuffer, inputBBuffer, outputBuffer, size);
        const auto end = std::chrono::high_resolution_clock::now();
        const auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

        for (int k = 0; k < size; ++k)
        {
            // int16 operands are promoted to int, so the reference product cannot overflow.
            assert(outputBuffer[k] == (inputABuffer[k] * inputBBuffer[k]));
        }

        std::cout << "Vector product - " << diff.count() << "ms\n";

        // Clear the whole buffer: memset takes a byte count, not an element count.
        memset(outputBuffer, 0, outputBytes);
    }

    free(inputABuffer);
    free(inputBBuffer);
    free(outputBuffer);
}
void av_AddValues(int32_t* intArrayA, int32_t* intArrayB, int32_t* outputBuffer, int size)
{
int iterations = size/(sizeof(__vector int32_t)/sizeof(int32_t));
__vector int32_t* tempA = (__vector int32_t *) intArrayA;
__vector int32_t* tempB = (__vector int32_t *) intArrayB;
__vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
for(int i = 0; i < iterations; i++)
{
__vector int32_t sum = vec_add(*tempA, *tempB);
vec_st(sum, 0, tempOut);
tempA++;
tempB++;
tempOut++;
}
}
void av_MultiplyValues(int16_t* intArrayA, int16_t* intArrayB, int32_t* outputBuffer, int size)
{
int iterations = size/(sizeof(__vector int16_t)/sizeof(int16_t));
__vector int16_t* tempA = (__vector int16_t *) intArrayA;
__vector int16_t* tempB = (__vector int16_t *) intArrayB;
__vector int32_t* tempOut = (__vector int32_t *) outputBuffer;
for(int i = 0; i < iterations; i++)
{
__vector int32_t productEven = vec_mule(*tempA, *tempB);
__vector int32_t productOdd = vec_mulo(*tempA, *tempB);
__vector int32_t mergedProductHigh = vec_mergeh(productEven, productOdd);
__vector int32_t mergedProductLow = vec_mergel(productEven, productOdd);
vec_st(mergedProductHigh, 0, tempOut);
tempOut++;
vec_st(mergedProductLow, 0, tempOut);
tempA++;
tempB++;
tempOut++;
}
}
perf stat和perf record的輸出:
Adding
Performance counter stats for './alti':
2151.146080 task-clock (msec) # 0.999 CPUs utilized
9 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
30957 page-faults # 0.014 M/sec
3871497132 cycles # 1.800 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
1504538891 instructions # 0.39 insns per cycle
234038234 branches # 108.797 M/sec
687912 branch-misses # 0.29% of all branches
270305159 L1-dcache-loads # 125.656 M/sec
79819113 L1-dcache-load-misses # 29.53% of all L1-dcache hits
<not supported> LLC-loads
<not supported> LLC-load-misses
2.152697186 seconds time elapsed
CPU Utilization
76.04% alti alti [.] av_AddValues
Multiply
Performance counter stats for './alti':
1583.016640 task-clock (msec) # 0.999 CPUs utilized
4 context-switches # 0.003 K/sec
0 cpu-migrations # 0.000 K/sec
20717 page-faults # 0.013 M/sec
2849050875 cycles # 1.800 GHz
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
1520409634 instructions # 0.53 insns per cycle
179185029 branches # 113.192 M/sec
535437 branch-misses # 0.30% of all branches
205341530 L1-dcache-loads # 129.715 M/sec
27124936 L1-dcache-load-misses # 13.21% of all L1-dcache hits
<not supported> LLC-loads
<not supported> LLC-load-misses
1.584145737 seconds time elapsed
CPU Utilization
60.35% alti alti [.] av_MultiplyValues
您是如何測量的?測量了多少次?兩個測試您都執行了嗎?請發佈一個[MCVE]。 – EOF
那些時間看起來非常長 - 您在編譯時啓用了優化(例如'-O3')嗎?此外,您使用的是什麼CPU,時鐘頻率是多少? –
@EOF 我編輯了帖子,加入了一個可運行的示例。起初我只運行了一次,但現在我對所測量的兩個例程都循環運行多次,時間是一致的。加法需要81ms,乘法需要48ms。如帖子所述,我只是使用std::chrono::high_resolution_clock來測量時間。有更好的選擇嗎? – shaboinkin