我需要將內存中的 float 大數組轉換爲 double 數組,然後再轉換回來。Visual C++ 15 Update 3 中是否有任何 SSE 編譯器內部函數(intrinsics)能對此有所幫助?Float array double array and back,
編輯:這是兩種有線格式(wire format)之間的轉換,所以 #define 無濟於事。數據結構以 float 形式存儲,但第三方處理庫需要 double 數組。
我需要將內存中的 float 大數組轉換爲 double 數組,然後再轉換回來。Visual C++ 15 Update 3 中是否有任何 SSE 編譯器內部函數(intrinsics)能對此有所幫助?Float array double array and back,
編輯:這是兩種有線格式(wire format)之間的轉換,所以 #define 無濟於事。數據結構以 float 形式存儲,但第三方處理庫需要 double 數組。
您可以使用 SSE 來完成這項工作:
float -> double:_mm_cvtps_pd
double -> float:_mm_cvtpd_ps
不過請先嘗試一個簡單的標量循環,因爲 (a) 編譯器可能無論如何都會將其向量化,(b) 你很可能受限於內存帶寬,所以 SIMD 優化可能沒有多大幫助。
這很有用,但它使用的是 128 位 XMM 寄存器,不是嗎?也就是說,每條指令只能處理兩個值。這些指令是否有改用 YMM 寄存器、一次處理 4 個值的變體? –
是的,當然,請參閱 AVX 的等價指令 [_mm256_cvtps_pd](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_pd&expand=1680,1680) 和 [_mm256_cvtpd_ps](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ps&expand=1680,1680,1621),但請注意,由於上述原因,它們相對於 SSE 版本可能不會帶來任何顯著的改進。 –
數據結構存儲爲浮點數,但第三方處理庫需要double數組。
它可以處理緩存大小的塊嗎?
如果不是受制於第三方庫,最好的辦法是即時進行轉換:用 _mm_cvtps_pd 從一對 float 加載出一對 double,並以同樣的方式存儲回 float,這樣內存中就永遠不需要有 double 數組。
但是,如果你做不到這一點,你至少可以在讀取一些 float 並寫出一些 double 之後,趁數據在 L1 或 L2 緩存中仍然是熱的時候就把它交給庫。實際上,如果它是一種「有線格式」,那麼數據在進入內存的途中大概本來就必須經過 CPU,除非你有一個零拷貝的接收 API,可以通過 DMA 直接寫入你的緩衝區。進行轉換的理想位置可能是在收到每個數據包時,以小塊爲單位進行。可以在複製時直接轉換爲 double 數組;如果你還需要原始的 float 數據,也可以同時複製到 float 和 double 兩個數組中。
庫需要一個完整的數組。我沒有來源,我閱讀並隨時轉換。仍然,兩個爲一個的價格:) –
@SevaAlekseyev:是的,盡一切可能減少數據在緩存和 RAM 之間來回移動的次數。如果你的數組很大,那麼在轉換循環中使用 NT 存儲(non-temporal store)可能是值得的。或者,也許同時寫入 float 和 double 兩個數組,並對稍後才會被用到的那個使用 NT 存儲?但是,如果所有數據都能放進 L3 緩存,就不要這樣做。 –
這並不是對你問題的實際回答,而只是一個示例,說明如何只用 ALU(整數運算)來完成轉換。如果實現得當,你可以讓它與 FPU 的類型轉換並行執行以獲得更高的速度。此解決方案應該 100% 兼容 IEEE。
更新:我把它改得更慢但更具可讀性,不過它的結果與英特爾第三代酷睿 i7 的執行結果一致(甚至連 NaN 的轉換結果在二進制上也相同)。
#include <math.h>
#include <chrono>
#include <cstring>
#include <iostream>
// Widens one IEEE-754 binary32 bit pattern to the binary64 bit pattern that
// encodes the same value, using integer operations only.
static unsigned long long toDoubleWidenBits(unsigned int fBits)
{
    const unsigned long long sign = static_cast<unsigned long long>(fBits & 0x80000000u) << 32;
    const unsigned int fExp = (fBits >> 23) & 0xFFu;
    // The 23 float fraction bits become the top 23 of the 52 double fraction bits.
    unsigned long long frac = static_cast<unsigned long long>(fBits & 0x007FFFFFu) << 29;

    if (fExp == 0xFFu) // infinity or NaN: all-ones exponent, fraction payload kept
        return sign | 0x7FF0000000000000ull | frac;
    if (fExp != 0u)    // normal number: re-bias the exponent (1023 - 127 = 896)
        return sign | (static_cast<unsigned long long>(fExp + 896u) << 52) | frac;
    if (frac == 0ull)  // signed zero
        return sign;

    // Float subnormal: every such value is a normal double. Shift the fraction
    // left until its leading 1 reaches bit 52 (the implicit-one position),
    // lowering the exponent once per shift, then drop that leading 1.
    unsigned long long dExp = 897ull;
    do
    {
        frac <<= 1;
        --dExp;
    } while (!(frac & 0x0010000000000000ull));
    return sign | (dExp << 52) | (frac & 0x000FFFFFFFFFFFFFull);
}

// Converts `count` floats from inData into doubles in outData without using
// the FPU conversion instructions.
//
// inData  - source array holding at least `count` floats.
// outData - destination array with room for at least `count` doubles.
// count   - number of elements to convert; any value is accepted (odd counts
//           included), and a count <= 0 converts nothing.
//
// Bit patterns are moved with memcpy, so the buffers need no alignment beyond
// that of float/double and no aliasing rules are broken.
void toDouble(float *inData, double *outData, int count)
{
    static_assert(sizeof(float) == sizeof(unsigned int), "float must be 32 bits wide");
    static_assert(sizeof(double) == sizeof(unsigned long long), "double must be 64 bits wide");

    for (int i = 0; i < count; ++i)
    {
        unsigned int fBits;
        std::memcpy(&fBits, inData + i, sizeof fBits);
        const unsigned long long dBits = toDoubleWidenBits(fBits);
        std::memcpy(outData + i, &dBits, sizeof dBits);
    }
}
// Narrows one IEEE-754 binary64 bit pattern to a binary32 bit pattern using
// integer operations only, rounding to nearest with ties to even.
static unsigned int toFloatNarrowBits(unsigned long long dBits)
{
    const unsigned long long sign = dBits & 0x8000000000000000ull;
    const unsigned long long dExp = dBits & 0x7FF0000000000000ull;
    const unsigned long long dFrac = dBits & 0x000FFFFFFFFFFFFFull;

    // fExp holds the float exponent at bits 52..59 and frac the fraction in
    // double layout; both are moved to their float position at the very end.
    unsigned long long fExp = 0ull;
    unsigned long long frac = 0ull;

    if (dExp >= 0x47F0000000000000ull)
    {
        // Too large for float, infinity, or NaN: all-ones exponent. A NaN
        // becomes a quiet NaN (payload dropped); everything else is infinity.
        fExp = 0x0FF0000000000000ull;
        if (dExp == 0x7FF0000000000000ull && dFrac != 0ull)
            frac = 0x0008000000000000ull;
    }
    else if (dExp > 0x3800000000000000ull)
    {
        // Normal float range: re-bias the exponent (1023 - 127 = 0x380).
        fExp = dExp - 0x3800000000000000ull;
        frac = dFrac;
    }
    else if (dExp >= 0x3600000000000000ull)
    {
        // Float subnormal range: make the implicit 1 explicit and shift the
        // mantissa right into place (shift is 1..33).
        const unsigned long long full = dFrac | 0x0010000000000000ull;
        const unsigned int shift = static_cast<unsigned int>((0x3800000000000000ull - dExp) >> 52) + 1u;
        frac = full >> shift;
        // Fold the bits that fell off into the lowest (sticky) bit so that a
        // value just above a rounding tie is still rounded up.
        if (full & ((1ull << shift) - 1ull))
            frac |= 1ull;
    }
    // else: magnitude far below the smallest float subnormal -> signed zero.

    // Round to nearest even at bit 29 (the float LSB): bit 28 is the round
    // bit, bits 0..27 are sticky bits.
    if ((frac & 0x0000000010000000ull) && (frac & 0x000000002FFFFFFFull))
        frac += 0x0000000020000000ull;
    fExp += frac & 0x7FF0000000000000ull; // a rounding carry bumps the exponent
    frac &= 0x000FFFFFE0000000ull;

    return static_cast<unsigned int>((sign | ((fExp | frac) << 3)) >> 32);
}

// Converts `count` doubles from inData into floats in outData without using
// the FPU conversion instructions.
//
// inData  - source array holding at least `count` doubles.
// outData - destination array with room for at least `count` floats.
// count   - number of elements to convert; any value is accepted (odd counts
//           included), and a count <= 0 converts nothing.
//
// Bit patterns are moved with memcpy, so the buffers need no alignment beyond
// that of float/double and no aliasing rules are broken.
void toFloat(double *inData, float *outData, int count)
{
    static_assert(sizeof(float) == sizeof(unsigned int), "float must be 32 bits wide");
    static_assert(sizeof(double) == sizeof(unsigned long long), "double must be 64 bits wide");

    for (int i = 0; i < count; ++i)
    {
        unsigned long long dBits;
        std::memcpy(&dBits, inData + i, sizeof dBits);
        const unsigned int fBits = toFloatNarrowBits(dBits);
        std::memcpy(outData + i, &fBits, sizeof fBits);
    }
}
// Checks a float -> double conversion result against the built-in cast.
//
// Returns the index of the first element whose double differs from
// (double)inData[idx], or -1 when every element matches. A position where
// both the input and the output are NaN counts as a match.
int valTestFtoD(float *inData, double *outData, int count)
{
    int idx = 0;
    while (idx < count)
    {
        const double expected = (double)inData[idx];
        // NaN never compares equal to itself, so x != x detects NaN.
        const bool bothNaN = (inData[idx] != inData[idx]) && (outData[idx] != outData[idx]);
        if (!bothNaN && expected != outData[idx])
        {
            return idx;
        }
        ++idx;
    }
    return -1;
}
// Checks a double -> float conversion result against the built-in cast.
//
// Returns the index of the first element whose float differs from
// (float)inData[pos], or -1 when every element matches. A position where
// both the input and the output are NaN counts as a match.
int valTestDtoF(double *inData, float*outData, int count)
{
    for (int pos = 0; pos != count && pos < count; ++pos)
    {
        const float reference = (float)inData[pos];
        if (reference == outData[pos])
        {
            continue;
        }
        // Values differ: only acceptable when both sides are NaN
        // (NaN is the only value for which x == x is false).
        const bool inputIsNumber = (inData[pos] == inData[pos]);
        const bool outputIsNumber = (outData[pos] == outData[pos]);
        if (inputIsNumber || outputIsNumber)
        {
            return pos;
        }
    }
    return -1;
}
// Benchmark and self-check for toDouble().
//
// Times, in milliseconds on std::chrono::steady_clock, four phases over
// 8 Mi elements: filling the arrays, the built-in float -> double cast,
// refilling the arrays with a second data set, and toDouble(). The toDouble()
// output is then verified with valTestFtoD() and the outcome is printed.
void testFloatToDouble()
{
    using Clock = std::chrono::steady_clock;
    using std::chrono::duration_cast;
    using std::chrono::milliseconds;

    std::cout << "\nSTART Float to Double TEST\n";

    int total = 1024 * 1024 * 8;
    float *floats = new float[total];
    double *doubles = new double[total];

    // Phase 1: first data set (each value is index / previous value).
    Clock::time_point mark = Clock::now();
    floats[0] = 2.0f;
    for (int k = 1; k < total; ++k)
    {
        floats[k] = k / floats[k - 1];
        doubles[k] = 0.0f;
    }
    long long elapsedMs = duration_cast<milliseconds>(Clock::now() - mark).count();
    std::cout << "init of floats and doubles done in " << elapsedMs << std::endl;

    // Phase 2: reference timing using the language's own conversion.
    mark = Clock::now();
    for (int k = 0; k < total; ++k)
    {
        doubles[k] = floats[k];
    }
    elapsedMs = duration_cast<milliseconds>(Clock::now() - mark).count();
    std::cout << "cast to double done in " << elapsedMs << std::endl;

    // Phase 3: second data set, seeded with pi and offset by e; the output
    // array is cleared so stale results cannot mask a failure.
    mark = Clock::now();
    float pi = 3.14159265358979323846;
    float e = 2.71828182845904523536;
    floats[0] = pi;
    doubles[0] = 0.0;
    for (int k = 1; k < total; ++k)
    {
        floats[k] = (e + k) / floats[k - 1];
        doubles[k] = 0.0;
    }
    elapsedMs = duration_cast<milliseconds>(Clock::now() - mark).count();
    std::cout << "init of floats and doubles done in " << elapsedMs << std::endl;

    // Phase 4: the conversion under test.
    mark = Clock::now();
    toDouble(floats, doubles, total);
    elapsedMs = duration_cast<milliseconds>(Clock::now() - mark).count();
    std::cout << "toDouble done in " << elapsedMs << std::endl;

    // Validation against the built-in cast.
    std::cout << "toDouble validation test ";
    int badIndex = valTestFtoD(floats, doubles, total);
    if (badIndex >= 0)
    {
        std::cout << "FAIL at " << badIndex << std::endl;
        std::cout << "float [" << badIndex << "]= " << floats[badIndex] << std::endl;
        std::cout << "double[" << badIndex << "]= " << doubles[badIndex] << std::endl;
    }
    else
    {
        std::cout << "OK" << std::endl;
    }

    delete[] floats;
    delete[] doubles;
    std::cout << "END TEST\n";
}
// Benchmark and self-check for toFloat().
//
// Times, in milliseconds on std::chrono::steady_clock, four phases over
// 8 Mi elements: filling the arrays, the built-in double -> float cast,
// refilling the arrays with a second data set, and toFloat(). The toFloat()
// output is then verified with valTestDtoF() and the outcome is printed.
void testDoubleToFloat()
{
    typedef std::chrono::steady_clock Timer;
    typedef std::chrono::milliseconds Millis;

    std::cout << "\nSTART Double to Float TEST\n";

    int n = 1024 * 1024 * 8;
    float *narrow = new float[n];
    double *wide = new double[n];

    // Phase 1: first data set (each value is index / previous value).
    Timer::time_point t0 = Timer::now();
    wide[0] = 2.0f;
    for (int j = 1; j < n; ++j)
    {
        wide[j] = j / wide[j - 1];
        narrow[j] = 0.0f;
    }
    long long ms = std::chrono::duration_cast<Millis>(Timer::now() - t0).count();
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 2: reference timing using the language's own conversion.
    t0 = Timer::now();
    for (int j = 0; j < n; ++j)
    {
        narrow[j] = (float)wide[j];
    }
    ms = std::chrono::duration_cast<Millis>(Timer::now() - t0).count();
    std::cout << "cast to float done in " << ms << std::endl;

    // Phase 3: second data set, seeded with pi and offset by e; the output
    // array is cleared so stale results cannot mask a failure.
    t0 = Timer::now();
    double pi = 3.14159265358979323846;
    double e = 2.71828182845904523536;
    wide[0] = pi;
    narrow[0] = 0.0f;
    for (int j = 1; j < n; ++j)
    {
        wide[j] = (e + j) / wide[j - 1];
        narrow[j] = 0.0f;
    }
    ms = std::chrono::duration_cast<Millis>(Timer::now() - t0).count();
    std::cout << "init of floats and doubles done in " << ms << std::endl;

    // Phase 4: the conversion under test.
    t0 = Timer::now();
    toFloat(wide, narrow, n);
    ms = std::chrono::duration_cast<Millis>(Timer::now() - t0).count();
    std::cout << "toFloat done in " << ms << std::endl;

    // Validation against the built-in cast.
    std::cout << "toFloat validation test ";
    int firstBad = valTestDtoF(wide, narrow, n);
    if (firstBad >= 0)
    {
        std::cout << "FAIL at " << firstBad << std::endl;
        std::cout << "double[" << firstBad << "]= " << wide[firstBad] << std::endl;
        std::cout << "float[" << firstBad << "]= " << narrow[firstBad] << std::endl;
    }
    else
    {
        std::cout << "OK" << std::endl;
    }

    delete[] narrow;
    delete[] wide;
    std::cout << "END TEST\n";
}
// Program entry point: runs the float -> double benchmark first, then the
// double -> float benchmark, and exits with status 0.
int main()
{
    testFloatToDouble();
    testDoubleToFloat();
    return 0;
}
這需要太多的 ALU 指令,不值得這樣做。 (https://godbolt.org/g/1EURvH)。我認爲,即使您手動將其從 64 位向量化爲 128 位(就像打包轉換指令那樣),也不值得花費任何週期、佔用執行資源來代替 FPU 轉換。 'CVTPS2PD xmm,m64' 在 Haswell 上是 2 個微操作(uop,使用端口 0 和一個加載端口),吞吐量爲每時鐘週期一條。 ymm,m128 版本也一樣。因此,Intel CPU 的轉換速度已經快到足以跟上加載/存儲(每個時鐘週期一個向量結果)。 (來源:http://agner.org/optimize/) –
將這個混合到一個加載/轉換/存儲循環只會降低它在x86上的速度。 –
@PeterCordes也許你是對的,因爲它比x64上的正常轉換慢2倍,正如我所說這不是一個解決方案,它只是一個例子,你可以做到這一點。我也沒有適當的技巧來在彙編級別優化它,但我知道你可以在單核上並行浮動和整數操作,這是我認爲他可以使用這種解決方案的基礎。 – Logman
'#IEEE兼容定義雙浮動「或反之亦然 – LogicStuff
無法爲您節省鑄造每一個值。 –
您是否預期在轉換回來時會有輕微的精度損失?如果不是,只需單向轉換並保留原始數據,就可以省去轉換回來的步驟。 –