7
我在一個緊密循環中實現雙線性插值並試圖用SSE優化它,但是我從它得到零加速。SSE雙線性插值
下面是代碼,非SIMD版本使用其可以與實現乘法和加法運算符被定義爲struct Vec3f { float x, y, z; }
一個簡單的矢量的結構:
#ifdef USE_SIMD
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};
__asm {
movaps xmm0, mc11
movaps xmm1, mc12
movaps xmm2, mc22
movaps xmm3, mc21
movaps xmm4, ms11
movaps xmm5, ms12
movaps xmm6, ms22
movaps xmm7, ms21
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm3
movaps mc11, xmm0
}
#else
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
const Vec3f colour =
c11*(x2-x)*(y2-y) +
c21*(x-x1)*(y2-y) +
c12*(x2-x)*(y-y1) +
c22*(x-x1)*(y-y1);
#endif
重新排列彙編代碼重用寄存器(結束了只有三個xmm寄存器)沒有任何效果。我也嘗試使用內在函數:
// perform bilinear interpolation
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);
mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);
mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);
Vec3f colour;
_mm_storeu_ps(colour.array, mc11);
並無濟於事。我錯過了什麼,或者在這裏獲得任何額外的速度是不可能的?
我討厭這樣說,但這絕對不是正確的方法。首先,你花這麼多工作只是填充矢量(這是純粹的開銷)。然後在計算結束時你有一個討厭的依賴鏈。但從根本上來說,主要問題是您正在使用數組結構打包。如果你對SIMD很認真,你應該考慮切換到數組結構。 – Mysticial
好吧,我瞭解了有關載體的信息,我會嘗試首先重新排列數據以尊重對齊。但是請您詳細說明「計算結束時的依賴鏈」嗎? – SimpleMan
你有3個添加,這取決於對方。所以他們都不能並行完成,因爲在開始下一個之前必須先完成。我看到你正在做某種減少 - 這是使用二叉樹縮減的最佳方式。現在你只能組合4個向量。所以無論如何重新排列它都沒有多大的收穫。但我懷疑從更大的角度來看,你實際上正在總結一組更大的數字。 – Mysticial