2013-02-02 96 views
7

我在一個緊密循環中實現了雙線性插值,並試圖用 SSE 對它進行優化,但完全沒有得到任何加速。(標題:SSE 雙線性插值)

下面是代碼。非 SIMD 版本使用一個簡單的向量結構,其定義爲 struct Vec3f { float x, y, z; },並爲它實現了乘法和加法運算符:

// Bilinear blend of the four pixels surrounding (x, y). 
// With USE_SIMD defined, the blend is done with MSVC inline SSE assembly and 
// the result is left in mc11; otherwise it is done with Vec3f operators and 
// the result is stored in `colour`. 
#ifdef USE_SIMD 
    // Fetch the four corner pixels; pixelCache is indexed as row * size.x + column. 
    const Color c11 = pixelCache[y1 * size.x + x1]; 
    const Color c12 = pixelCache[y2 * size.x + x1]; 
    const Color c22 = pixelCache[y2 * size.x + x2]; 
    const Color c21 = pixelCache[y1 * size.x + x2]; 

    // Expand each pixel into four floats: element 0 is a constant 1.0 filler, 
    // elements 1..3 hold B, G, R. The arrays are 16-byte aligned because 
    // movaps (used below) requires aligned memory operands. 
    __declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() }; 
    __declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() }; 
    __declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() }; 
    __declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() }; 

    // Per-corner blend weights (same products the scalar branch applies to 
    // c11, c12, c22 and c21). 
    // NOTE(review): there is no division by (x2-x1)*(y2-y1), so the weights 
    // presumably assume the corners are exactly one pixel apart - confirm. 
    const float s11 = (x2-x)*(y2-y); 
    const float s12 = (x2-x)*(y-y1); 
    const float s22 = (x-x1)*(y-y1); 
    const float s21 = (x-x1)*(y2-y); 

    // Broadcast each weight into elements 1..3; element 0 stays 1.0 so it 
    // lines up with the filler element of the colour arrays. 
    __declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11}; 
    __declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12}; 
    __declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22}; 
    __declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21}; 

    // Load the four colours into xmm0..xmm3 and the four weights into 
    // xmm4..xmm7, multiply pairwise, then accumulate everything into xmm0 
    // with three serial adds and write the sum back over mc11. 
    // After this block mc11[1..3] hold the blended B, G, R; mc11[0] ends up 
    // as 4.0 (1.0 * 1.0 summed four times) and carries no colour data. 
    __asm { 
     movaps xmm0, mc11 
     movaps xmm1, mc12 
     movaps xmm2, mc22 
     movaps xmm3, mc21 

     movaps xmm4, ms11 
     movaps xmm5, ms12 
     movaps xmm6, ms22 
     movaps xmm7, ms21 

     mulps xmm0, xmm4 
     mulps xmm1, xmm5 
     mulps xmm2, xmm6 
     mulps xmm3, xmm7 

     addps xmm0, xmm1 
     addps xmm0, xmm2 
     addps xmm0, xmm3 

     movaps mc11, xmm0 
    } 
#else 
    // Scalar path: convert the four corner pixels to float vectors. 
    const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]); 
    const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]); 
    const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]); 
    const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]); 

    // Weighted sum of the corners; each corner is scaled by the area of the 
    // rectangle between (x, y) and the diagonally opposite corner. 
    const Vec3f colour = 
      c11*(x2-x)*(y2-y) + 
      c21*(x-x1)*(y2-y) + 
      c12*(x2-x)*(y-y1) + 
      c22*(x-x1)*(y-y1); 
#endif 

重新排列彙編代碼以重用寄存器(最終只用了三個 xmm 寄存器)並沒有產生任何效果。我也嘗試過使用內在函數(intrinsics):

// Bilinear interpolation of the four pixels surrounding (x, y), written with 
// SSE intrinsics. The blended result is stored into `colour`. 
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]); 
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]); 
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]); 
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]); 

// Per-corner blend weights. 
// NOTE(review): there is no division by (x2-x1)*(y2-y1), so the weights 
// presumably assume the corners are exactly one pixel apart - confirm. 
const float s11 = (x2-x)*(y2-y); 
const float s12 = (x2-x)*(y-y1); 
const float s22 = (x-x1)*(y-y1); 
const float s21 = (x-x1)*(y2-y); 

// Pack each corner colour into a vector. _mm_set_ps takes its arguments from 
// the highest element down, so element 0 = r, 1 = g, 2 = b, 3 = 1.f filler. 
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r); 
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r); 
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r); 
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r); 

// Broadcast each weight into elements 0..2; element 3 is 1.f to match the 
// filler element of the colour vectors. 
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11); 
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12); 
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22); 
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21); 

// Scale every corner by its weight. 
mc11 = _mm_mul_ps(mc11, ms11); 
mc12 = _mm_mul_ps(mc12, ms12); 
mc22 = _mm_mul_ps(mc22, ms22); 
mc21 = _mm_mul_ps(mc21, ms21); 

// Accumulate into mc11; each add depends on the result of the previous one. 
mc11 = _mm_add_ps(mc11, mc12); 
mc11 = _mm_add_ps(mc11, mc22); 
mc11 = _mm_add_ps(mc11, mc21); 

// Unaligned store of all four elements (r, g, b and the filler, which is 4.0 
// after the adds). 
// NOTE(review): this writes 4 floats; Vec3f's definition is not visible here, 
// so confirm colour.array has room for 4 floats, not 3. 
Vec3f colour; 
_mm_storeu_ps(colour.array, mc11); 

同樣無濟於事。是我遺漏了什麼,還是在這裏根本不可能獲得任何額外的加速?

+8

我不想這麼說,但這絕對不是正確的做法。首先,你花了大量工作只是爲了填充向量(這純粹是開銷)。其次,在計算的末尾有一條討厭的依賴鏈。但從根本上說,主要問題在於你使用的是「結構體數組」(array-of-structs,AoS)的數據佈局。如果你認真對待 SIMD,就應該考慮改用「數組結構體」(struct-of-arrays,SoA)的佈局。 – Mysticial

+0

好的,關於向量的那部分我明白了,我會先嚐試重新排列數據以滿足對齊要求。不過能否請您詳細說明一下「計算末尾的依賴鏈」是什麼意思? – SimpleMan

+7

你有 3 個相互依賴的加法,所以它們都無法並行執行,因爲每一個都必須等前一個完成之後才能開始。我看到你是在做某種歸約(reduction),這種操作最好用二叉樹歸約來完成。不過你現在只合並 4 個向量,所以無論怎樣重新排列都不會有太大收益。但我猜從更大的範圍來看,你實際上是在對一組大得多的數據求和。 – Mysticial

回答

6

爲什麼要用浮點?假設 a、b、c、d 是打包的 ARGB 像素,xerr 和 yerr 的取值範圍爲 0–256,一個簡單的例子是:

// =================================================================================================================
// xs_Bilerp
//
// Integer bilinear blend of four packed 32-bit pixels, processed two 8-bit
// channels at a time (the "rb" pair under mask 0x00ff00ff and the "ag" pair
// under mask 0xff00ff00).
//
//   a, b   first pixel pair, blended from a towards b by xerr
//   c, d   second pixel pair, blended from c towards d by xerr
//   xerr   horizontal blend factor in 1/256 units
//   yerr   blend factor (1/256 units) from the a/b result towards the c/d result
//
// Returns the blended packed pixel.
// NOTE(review): the accompanying description gives xerr/yerr as 0..256;
// nothing here clamps them - confirm callers respect that range.
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
    // Channel-pair masks. They are not #undef'd, so they stay defined for the
    // rest of the translation unit.
    #define xs_rbmask 0x00ff00ff
    #define xs_agmask 0xff00ff00

    // All four inputs identical: the blend would reproduce them, so skip it.
    if (a == b && c == d && a == d) {
        return a;
    }

    // ---- first pair: a -> b by xerr ----
    const uint32 a_rb = a & xs_rbmask;
    const uint32 a_ag = a & xs_agmask;

    // Per-pair differences. The subtraction is unsigned and may wrap; the
    // wrap cancels once the scaled step is added back and masked.
    const uint32 ab_rb_step = (b & xs_rbmask) - a_rb;
    // The ag pair is shifted down 8 bits first so that the multiply by the
    // factor puts the result back in the 0xff00ff00 byte positions.
    const uint32 ab_ag_step = ((b & xs_agmask) >> 8) - (a_ag >> 8);

    const uint32 row_ab_rb = (a_rb + ((ab_rb_step * xerr) >> 8)) & xs_rbmask;
    const uint32 row_ab_ag = (a_ag + (ab_ag_step * xerr)) & xs_agmask;

    // ---- second pair: c -> d by xerr ----
    const uint32 c_rb = c & xs_rbmask;
    const uint32 c_ag = c & xs_agmask;

    const uint32 cd_rb_step = (d & xs_rbmask) - c_rb;
    const uint32 cd_ag_step = ((d & xs_agmask) >> 8) - (c_ag >> 8);

    const uint32 row_cd_rb = (c_rb + ((cd_rb_step * xerr) >> 8)) & xs_rbmask;
    const uint32 row_cd_ag = (c_ag + (cd_ag_step * xerr)) & xs_agmask;

    // ---- blend the two intermediate results by yerr ----
    const uint32 vert_rb_step = row_cd_rb - row_ab_rb;
    const uint32 vert_ag_step = (row_cd_ag >> 8) - (row_ab_ag >> 8);

    const uint32 out_rb = (row_ab_rb + ((vert_rb_step * yerr) >> 8)) & xs_rbmask;
    const uint32 out_ag = (row_ab_ag + (vert_ag_step * yerr)) & xs_agmask;

    // The two channel pairs occupy disjoint bytes, so OR recombines them.
    return out_ag | out_rb;
}