2015-09-09 53 views
1

我知道轉換MMX 32位mmx內在函數不再允許__m64。所以我很難將這段代碼升級到SSE。我被告知另一個堆棧溢出帖子發佈我的代碼。也許這個練習也會幫助其他人。我做了什麼錯誤我的MMX Intrinsics轉換爲x64(SSE)?

我評論說'_mm_empty'認爲這是正確的做法。我在emmintrin.h中找到了所有其他__m128i操作的函數,但仍有一些錯誤。

原來的32位功能代碼:

DWORD CSumInsideHorizontalTask::InternalDoWork() 
{ 
    //////////////////////////////////////////////////////////// 
    // get local vars representing parameters from original call 
    ushort* arrayIn  = m_taskdata.arrayIn; 
    ushort arrayLen0 = m_taskdata.arrayLen0; 
    ushort arrayLen1 = m_taskdata.arrayLen1; 
    ushort* kernel  = m_taskdata.kernel; 
    ushort kernelLen = m_taskdata.kernelLen; 
    uint32_t* norm_r  = m_taskdata.norm_r; 
    ushort* outputArray = m_taskdata.outputArray; 

    ushort* interArray = m_taskdata.interArray; 
    //////////////////////////////////////////////////////////// 

    ushort tailLength = (ushort)((kernelLen - 1)/2); 

    _ASSERTE(interArray); 

    //ushort* pRow = NULL; // the current row 
    //ushort* pInterRow = NULL; // the current row in the interarray 

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic 
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
    INT_PTR rowstride = sizeof(ushort)*arrayLen1; 
    INT_PTR lpKernel; 

    // adjust for non-zero start 
    lpRow += m_nRowStart*rowstride; 
    lpInterRow += m_nRowStart*rowstride; 

    // want to process only those (edge) pixels that need the innner loop condition 
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge 
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; 
    INT_PTR lpInterRowInside; // use this to work inside the edges 

    int h, i; 
    uint sum, points; 
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel 
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop 
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; 

    // use this for MMX optimizations 
    int fourcount = kernelLen/4; 
    int remainingcount = kernelLen%4; 
    int mmxcount = 4*fourcount; // this is where the remainder is handled 
    int loopcount = 0; // use the for fast looping tests 

    _mm_empty(); 
    __m64 accu, temp; 
    __m64 shifter = _m_from_int(32); 

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row 
    { 
     // skip over left edge 
     lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

     for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges 
     { 
      sum = 0; 
      points = 0; 
      lpKernel = (INT_PTR)kernel; 

      lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row 

      // MMX Optimizations 
      accu = _mm_setzero_si64(); // zero the accumulator 

      // VECTOR processing 
      for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector 
      { 
       //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 

       // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] 
       // _mm_add_pi32/_m_paddd: 2*32bit add 
       temp = _m_pmaddwd(*(__m64*)lpKernel, *(__m64*)lpInnerPixels); 

       accu = _mm_add_pi32(accu, temp); // each double word has a partial sum 

       lpKernel += 8; lpInnerPixels += 8; 

      } // loop over the kernel 

      // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 
      // and finally store the result into the variable "accu" 
      accu = _mm_add_pi32(accu, _mm_srl_si64(accu, shifter)); // combine results from upper and lower double words 

      sum = _m_to_int(accu); // move mmx result to sum 

      // SCALAR 
      for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector 
      { 
       //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 
       sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels)); 
       //points++; 
       lpKernel += 2; lpInnerPixels += 2; 
      } // loop over the kernel 


      //*(interArray + h * arrayLen1 + i) = (ushort)(sum/*(norm_r + points - 1)); 

      *(ushort*)lpInterRowInside = (ushort)(sum/norm); 
      lpInterRowInside += 2; // move to next column sizeof(ushort) 
     } // for each column 


     lpRow += rowstride; // move to next row (h * arrayLen1) 
     lpInterRow += rowstride; 


    } // for each row 

    _mm_empty(); 

    return 0; 

} 

64位嘗試:

DWORD CSumInsideHorizontalTask::InternalDoWork() 
{ 
    //////////////////////////////////////////////////////////// 
    // get local vars representing parameters from original call 
    ushort* arrayIn  = m_taskdata.arrayIn; 
    ushort arrayLen0 = m_taskdata.arrayLen0; 
    ushort arrayLen1 = m_taskdata.arrayLen1; 
    ushort* kernel  = m_taskdata.kernel; 
    ushort kernelLen = m_taskdata.kernelLen; 
    uint32_t* norm_r  = m_taskdata.norm_r; 
    ushort* outputArray = m_taskdata.outputArray; 

    ushort* interArray = m_taskdata.interArray; 
    //////////////////////////////////////////////////////////// 

    ushort tailLength = (ushort)((kernelLen - 1)/2); 

    _ASSERTE(interArray); 

    //ushort* pRow = NULL; // the current row 
    //ushort* pInterRow = NULL; // the current row in the interarray 

    INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic 
    INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
    INT_PTR rowstride = sizeof(ushort)*arrayLen1; 
    INT_PTR lpKernel; 

    // adjust for non-zero start 
    lpRow += m_nRowStart*rowstride; 
    lpInterRow += m_nRowStart*rowstride; 


    // want to process only those (edge) pixels that need the innner loop condition 
    const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge 
    const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; 
    INT_PTR lpInterRowInside; // use this to work inside the edges 

    int h, i; 
    uint sum, points; 
    uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel 
    INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop 
    INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; 

    // use this for MMX optimizations 
    int fourcount = kernelLen/4; 
    int remainingcount = kernelLen%4; 
    int mmxcount = 4*fourcount; // this is where the remainder is handled 
    int loopcount = 0; // use the for fast looping tests 

    //_mm_empty(); 
    __m128i accu, temp; 
    __m128i shifter = _mm_cvtsi32_si128(32); 

    for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row 
    { 
     // skip over left edge 
     lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

     for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges 
     { 
      sum = 0; 
      points = 0; 
      lpKernel = (INT_PTR)kernel; 

      lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row 

      // MMX Optimizations 
      accu = _mm_setzero_si128(); // zero the accumulator 

      // VECTOR processing 
      for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector 
      { 
       //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 

       // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] 
       // _mm_add_pi32/_m_paddd: 2*32bit add 
       //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels); 
       temp = _mm_madd_epi16(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels); 

       accu = _mm_add_epi32(accu, temp); // each double word has a partial sum 

       lpKernel += 8; lpInnerPixels += 8; 

      } // loop over the kernel 

      // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 
      // and finally store the result into the variable "accu" 
      accu = _mm_add_epi32(accu, _mm_sll_epi64(accu, shifter)); // combine results from upper and lower double words 

      sum = _mm_cvtsi128_si32(accu); // move mmx result to sum 

      // SCALAR 
      for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector 
      { 
       //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 
       sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels)); 
       //points++; 
       lpKernel += 2; lpInnerPixels += 2; 
      } // loop over the kernel 


      //*(interArray + h * arrayLen1 + i) = (ushort)(sum/*(norm_r + points - 1)); 

      *(ushort*)lpInterRowInside = (ushort)(sum/norm); 
      lpInterRowInside += 2; // move to next column sizeof(ushort) 
     } // for each column 


     lpRow += rowstride; // move to next row (h * arrayLen1) 
     lpInterRow += rowstride; 


    } // for each row 

    //_mm_empty(); 

    return 0; 

} 
+1

如果我是你,我的第一步是做一個「試驗」項目中,我無非是想驗證加載,總結,存儲的等我的SSE註冊內部函數的行爲如預期。 – RyanP

+1

但是對於記錄'(__m128i *)lpKernel'將不起作用。實際上您需要使用對齊和/或未對齊的加載命令將對齊和未對齊內存中的內容加載到SSE寄存器中,然後才能對它們進行處理。 – RyanP

+0

你的意思是其中之一: @RyanP'extern __m128i _mm_load_si128(__ m128i const * _P); 'code'extern __m128i _mm_loadu_si128(__ m128i const * _P); extern __m128i _mm_loadl_epi64(__ m128i const * _P);' –

回答

1

利用上述評價中提到的固定的所有問題。 下面是最終工作的x64 SSE卷積碼:

DWORD CSumInsideHorizontalTask::InternalDoWork() 
{ 
//////////////////////////////////////////////////////////// 
// get local vars representing parameters from original call 
ushort* arrayIn  = m_taskdata.arrayIn; 
ushort arrayLen0 = m_taskdata.arrayLen0; 
ushort arrayLen1 = m_taskdata.arrayLen1; 
ushort* kernel  = m_taskdata.kernel; 
ushort kernelLen = m_taskdata.kernelLen; 
uint32_t* norm_r  = m_taskdata.norm_r; 
ushort* outputArray = m_taskdata.outputArray; 

ushort* interArray = m_taskdata.interArray; 
//////////////////////////////////////////////////////////// 

ushort tailLength = (ushort)((kernelLen - 1)/2); 

_ASSERTE(interArray); 

//ushort* pRow = NULL; // the current row 
//ushort* pInterRow = NULL; // the current row in the interarray 

INT_PTR lpRow = (INT_PTR)arrayIn; // for integer pointer arithmatic 
INT_PTR lpInterRow = (INT_PTR)interArray; // for integer pointer arithmatic 
INT_PTR rowstride = sizeof(ushort)*arrayLen1; 
INT_PTR lpKernel; 

// adjust for non-zero start 
lpRow += m_nRowStart*rowstride; 
lpInterRow += m_nRowStart*rowstride; 


// want to process only those (edge) pixels that need the innner loop condition 
const int knLeftEdgeMax = kernelLen - 1 - tailLength; // go from 0 to the end of the left edge 
const int knRightEdgeStart = arrayLen1 - kernelLen + 1 + tailLength; 
INT_PTR lpInterRowInside; // use this to work inside the edges 

int h, i; 
uint sum, points; 
uint32_t norm = norm_r[kernelLen-1]; // always process the full kernel 
INT_PTR lpInnerPixels; // use this to simplify the pointer math in the kernel loop 
INT_PTR cbLeftEdgeStride = 2*knLeftEdgeMax; 

// use this for MMX optimizations 
int fourcount = kernelLen/4; 
int remainingcount = kernelLen%4; 
int mmxcount = 4*fourcount; // this is where the remainder is handled 
int loopcount = 0; // use the for fast looping tests 

//_mm_empty(); 
__m128i accu, temp, mlpkernel, mlpInnerPixels; 
__m128i shifter = _mm_cvtsi32_si128(32); 

for (h=m_nRowStart; h < m_nRowEnd; h++) // for each row 
{ 
    // skip over left edge 
    lpInterRowInside = lpInterRow + cbLeftEdgeStride; 

    for (i = knLeftEdgeMax; i < knRightEdgeStart; i++) // for each inside the edges 
    { 
     sum = 0; 
     points = 0; 
     lpKernel = (INT_PTR)kernel; 

     lpInnerPixels = lpRow + ((i - tailLength)<<1); // this is where we start in the row 

     // MMX Optimizations 
     accu = _mm_setzero_si128(); // zero the accumulator 

     // VECTOR processing 
     for (loopcount = fourcount; loopcount != 0; loopcount--) // // for each kernel item that can be processed as a vector 
     { 
      //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 

      // _m_pmaddwd: : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3] 
      // _mm_add_pi32/_m_paddd: 2*32bit add 
      //temp = _m_pmaddwd(*(__m128i*)lpKernel, *(__m128i*)lpInnerPixels); 
      //mlpkernel = _mm_cvtsi32_si128(lpKernel); 
      mlpkernel = _mm_cvtsi64_si128(*(__int64*)lpKernel); 
      mlpInnerPixels = _mm_cvtsi64_si128(*(__int64*)lpInnerPixels); 
      temp = _mm_madd_epi16(mlpkernel, mlpInnerPixels); 

      accu = _mm_add_epi32(accu, temp); // each double word has a partial sum 

      lpKernel += 8; lpInnerPixels += 8; 

     } // loop over the kernel 

     // copy hi-dword of mm0 to lo-dword of mm1, then sum mmo+mm1 
     // and finally store the result into the variable "accu" 
     accu = _mm_add_epi32(accu, _mm_srl_epi64(accu, shifter)); // combine results from upper and lower double words 

     sum = _mm_cvtsi128_si32(accu); // move mmx result to sum 

     // SCALAR 
     for (loopcount = remainingcount; loopcount != 0; loopcount--) // for each kernel item that couldn't be processed as a vector 
     { 
      //sum += (uint)(*(kernel + j) * *(arrayIn + h * arrayLen1 + i - tailLength + j)); 
      sum += (uint)((*(ushort*)lpKernel) * *(ushort*)(lpInnerPixels)); 
      //points++; 
      lpKernel += 2; lpInnerPixels += 2; 
     } // loop over the kernel 


     //*(interArray + h * arrayLen1 + i) = (ushort)(sum/*(norm_r + points - 1)); 

     *(ushort*)lpInterRowInside = (ushort)(sum/norm); 
     lpInterRowInside += 2; // move to next column sizeof(ushort) 
    } // for each column 


    lpRow += rowstride; // move to next row (h * arrayLen1) 
    lpInterRow += rowstride; 


} // for each row 

//_mm_empty(); 

return 0; 

}