2013-02-07 248 views
1

我想將 GCC 彙編代碼轉換爲 ARMASM 彙編代碼,有人可以幫我嗎?主要問題在於 .req、.unreq、.qn 和 .dn 這幾條僞指令,我想知道它們在 armasm 中的等價寫法。我嘗試過使用別名,但沒有成功。 轉換GCC彙編代碼爲armasm彙編代碼

@ Assembler setup for ne10_fir_float_neon (GNU as, ARM target).
@ The routine uses Thumb-2 forms (shifted-register operands, post-indexed
@ loads, flag-setting ops on high registers) which GNU as only accepts in
@ unified syntax, so select it explicitly before switching to Thumb.
.text
.syntax unified
.align 4                        @ power-of-two alignment on ARM: 2^4 = 16 bytes
.global ne10_fir_float_neon     @ exported entry point
.extern ne10_qMaskTable32       @ lane-mask table, defined in another object
.thumb
.thumb_func                     @ marks the next label as a Thumb function

@-----------------------------------------------------------------------
@ ne10_fir_float_neon
@
@ FIR filter over a block of 32-bit float samples, four outputs per
@ outer-loop iteration.
@
@ In:   r0 = pStateStruct  filter instance, read as:
@                            +0  halfword  numTaps
@                            +4  word      state buffer pointer
@                            +8  word      coefficient pointer
@       r1 = pSrc          input samples
@       r2 = pDst          output samples
@       r3 = blockSize     number of samples to process
@ Uses: ne10_qMaskTable32, indexed in 16-byte entries by (count & 3).
@       NOTE(review): presumably entry k has its first k 32-bit lanes set
@       and the rest clear - confirm against the table definition.
@ Regs: r4-r12 are saved/restored; lr is saved and popped into pc.
@       q0-q3 and q8-q15 are overwritten (q4-q7 are not touched).
@
@ All vector loads/stores move a full 16 bytes, also for partial groups:
@ a partial result is merged with the existing memory contents via VBSL
@ and the whole vector is written back.
@-----------------------------------------------------------------------
ne10_fir_float_neon:
        PUSH  {r4-r12,lr}               @ push r12 too: keeps the stack 8-byte aligned

        @ ---- ARM register aliases ----
        pStateStruct    .req R0
        pSrc            .req R1
        pDst            .req R2
        blockSize       .req R3

        pState          .req R4         @ state pointer
        pCoeffs         .req R5         @ coefficient pointer
        pStateCurnt     .req R6         @ where new input samples are written in the state buffer

        pX              .req R7         @ temporary pointer into the state buffer
        pB              .req R8         @ temporary pointer into the coefficient buffer
        numTaps         .req R9         @ length of the filter

        tapCnt          .req R10        @ inner (tap) loop counter
        Count           .req R11        @ copy loop counter
        pTemp           .req R11        @ scratch pointer (same register as Count)
        pMask           .req R14        @ mask table base (lr was saved by the PUSH above)

        mask            .req R12        @ mask table index

        @ ---- NEON register aliases ----
        qInp            .qn Q0.F32
        dInp_0          .dn D0.F32
        dInp_1          .dn D1.F32
        qCoeff          .qn Q1.F32
        dCoeff_0        .dn D2.F32
        dCoeff_1        .dn D3.F32
        qZero           .qn Q2.F32

        @ Q3 is shared: it holds a select mask going into VBSL and the
        @ selected data afterwards (read back through dOut_* / dMask_*).
        qMask           .qn Q3.U32
        dMask_0         .dn D6.U32
        dMask_1         .dn D7.U32
        dOut_0          .dn D6.F32
        dOut_1          .dn D7.F32

        qAcc0           .qn Q8.F32
        dAcc0_0         .dn D16.F32
        dAcc0_1         .dn D17.F32

        qTemp           .qn Q9.F32
        dTemp_0         .dn D18.F32
        dTemp_1         .dn D19.F32

        qTemp1          .qn Q10.F32
        dTemp1_0        .dn D20.F32
        dTemp1_1        .dn D21.F32
        qTemp2          .qn Q11.F32
        qTemp3          .qn Q12.F32
        qMask1          .qn Q13.U32     @ block-size mask (shares Q13 with qAcc2, see below)
        dMask1_0        .dn D26.U32
        dMask1_1        .dn D27.U32
        qMaskTmp        .qn Q14.U32     @ tap-count mask
        dMaskTmp_0      .dn D28.U32
        dMaskTmp_1      .dn D29.U32

        @ Partial-product registers for the main inner loop.
        @ qAcc1 overlaps qMask (Q3) and qAcc2 overlaps qMask1 (Q13).
        qAcc1           .qn Q3.F32
        qAcc2           .qn Q13.F32
        qAcc3           .qn Q15.F32

        @ ---- Load the instance fields; r0 ends up 12 bytes past its entry value ----
        LDRH  numTaps,[pStateStruct],#4
        LDR   pState,[pStateStruct],#4
        LDR   pCoeffs,[pStateStruct],#4

        @ The state buffer holds the previous (numTaps - 1) samples.
        @ pStateCurnt = &state[numTaps - 1]: where new input is appended.
        SUB   mask,numTaps,#1
        LDR   pMask,=ne10_qMaskTable32
        AND   tapCnt,numTaps,#3         @ tapCnt = numTaps % 4
        ADD   pStateCurnt,pState,mask,LSL #2
        AND   mask,blockSize,#3         @ mask = blockSize % 4 (kept until firCopyData)

        @ Fetch the masks used when numTaps / blockSize are not multiples of 4.
        ADD   pTemp,pMask,tapCnt,LSL #4
        VEOR  qZero,qZero
        ADD   pX,pMask,mask,LSL #4
        VLD1  {dMaskTmp_0,dMaskTmp_1},[pTemp]
        VLD1  {dMask1_0,dMask1_1},[pX]

        @ blockSize is biased by -4 while the 4-wide outer loop runs.
        SUBS  blockSize,#4
        BLT   firEndOuterLoop

        @ ---- Main path: compute 4 outputs per iteration ----
firOuterLoop:
        VLD1  {dTemp_0,dTemp_1},[pSrc]!
        MOV   pX,pState
        MOV   pB,pCoeffs

        @ Append the 4 new input samples to the state buffer.
        VST1  {dTemp_0,dTemp_1},[pStateCurnt]!
        @ Zero the accumulator.
        VEOR  qAcc0,qAcc0
        @ First four state samples.
        VLD1  {dInp_0,dInp_1},[pX]!

        @ First four coefficients.
        VLD1  {dCoeff_0,dCoeff_1},[pB]!
        @ Process 4 taps per inner iteration.
        SUBS  tapCnt,numTaps,#4
        @ Next four state samples (source for the VEXT windows).
        VLD1  {dTemp_0,dTemp_1},[pX]!

        BLT   firEndInnerLoop

firInnerLoop:
        @ qTemp1..3 are the sample window shifted by 1..3 elements.
        @ Each window is scaled by one coefficient lane; the three partial
        @ products are then summed into qAcc0.
        VEXT  qTemp1,qInp,qTemp,#1
        VMLA  qAcc0,qInp,dCoeff_0[0]
        VMUL  qAcc1,qTemp1,dCoeff_0[1]

        VEXT  qTemp2,qInp,qTemp,#2
        VMUL  qAcc2,qTemp2,dCoeff_1[0]  @ overwrites Q13, i.e. destroys qMask1
        VADD  qAcc0, qAcc0, qAcc1

        VEXT  qTemp3,qInp,qTemp,#3
        VMUL  qAcc3,qTemp3,dCoeff_1[1]
        VADD  qAcc0, qAcc0, qAcc2

        VMOV  qInp,qTemp
        VLD1  {dTemp_0,dTemp_1},[pX]!
        VADD  qAcc0, qAcc0, qAcc3

        SUBS  tapCnt,#4
        @ Next four coefficients.
        VLD1  {dCoeff_0,dCoeff_1},[pB]!

        BGE   firInnerLoop
firEndInnerLoop:

        ADDS  tapCnt, tapCnt, #4        @ undo the bias: tapCnt = taps still to do
        BEQ   firStoreOutput

        @ Filter length is not a multiple of 4: keep only the remaining
        @ coefficients (others become zero) and apply up to three more taps.
        VMOV  qMask,qMaskTmp
        VBSL  qMask,qCoeff,qZero        @ Q3 = masked coefficients, read via dOut_*
        VEXT  qTemp1,qInp,qTemp,#1
        VMLA  qAcc0,qInp,dOut_0[0]
        VEXT  qTemp2,qInp,qTemp,#2
        VMLA  qAcc0,qTemp1,dOut_0[1]
        VMLA  qAcc0,qTemp2,dOut_1[0]

firStoreOutput:
        @ Advance the state pointer by 4 samples for the next group.
        ADD   pState,#16

        @ Store the 4 float outputs to the destination buffer.
        SUBS  blockSize,#4
        VST1  {dAcc0_0,dAcc0_1},[pDst]!

        BGE   firOuterLoop

        @ ---- Tail path: blockSize not a multiple of 4 ----
firEndOuterLoop:
        ADDS  blockSize,#4              @ undo the bias: blockSize = samples left (0..3)
        BEQ   firCopyData

        @ qAcc2 shares Q13 with qMask1, so the VMUL in firInnerLoop has
        @ overwritten the block mask loaded during setup. Reload it: mask
        @ still holds blockSize % 4 and pMask the table base; pTemp (r11)
        @ is free here.
        ADD   pTemp,pMask,mask,LSL #4
        VLD1  {dMask1_0,dMask1_1},[pTemp]

        @ Merge the remaining input samples into the state buffer: masked
        @ lanes come from the new input, the rest keep their old contents.
        VMOV  qMask,qMask1
        VLD1  {dTemp1_0,dTemp1_1},[pStateCurnt]
        VLD1  {dTemp_0,dTemp_1},[pSrc]

        ADD   pSrc,pSrc,blockSize,LSL #2
        MOV   pX,pState
        MOV   pB,pCoeffs

        VBSL  qMask,qTemp,qTemp1
        VST1  {dMask_0,dMask_1},[pStateCurnt]
        VLD1  {dInp_0,dInp_1},[pX]!

        ADD   pStateCurnt,pStateCurnt,blockSize, LSL #2

        @ Zero the accumulator.
        VEOR  qAcc0,qAcc0
        VLD1  {dCoeff_0,dCoeff_1},[pB]!
        SUBS  tapCnt,numTaps,#4
        VLD1  {dTemp_0,dTemp_1},[pX]!

        BLT   firEndInnerLoop1

firInnerLoop1:
        @ Same 4-tap step as firInnerLoop, but accumulating directly into
        @ qAcc0 (does not touch Q3/Q13/Q15).
        VEXT  qTemp1,qInp,qTemp,#1
        VMLA  qAcc0,qInp,dCoeff_0[0]
        VEXT  qTemp2,qInp,qTemp,#2
        VMLA  qAcc0,qTemp1,dCoeff_0[1]
        VEXT  qTemp3,qInp,qTemp,#3
        VMLA  qAcc0,qTemp2,dCoeff_1[0]
        VMOV  qInp,qTemp
        VMLA  qAcc0,qTemp3,dCoeff_1[1]
        VLD1  {dCoeff_0,dCoeff_1},[pB]!
        SUBS  tapCnt,#4
        VLD1  {dTemp_0,dTemp_1},[pX]!

        BGE   firInnerLoop1
firEndInnerLoop1:

        @ Remaining taps; executed unconditionally on this path, with the
        @ tap mask deciding which coefficients contribute.
        VMOV  qMask,qMaskTmp
        VBSL  qMask,qCoeff,qZero
        VEXT  qTemp1,qInp,qTemp,#1
        VMLA  qAcc0,qInp,dOut_0[0]
        VEXT  qTemp2,qInp,qTemp,#2
        VMLA  qAcc0,qTemp1,dOut_0[1]
        VMLA  qAcc0,qTemp2,dOut_1[0]
        VMOV  qMask,qMask1
        VLD1  {dTemp_0,dTemp_1},[pDst]

        @ Mask the unwanted outputs: masked lanes take the new results,
        @ the other lanes keep what was already at pDst.
        VBSL  qMask,qAcc0,qTemp
        VST1  {dMask_0,dMask_1},[pDst]
        ADD   pDst,pDst,blockSize,LSL #2
        ADD   pState,pState,blockSize,LSL #2

        @ ---- Shift the state buffer down for the next call ----
firCopyData:
        @ Copy (numTaps - 1) samples from pState to the start of the state
        @ buffer, 4 at a time, finishing with a masked partial vector.
        SUB   numTaps,numTaps,#1
        AND   mask,numTaps,#3
        LDR   pStateCurnt,[pStateStruct,#-8]    @ r0 is entry+12, so this re-reads the +4 field (state pointer)
        ADD   pTemp,pMask,mask,LSL #4
        VLD1  {dInp_0,dInp_1},[pState]!
        VLD1  {dMask_0,dMask_1},[pTemp] @ must precede the Count writes: pTemp and Count are both r11

        SUBS  Count,numTaps,#4
        BLT   firEnd
firCopyLoop:
        VST1  {dInp_0,dInp_1},[pStateCurnt]!
        SUBS  Count,#4
        VLD1  {dInp_0,dInp_1},[pState]!
        BGE   firCopyLoop

firEnd:
        @ Final partial vector: merge with the existing destination lanes.
        VLD1  {dTemp_0,dTemp_1},[pStateCurnt]
        VBSL  qMask,qInp,qTemp
        VST1  {dOut_0,dOut_1},[pStateCurnt]
        ADD   pStateCurnt,pStateCurnt,mask, LSL #2      @ result is not read again before the return

        @ Return from function: restore r4-r12 and pop the saved lr into pc.
        POP  {r4-r12,pc}
        @ Release every register alias created with .req/.qn/.dn above so
        @ the names do not leak into code assembled after this point.
        @ ARM core register aliases
        .unreq pStateStruct 
        .unreq pSrc 
        .unreq pDst 
        .unreq blockSize 

        .unreq pState 
        .unreq pCoeffs 
        .unreq pStateCurnt 

        .unreq pX 
        .unreq pB 
        .unreq numTaps 

        .unreq tapCnt 
        .unreq Count 
        .unreq pTemp 
        .unreq pMask 

        .unreq mask 

        @ NEON register aliases
        .unreq qInp 
        .unreq dInp_0 
        .unreq dInp_1 
        .unreq qCoeff 
        .unreq dCoeff_0 
        .unreq dCoeff_1 
        .unreq qZero 

        .unreq qMask 
        .unreq dMask_0 
        .unreq dMask_1 
        .unreq dOut_0 
        .unreq dOut_1 

        .unreq qAcc0 
        .unreq dAcc0_0 
        .unreq dAcc0_1 

        .unreq qTemp 
        .unreq dTemp_0 
        .unreq dTemp_1 

        .unreq qTemp1 
        .unreq dTemp1_0 
        .unreq dTemp1_1 
        .unreq qTemp2 
        .unreq qTemp3 
        .unreq qMask1 
        .unreq dMask1_0 
        .unreq dMask1_1 
        .unreq qMaskTmp 
        .unreq dMaskTmp_0 
        .unreq dMaskTmp_1 

        .unreq qAcc1 
        .unreq qAcc2 
        .unreq qAcc3 
        @ End of the assembly source; the assembler ignores anything after .end
        .end 
+1

@BoPersson:我認爲他在兩個彙編器之間轉換同一個cpu。 – Jester

回答

1

明白了:.req 與 armasm 的 RN 等價。我另外做了一些調整,並刪除了不適用的僞指令,現在可以正常工作了!