2016-06-10 54 views
1

我目前正在對ARM處理器上基於Linux的軟件本身進行優化。這些優化主要是ARM和ARM NEON功能的形式。但是,一旦我介紹了彙編函數,它們不會堆棧在調用它們的函數之上,而是看起來隨機的地方。然而,爲了剖析軟件,我使用了perf record和flame-graphs。ARM頭部以獲取適當的調用堆棧

因此,我的問題是,我應該在我的函數中包含哪些內容,以便它們在調用堆棧中正確顯示。

有一個稍微相關的主題,但沒有給出好的答案How to get call graph profiling working with gcc compiled code and ARM Cortex A8 target?。我使用相同的標誌加mapcs-frame。

下面我給出一個由GCC翻譯成ARM的C函數的例子。這ARM功能似乎產生體面的堆棧,但我想明白爲什麼。

int half(int in); 
int sum(int in1, int in2); 
int mean(int in1, int in2); 

int half(int i) 
{ 
    return i/2; 
} 

int sum(int i, int j) 
{ 
    return i + j; 
} 

int mean(int i, int j) 
{ 
    int s = sum(i, j); 
    int m = half(s); 
    return m; 
} 

int main() 
{ 
    int a = 1; 
    int b = 5; 
    int i; 
    int result; 
    for (i = 0; i<10000000; i++) { 
     result = mean(a, b); 
    } 
    return 0; 
} 
.cpu cortex-a9 
     .eabi_attribute 27, 3 
     .eabi_attribute 28, 1 
     .fpu neon 
     .eabi_attribute 20, 1 
     .eabi_attribute 21, 1 
     .eabi_attribute 23, 3 
     .eabi_attribute 24, 1 
     .eabi_attribute 25, 1 
     .eabi_attribute 26, 2 
     .eabi_attribute 30, 6 
     .eabi_attribute 34, 1 
     .eabi_attribute 18, 4 
     .file "a.c" 
     .text 
     .align 2 
     .global half 
     .type half, %function 
    half: 
     @ args = 0, pretend = 0, frame = 8 
     @ frame_needed = 1, uses_anonymous_args = 0 
     mov ip, sp 
     stmfd sp!, {fp, ip, lr, pc} 
     sub fp, ip, #4 
     sub sp, sp, #8 
     str r0, [fp, #-16] 
     ldr r3, [fp, #-16] 
     mov r2, r3, lsr #31 
     add r3, r2, r3 
     mov r3, r3, asr #1 
     mov r0, r3 
     sub sp, fp, #12 
     ldmfd sp, {fp, sp, pc} 
     .size half, .-half 
     .align 2 
     .global sum 
     .type sum, %function 
    sum: 
     @ args = 0, pretend = 0, frame = 8 
     @ frame_needed = 1, uses_anonymous_args = 0 
     mov ip, sp 
     stmfd sp!, {fp, ip, lr, pc} 
     sub fp, ip, #4 
     sub sp, sp, #8 
     str r0, [fp, #-16] 
     str r1, [fp, #-20] 
     ldr r2, [fp, #-16] 
     ldr r3, [fp, #-20] 
     add r3, r2, r3 
     mov r0, r3 
     sub sp, fp, #12 
     ldmfd sp, {fp, sp, pc} 
     .size sum, .-sum 
     .align 2 
     .global mean 
     .type mean, %function 
    mean: 
     @ args = 0, pretend = 0, frame = 16 
     @ frame_needed = 1, uses_anonymous_args = 0 
     mov ip, sp 
     stmfd sp!, {fp, ip, lr, pc} 
     sub fp, ip, #4 
     sub sp, sp, #16 
     str r0, [fp, #-24] 
     str r1, [fp, #-28] 
     ldr r1, [fp, #-28] 
     ldr r0, [fp, #-24] 
     bl sum 
     str r0, [fp, #-16] 
     ldr r0, [fp, #-16] 
     bl half 
     str r0, [fp, #-20] 
     ldr r3, [fp, #-20] 
     mov r0, r3 
     sub sp, fp, #12 
     ldmfd sp, {fp, sp, pc} 
     .size mean, .-mean 
     .align 2 
     .global main 
     .type main, %function 
    main: 
     @ args = 0, pretend = 0, frame = 16 
     @ frame_needed = 1, uses_anonymous_args = 0 
     mov ip, sp 
     stmfd sp!, {fp, ip, lr, pc} 
     sub fp, ip, #4 
     sub sp, sp, #16 
     mov r3, #1 
     str r3, [fp, #-20] 
     mov r3, #5 
     str r3, [fp, #-24] 
     mov r3, #0 
     str r3, [fp, #-16] 
     b .L8 
    .L9: 
     ldr r1, [fp, #-24] 
     ldr r0, [fp, #-20] 
     bl mean 
     str r0, [fp, #-28] 
     ldr r3, [fp, #-16] 
     add r3, r3, #1 
     str r3, [fp, #-16] 
    .L8: 
     ldr r2, [fp, #-16] 
     movw r3, #38527 
     movt r3, 152 
     cmp r2, r3 
     ble .L9 
     mov r3, #0 
     mov r0, r3 
     sub sp, fp, #12 
     ldmfd sp, {fp, sp, pc} 
     .size main, .-main 
     .ident "GCC: (crosstool-NG linaro-1.13.1-4.9-2014.09 - Linaro GCC 4.9-2014.09) 4.9.2 20140904 (prerelease)" 
     .section .note.GNU-stack,"",%progbits 

-------------------編輯------------------ -

下面是我嘗試整合的那種函數的例子。在連接方面,它所做的一切就是在開始時保存堆棧和鏈接寄存器並將其設置爲結束。我應該添加什麼?

.section .text 

.global ARM_smoothing 

ARM_smoothing: 
    STMFD  sp!, {r4-r12,lr} //move used registers on stack (avoid segmentation fault) 
    MOV   r5, r0    
    ADD   r0, r0, r2 
    ADD   r0, r0, r2 
    MOV   r8, r0 
    ADD   r8, r8, r2 
    ADD   r8, r8, r2  //the 6 instructions create 3 pointers to the row above and below as well as the current one 
    ADD   r1, r1, r2 
    ADD   r1, r1, r2  
    ADD   r1, r1, #2  //move destination pointer to first element (1 row down, 1 element left) 
    SUB   r2, r2, #2 
    SUB   r3, r3, #2  //counters decremented because smoothing function works with a margin of 1 on every side 
    LDR   r9, =0x1C71C71D //(1/9)*2^32 pour effectuer la division par 9 
    LDR   r10, =0x2 
    LDR   r11, =0xC  //shifts for pointers to data 
    VLDR.U64 d20, =0x1C71C71D //(1/9)*2^32 pour effectuer la division par 9 
    VLDR.U64 d22, =0x0  //initialization of zeros to be used (not ncessarily needed) 
    VLDR.U64 d23, =0x0 
    VDUP.32  d20, d20[0]  //initialize vector for multiplication 
height_loop: 
    MOV   r4, r2   //reset width counter 
    CMP   r4, #8 
    BLGE  width_loop_eight_smoothing //use neon while more than 8 elements in row need smoothing 
    CMP   r4, #1 
    BLGE  width_loop_rest //use normal ARM for remaining elements, can't do in NEON because of margin 
    ADD   r0, r0, #4  //skip margin 
    ADD   r1, r1, #4 
    ADD   r5, r5, #4 
    ADD   r8, r8, #4 
    SUBS  r3, r3, #1  //decrement row counter 
    BNE   height_loop  //loop while there still are rows 
    LDMFD  sp!, {r4-r12,pc} //restore stack and return to calling function 


width_loop_eight_smoothing: 
    SUB   r4, r4, #8  //decrement width counter 
    VLD1.16  {d0, d1}, [r5], r10  //load upper left elements 
    VLD1.16  {d2, d3}, [r5], r10  //load upper middle elements 
    VADDL.S16 q2, d0, d2    //long addition of elements to be sure to not lose any data 
    VADDL.S16 q3, d1, d3    
    VLD1.16  {d0, d1}, [r5], r11  //load upper right elements  
    VLD1.16  {d2, d3}, [r0], r10  //load middle left elements 
    VADDL.S16 q4, d0, d2 
    VADDL.S16 q5, d1, d3 
    VADD.S32 q2, q4     //add to grand total 
    VADD.S32 q3, q5 
    VLD1.16  {d0, d1}, [r0], r10  //load current elements 
    VLD1.16  {d2, d3}, [r0], r11  //load middle right elements 
    VADDL.S16 q4, d0, d2 
    VADDL.S16 q5, d1, d3 
    VADD.S32 q2, q4 
    VADD.S32 q3, q5 
    VLD1.16  {d0, d1}, [r8], r10  //load lower left elements 
    VLD1.16  {d2, d3}, [r8], r10  //load lower middle elements 
    VADDL.S16 q4, d0, d2 
    VADDL.S16 q5, d1, d3 
    VADD.S32 q2, q4 
    VADD.S32 q3, q5 
    VLD1.16  {d0, d1}, [r8], r11  //load lower right elements 
    VADDL.S16 q4, d0, d22 
    VADDL.S16 q5, d1, d23 
    VADD.S32 q2, q4 
    VADD.S32 q3, q5 
    VMULL.S32 q6, d4, d20    //divide by 9 (upper element is total divided by 9) 
    VMULL.S32 q7, d5, d20 
    VMULL.S32 q8, d6, d20 
    VMULL.S32 q9, d7, d20 
    VUZP.32  q6, q7     //pack results into less registers and smaller elements 
    VUZP.32  q8, q9 
    VUZP.16  q7, q9 
    VSHR.U16 q8, q7, #15    //when multiplied element is negative, result is always one under 
    VADD.S16 q7, q8     //rectifying by adding sign bit to total 
    VST1.16  {d14, d15}, [r1]!  //store results 
    CMP   r4, #8     //check if theres enough elements to do 8 more in NEON   
    BCS   width_loop_eight_smoothing  //if yes, loop neon code 
    MOV   PC, LR     //return to ARM_smoothing if not 



width_loop_rest:      //works similaarly to NEON but one element at a time 
    LDRSH  r6, [r0], #2   //converts loaded half words to signed full words 
    LDRSH  r7, [r0]    //main difference is with the way increments are done since there is an overlap 
    ADD   r6, r7, r6 
    LDRSH  r7, [r0, #2] 
    ADD   r6, r7, r6 
    LDRSH  r7, [r5], #2 
    ADD   r6, r7, r6 
    LDRSH  r7, [r5] 
    ADD   r6, r7, r6 
    LDRSH  r7, [r5, #2] 
    ADD   r6, r7, r6 
    LDRSH  r7, [r8], #2 
    ADD   r6, r7, r6 
    LDRSH  r7, [r8] 
    ADD   r6, r7, r6 
    LDRSH  r7, [r8, #2] 
    ADD   r6, r7, r6 
    SMULLS  r6, r7, r6, r9 
    ADDMI  r7, #1 
    STRH  r7, [r1], #2 
    SUBS  r4, #1   //decrement width counter and check if there's any left 
    BNE   width_loop_rest 
    MOV   PC, LR 
+0

那麼你有什麼彙編函數的問題看起來像?你是否正在創建相關的APCS佈局堆棧框架? – Notlikethat

+0

值得注意的是,多年前APCS已被EABI所取代,'-mapcs-frame'已經被剝奪了相當長的一段時間,而GCC傢伙正在急於完全將其刪除。也就是說,FWIW [Rick Murray的網站](http://www.heyrick.co.uk/assembler/apcsintro.html)可能是舊事物最好的參考。 – Notlikethat

+0

我添加了一種編輯功能,我嘗試進行配置文件,我不認爲我正在創建堆棧幀。 – VictorC

回答

0

你可以清楚地看到,編譯器是如何標註一些僞操作彙編...

.global mean 
    .type mean, %function 

...

.size mean, .-mean 

這些投入COFF部分,並且需要對其進行編譯,以便調用圖工具可以知道您的彙編程序函數的範圍是PC

.global ARM_smoothing 
+ .type ARM_smoothing, %function 

...

+ .size ARM_smoothing, .-ARM_smoothing 

其他僞操作取決於所需的調試信息。

其他都.fnend.fnstart.movsp.save.setfp等。

它取決於工具所期望的調試/對象格式。還有兩種類型的數據;

  1. 代碼程度信息
  2. 堆棧和幀使用

兩者都通常需要用於展開(或疊層回溯),但採樣性能工具可能僅逃脫第一。執行對象清理的異常處理代碼需要最多的信息。

相關:ARM Link and frame register

+0

這不是勺子飼料的直接答案;您需要查閱GAS文檔頁面並查看編譯器的彙編程序輸出。檢查你的工具文檔,對象(COFF/ELF)也會有所幫助。希望這個*答案在指導許多人解決問題而不是針對某個特定實例時很有用。 –