0
下面展開的子公司的執行產生了錯誤的結果,我設法觀察到只有元素b[0]
和b[2]
被計數,而b[1]
和b[3]
不是。Aarch64程序集LD2問題
#include <stdio.h>
int count_multiple_bits(unsigned long long *b, int size) {
unsigned long long *d = b;
int c;
__asm__("LD2 {v0.D, v1.D}[0], [%1], #16 \n\t"
"LD2 {v0.D, v1.D}[1], [%1] \n\t"
"CNT v0.16b, v0.16b \n\t"
"CNT v1.16b, v1.16b \n\t"
"UADDLV h2, v0.16b \n\t"
"UADDLV h2, v1.16b \n\t"
"UMOV %0, v2.d[0] \n\t"
: "+r"(c)
: "r"(d) : "v0", "v1", "v2");
return c;
}
int main(int argc, const char *argv[]) {
unsigned long long bits[] = { -1ull, -1ull, -1ull, -1ull };
printf("Test: %i\n", count_multiple_bits(bits, 4));
return 0;
}
這一次它一次計數2元正常工作:
int count_multiple_bits(unsigned long long *b, int size) {
unsigned long long *d = b;
int c;
__asm__("LD1 {v0.D}[0], [%1], #8 \n\t"
"LD1 {v0.D}[1], [%1] \n\t"
"CNT v0.16b, v0.16b \n\t"
"UADDLV h1, v0.16b \n\t"
"UMOV %0, v1.d[0] \n\t"
: "+r"(c)
: "r"(d) : "v0", "v1");
return c;
}
與所有其他條件相同,我猜負載是錯誤的,這裏的佈局我相信:
v0.D[0] = b[0]
v1.D[0] = b[1]
v0.D[1] = b[2]
v1.D[1] = b[3]
任何特別的原因使用對'LD2的'在一個LD1 {v0.2D,v1.2D},[%1],#32'上?無論如何,當你只是將它們混合在一起時,似乎沒有必要在單個通道上解交叉數據的麻煩。 – Notlikethat
哇,這樣好多了,我沒有意識到(沒有正確解讀docs語法)多謝'LD''的'2D' /'#32'調試! – arul