我正在嘗試使用SSE來進行矩陣乘法。我已經爲4x4矩陣編寫了一個簡單的程序。一切似乎都很好,但是當我打印結果時,它的一些垃圾值。請幫忙弄清楚問題。其次,程序在我釋放內存時停止工作,而不是程序的正確結束。使用SSE內在函數的矩陣乘法
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <float.h>
#include <xmmintrin.h>
void main() {
float **a, **b, **c;
int a_r = 4, a_c = 4, b_c = 4, b_r = 4;
int i, j, k;
/* allocate memory for matrix one */
a = (float **)malloc(sizeof(float) * a_r);
for (i = 0; i < a_c; i++) {
a[i] = (float *)malloc(sizeof(float) * a_c);
}
/* allocate memory for matrix two */
b = (float **)malloc(sizeof(float) * b_r);
for (i = 0; i < b_c; i++) {
b[i] = (float *)malloc(sizeof(float) * b_c);
}
/* allocate memory for sum matrix */
c = (float **)malloc(sizeof(float) * a_r);
for (i = 0; i < b_c; i++) {
c[i] = (float *)malloc(sizeof(float) * b_c);
}
printf("Initializing matrices...\n");
//initializing first matrix
for (i = 0; i < a_r; i++) {
for (j = 0; j < a_c; j++) {
a[i][j] = 2;
}
}
// initializing second matrix
for (i = 0; i < b_r; i++) {
for (j = 0; j < b_c; j++) {
b[i][j] = 2;
}
}
/* initialize product matrix */
for (i = 0; i < a_r; i++) {
for (j = 0; j < b_c; j++) {
c[i][j] = 0;
}
}
int count = 0;
/* multiply matrix one and matrix two */
for (i = 0; i < a_r; i++) {
for (j = 0; j < a_c; j++) {
count = 0;
__m128 result = _mm_setzero_ps();
for (k = 0; k < 4; k += 4) {
__m128 row1 = _mm_loadu_ps(&a[i][k]);
__m128 row2 = _mm_loadu_ps(&b[k][j]);
result = _mm_mul_ps(row1, row2);
for (int t = 1; t < 4; t++) {
__m128 row3=_mm_loadu_ps(&a[t * 4]);
__m128 row4=_mm_loadu_ps(&b[i][t]);
__m128 row5 = _mm_mul_ps(row3,row4);
result = _mm_add_ps(row5, result);
}
_mm_storeu_ps(&c[i][j], result);
}
}
}
printf("******************************************************\n");
printf ("Done.\n");
for (i = 0; i < a_r ; i++) {
for (j = 0; j < b_c; j++) {
printf ("%f ", c[i][j]); // issue here when I print results.
}
printf("\n");
} // Here program stops working.
/*free memory*/
for (i = 0; i < a_r; i++) {
free(a[i]);
}
free(a);
for (i = 0; i < a_c; i++) {
free(b[i]);
}
free(b);
for (i = 0; i < b_c; i++) {
free(c[i]);
}
free(c);
}
請看看輸出矩陣打印的地址。如何獲得對齊的地址,我有_aligned_malloc
,但仍然沒有對齊。
可能是因爲您分配數組不對齊* * – meowgoesthedog
@spug任何想法如何對齊或檢查aligment? – Sarmad
_stops working_是什麼意思?它崩潰了嗎?或凍結?當您在調試器中檢查它時會發生什麼?你使用什麼編譯器? – Useless