2014-01-09 111 views
1

我有一個代碼片段。片段只是加載2個數組,並使用SSE計算它們之間的點積。_mm_load_ps導致段錯誤

代碼在這裏:

using namespace std; 

long long size = 3200000; 

float* _random() 
{ 
    unsigned int seed = 123; 
    // float *t = malloc(size*sizeof(float)); 
    float *t = new float[size]; 
    int i; 
    float num = 0.0; 
    for(i=0; i < size; i++) { 
     num = rand()/(RAND_MAX+1.0); 
     t[i] = num; 
    } 
    return t; 
} 

float _dotProductVectorSSE(float *s1, float *s2) 
{ 
    float prod; 
    int i; 
    __m128 X, Y, Z; 

    for(i=0; i<size; i+=4) 
    { 
     X = _mm_load_ps(&s1[i]); 
     Y = _mm_load_ps(&s2[i]); 
     X = _mm_mul_ps(X, Y); 
     Z = _mm_add_ps(X, Z); 
    } 

    float *v = new float[4]; 
    _mm_store_ps(v,Z); 

    for(i=0; i<4; i++) 
    { 
//  prod += Z[i]; 
     std::cout << v[i] << endl; 
    } 

    return prod; 
} 

int main(int argc, char *argv[]) 
{ 
    QCoreApplication a(argc, argv); 
    time_t start, stop; 
    double avg_time = 0; 
    double cur_time; 
    float* s1 = NULL; 
    float* s2 = NULL; 
    for(int i = 0; i < 100; i++) 
    { 
     s1 = _random(); 
     s2 = _random(); 
     start = clock(); 
     float sse_product = _dotProductVectorSSE(s1, s2); 
     stop = clock(); 
     cur_time = ((double) stop-start)/CLOCKS_PER_SEC; 
     avg_time += cur_time; 
    } 
    std::cout << "Averagely used " << avg_time/100 << " seconds." << endl; 
    return a.exec(); 
} 

當我跑,我有段故障。這裏是回溯:

(gdb) bt 
0 0x0804965f in _mm_load_ps (__P=0xb6b56008) at /usr/lib/gcc/i586-suse-linux/4.6/include/xmmintrin.h:899 
1 _dotProductVectorSSE (s1=0xb6b56008, s2=0xb5f20008) at ../simd/simd.cpp:37 
2 0x0804987f in main (argc=1, argv=0xbfffee84) at ../simd/simd.cpp:80 

Diassembler:

0x8049b30   push %ebp 
0x8049b31 <+0x0001>   push %edi 
0x8049b32 <+0x0002>   push %esi 
0x8049b33 <+0x0003>   push %ebx 
0x8049b34 <+0x0004>   sub $0x2c,%esp 
0x8049b37 <+0x0007>   mov 0x804c0a4,%esi 
0x8049b3d <+0x000d>   mov 0x40(%esp),%edx 
0x8049b41 <+0x0011>   mov 0x44(%esp),%ecx 
0x8049b45 <+0x0015>   mov 0x804c0a0,%ebx 
0x8049b4b <+0x001b>   cmp $0x0,%esi 
0x8049b4e <+0x001e>   jl  0x8049b7a <_Z20_dotProductVectorSSEPfS_+74> 
0x8049b50 <+0x0020>   jle 0x8049c10 <_Z20_dotProductVectorSSEPfS_+224> 
0x8049b56 <+0x0026>   add $0xffffffff,%ebx 
0x8049b59 <+0x0029>   adc $0xffffffff,%esi 
0x8049b5c <+0x002c>   xor %eax,%eax 
0x8049b5e <+0x002e>   shrd $0x2,%esi,%ebx 
0x8049b62 <+0x0032>   add $0x1,%ebx 
0x8049b65 <+0x0035>   shl $0x2,%ebx 
**0x8049b68 <+0x0038>   movaps (%edx,%eax,4),%xmm0** 
0x8049b6c <+0x003c>   mulps (%ecx,%eax,4),%xmm0 
0x8049b70 <+0x0040>   add $0x4,%eax 
0x8049b73 <+0x0043>   cmp %ebx,%eax 
0x8049b75 <+0x0045>   addps %xmm0,%xmm1 
0x8049b78 <+0x0048>   jne 0x8049b68 <_Z20_dotProductVectorSSEPfS_+56> 
0x8049b7a <+0x004a>   movaps %xmm1,0x10(%esp) 
0x8049b7f <+0x004f>   xor %ebx,%ebx 

我使用QtCreator和定義在.pro文件:

QMAKE_CXXFLAGS += -msse -msse2 
DEFINES += __SSE__ 
DEFINES += __SSE2__ 
DEFINES += __MMX__ 

請告訴我該怎麼解決這個問題!

回答

4

你是不是保證你的數據是16字節對齊(malloc/new不是一般足夠了) - 你要麼需要使用_mm_loadu_ps而不是_mm_load_ps處理你可能不對齊的數據,或者最好使用合適的方法分配對齊的內存(例如,Linux上的posix_memalign)。

請注意,如果可能的話,您應該使用_mm_load_ps和16字節對齊的內存,否則請使用_mm_loadu_ps但請注意,這可能會在某些(較舊的)CPU上顯着降低性能。

+0

嗨保羅,我是內部編程新手。我看看_mm_loadu_ps,並且知道地址不需要是16字節對齊的。但請告訴我,如何確保數據16字節對齊? –