
CUDA: strange behavior when accessing a vector

I have implemented a simple FFT program in CUDA. This is the kernel function:

__global__ 
void fftKernel(cuComplex* dev_samples, 
       size_t length, 
       size_t llog, 
       Direction direction) 
{ 
    int tid = threadIdx.x + blockDim.x * blockIdx.x; 
    if (tid < length/2) { 

     // First step: sort data with bit reversal and compute new coefficients 
     cuComplex c1 = dev_samples[bit_reverse(tid * 2) >> (32 - llog)]; 
     cuComplex c2 = dev_samples[bit_reverse(tid * 2 + 1) >> (32 - llog)]; 

     dev_samples[tid * 2] = cuCaddf(c1, c2); 
     dev_samples[tid * 2 + 1] = cuCsubf(c1, c2); 

     __syncthreads(); 

     int k = tid * 2; 
     int block_start = tid * 2; 

     // Butterfly propagation 
     for (int i = 1, n = 4; i < llog; i++, n *= 2) { 

      int new_block_start = (k/n) * n; 
      if (new_block_start != block_start) { 
       block_start = new_block_start; 
       k -= n/4; 
      } 

      // We compute 
      // X(k) = E(k) + TF(k, N) * O(k) 
      // and 
      // X(k + N/2) = E(k) - TF(k, N) * O(k) 
      c1 = dev_samples[k]; 
      c2 = cuCmulf(twiddle_factor(k % n, n, direction), dev_samples[k + n/2]); 

      dev_samples[k] = cuCaddf(c1, c2); 
      dev_samples[k + n/2] = cuCsubf(c1, c2); 

      __syncthreads(); 
     } 

     // Normalize if backward transforming 
     if (direction == Backward) { 
      dev_samples[tid].x /= length; 
      dev_samples[tid].y /= length; 
      dev_samples[tid + length/2].x /= length; 
      dev_samples[tid + length/2].y /= length; 
     } 


     // Strange behavior if using this snippet 
     // instead of the previous one 
     /* 
     if (direction == Backward) { 
      dev_samples[tid * 2].x /= length; 
      dev_samples[tid * 2].y /= length; 
      dev_samples[tid * 2 + 1].x /= length; 
      dev_samples[tid * 2 + 1].y /= length; 
     } */ 
    } 
} 

Now, if I use this code, I get the expected output, that is:

(1.5, 0) 
(-0.5, -0.5) 
(-0.5, 0) 
(-0.5, 0.5) 

with length = 4. However, if I use the commented-out section instead, I get a wrong result:

(1.5, 0) 
(-0.5, -0.5) 
(1, 0) <--- wrong 
(-0.5, 0.5) 

This is strange, because the only change between the two versions is the access pattern to the vector. In the first case, each thread accesses two elements that are length/2 apart. In the second case, each thread accesses two adjacent elements. This makes no sense to me... could it be a bigger underlying problem?
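For reference, here is a tiny host-side sketch (not part of the original program, just an illustration) that prints the indices each thread touches in the two variants, for length = 4:

#include <cstdio> 

int main() 
{ 
    const int length = 4; 
    for (int tid = 0; tid < length / 2; tid++) { 
        // First variant: each thread accesses tid and tid + length/2 
        printf("thread %d, strided:  %d %d\n", tid, tid, tid + length / 2); 
        // Second variant: each thread accesses two adjacent elements 
        printf("thread %d, adjacent: %d %d\n", tid, tid * 2, tid * 2 + 1); 
    } 
    return 0; 
} 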

EDIT: running with cuda-memcheck reports no errors and... the bug does not show up!

EDIT: full (non-)working copy:

#include <iostream> 
#include <stdio.h> 
#include <cuda.h> 
#include <cuComplex.h> 
#include <math_constants.h> 

static void cuCheck(cudaError_t err, 
        const char *file, 
        int line) { 
    if (err != cudaSuccess) { 
    printf("%s in %s at line %d\n", cudaGetErrorString(err), 
      file, line); 
    exit(EXIT_FAILURE); 
    } 
} 
#define CU_CHECK(err) (cuCheck(err, __FILE__, __LINE__)) 

#define BLOCK_SIZE 512 

enum Direction { 
    Forward, 
    Backward 
}; 

/* Computes the in-place fft of dev_samples. 
* Returns true if the fft has been successfully computed, false otherwise. */ 
bool fft(cuComplex* dev_samples, size_t length, Direction direction); 

/* Returns true if n is a power of 2. Stores in the variable pointed to 
* by exp the base-2 logarithm of the smallest power of 2 
* that is greater than or equal to n. */ 
__device__ __host__ 
bool isPowerOf2(int n, int* exp = NULL); 

/* Computes the twiddle factor */ 
__device__ 
cuComplex twiddle_factor(int k, size_t n, Direction direction); 
__device__ 
unsigned int bit_reverse(unsigned int i); 

__global__ 
void fftKernel(cuComplex* dev_samples, 
      size_t length, 
      size_t llog, 
      Direction direction); 

__device__ __host__ 
bool isPowerOf2(int n, int* exp) 
{ 
    int tmp_exp = 0; 
    int i; 
    for (i = 1; i < n; i *= 2, tmp_exp++); 
    if (exp) { 
    *exp = tmp_exp; 
    } 
    return (i == n); 
} 

bool fft(cuComplex* dev_samples, size_t length, Direction direction) 
{ 
    int llog; 
    if (!isPowerOf2(length, &llog)) 
    return false; 

    int gridSize = max((int)length/2/BLOCK_SIZE, (int)1); 
    fftKernel<<<BLOCK_SIZE, gridSize>>>(dev_samples, 
            length, 
            llog, 
            direction); 
    CU_CHECK(cudaDeviceSynchronize()); 


    return true; 
} 

__global__ 
void fftKernel(cuComplex* dev_samples, 
      size_t length, 
      size_t llog, 
      Direction direction) 
{ 
    int tid = threadIdx.x + blockDim.x * blockIdx.x; 
    if (tid < length/2) { 

    // First step: sort data with bit reversal and compute new coefficients 
    cuComplex c1 = dev_samples[bit_reverse(tid * 2) >> (32 - llog)]; 
    cuComplex c2 = dev_samples[bit_reverse(tid * 2 + 1) >> (32 - llog)]; 

    __syncthreads(); 

    dev_samples[tid * 2] = cuCaddf(c1, c2); 
    dev_samples[tid * 2 + 1] = cuCsubf(c1, c2); 

    __syncthreads(); 

    int k = tid * 2; 
    int block_start = tid * 2; 

    // Butterfly propagation 
    for (int i = 1, n = 4; i < llog; i++, n *= 2) { 

     int new_block_start = (k/n) * n; 
     if (new_block_start != block_start) { 
      block_start = new_block_start; 
      k -= n/4; 
     } 

     // We compute 
     // X(k) = E(k) + TF(k, N) * O(k) 
     // and 
     // X(k + N/2) = E(k) - TF(k, N) * O(k) 
     c1 = dev_samples[k]; 
     c2 = cuCmulf(twiddle_factor(k % n, n, direction), dev_samples[k + n/2]); 

     dev_samples[k] = cuCaddf(c1, c2); 
     dev_samples[k + n/2] = cuCsubf(c1, c2); 

     __syncthreads(); 
    } 

    //--------------------- ERROR HERE ----------------------- 
    // Normalize if backward transforming 
    if (direction == Backward) { 
     dev_samples[tid * 2].x /= length; 
     dev_samples[tid * 2].y /= length; 
     dev_samples[tid * 2 + 1].x /= length; 
     dev_samples[tid * 2 + 1].y /= length; 
    } 

    // This works!! 
    /*if (direction == Backward) { 
     dev_samples[tid * 2].x /= length; 
     dev_samples[tid * 2].y /= length; 
     dev_samples[tid * 2 + 1].x /= length; 
     dev_samples[tid * 2 + 1].y /= length; 
    }*/ 
     //--------------------------------------------------------- 
    } 
} 

__device__ 
cuComplex twiddle_factor(int k, size_t n, Direction direction) 
{ 
    if (direction == Forward) 
    return make_cuFloatComplex(cos(-2.0*CUDART_PI_F*k/n), 
           sin(-2.0*CUDART_PI_F*k/n)); 
    else 
    return make_cuFloatComplex(cos(2.0*CUDART_PI_F*k/n), 
           sin(2.0*CUDART_PI_F*k/n)); 
} 

__device__ 
unsigned int bit_reverse(unsigned int i) { 
    register unsigned int mask = 0x55555555; // 0101... 
    i = ((i & mask) << 1) | ((i >> 1) & mask); 
    mask = 0x33333333; // 0011... 
    i = ((i & mask) << 2) | ((i >> 2) & mask); 
    mask = 0x0f0f0f0f; // 00001111... 
    i = ((i & mask) << 4) | ((i >> 4) & mask); 
    mask = 0x00ff00ff; // 0000000011111111... 
    i = ((i & mask) << 8) | ((i >> 8) & mask); 
    // 00000000000000001111111111111111 no need for mask 
    i = (i << 16) | (i >> 16); 
    return i; 
} 

#define SIGNAL_LENGTH 4 

using namespace std; 

int main(int argc, char** argv) 
{ 
    size_t mem_size = SIGNAL_LENGTH*sizeof(cuComplex); 
    cuComplex* signal = (cuComplex*)malloc(mem_size); 
    cuComplex* dev_signal; 


    for (int i = 0; i < SIGNAL_LENGTH; i++) { 
    signal[i].x = i; 
    signal[i].y = 0; 
    } 

    CU_CHECK(cudaMalloc((void**)&dev_signal, mem_size)); 
    CU_CHECK(cudaMemcpy(dev_signal, signal, mem_size, cudaMemcpyHostToDevice)); 

    fft(dev_signal, SIGNAL_LENGTH, Backward); 

    CU_CHECK(cudaMemcpy(signal, dev_signal, mem_size, cudaMemcpyDeviceToHost)); 

    cout << "polito_Z_out:" << endl; 
    for (int i = 0; i < SIGNAL_LENGTH; i++) { 
    cout << " (" << signal[i].x << ", " << signal[i].y << ")" << endl; 
    } 

    cudaFree(dev_signal); 
    free(signal); 
} 
+1

You are (potentially) using '__syncthreads()' in divergent code. It's not clear whether that is actually the case, but if it is, that's undefined behavior. A complete reproducible example is always useful. I strongly suggest running the "buggy" kernel through the cuda-memcheck tool set (the synccheck and racecheck tools are very useful). – Jez

+0

When you talk about divergent code, do you mean the 'if (tid < length/2)' branch? – manu34

+0

That's correct. I think you should be fine then. Did you run everything through cuda-memcheck using racecheck/synccheck? – Jez

Answers

1

I am pretty sure this is not correct:

fftKernel<<<BLOCK_SIZE, gridSize>>>(...); 

The first kernel configuration parameter should be the grid dimension. The second should be the block dimension.

I was able to reproduce the error you describe with the posted code. When I reversed those two parameters:

fftKernel<<<gridSize, BLOCK_SIZE>>>(...); 

the error disappeared.
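For completeness, a minimal sketch of the corrected launch (the round-up grid-size computation is an assumption, not taken from the original post):

// Grid dimension first, block dimension second. 
int threads = BLOCK_SIZE; 
int blocks = (int)((length / 2 + threads - 1) / threads); // one thread per sample pair 
fftKernel<<<blocks, threads>>>(dev_samples, length, llog, direction); 
CU_CHECK(cudaGetLastError());      // catches launch configuration errors 
CU_CHECK(cudaDeviceSynchronize()); // catches errors raised during execution 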

FWIW, the synccheck tool does report barrier errors, even with the above "fix". synccheck is new in CUDA 7 and requires a cc3.5 or higher device.
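One possible way to make those barrier errors go away (a sketch, assuming only the guarded work needs to be conditional) is to keep every __syncthreads() outside the if (tid < length/2) guard, so that all threads of a block reach each barrier:

// Sketch: every thread of the block reaches the barriers; only the 
// guarded work depends on tid < length/2. 
int tid = threadIdx.x + blockDim.x * blockIdx.x; 
bool active = (tid < length / 2); 

cuComplex c1, c2; 
if (active) { 
    c1 = dev_samples[bit_reverse(tid * 2) >> (32 - llog)]; 
    c2 = dev_samples[bit_reverse(tid * 2 + 1) >> (32 - llog)]; 
} 
__syncthreads(); // no thread of the block skips this barrier 
if (active) { 
    dev_samples[tid * 2] = cuCaddf(c1, c2); 
    dev_samples[tid * 2 + 1] = cuCsubf(c1, c2); 
} 
__syncthreads(); 
// ... the same pattern applies to the butterfly loop and the normalization 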

+0

You are right! Now I just have to figure out why the error occurs when I have 512 blocks of 1 thread each. Even though it is not efficient, shouldn't it work the same? – manu34

+1

Got it! __syncthreads() only synchronizes threads within the same thread block! – manu34
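If the butterfly stages really do need synchronization across thread blocks, one common workaround (sketched here with a hypothetical fftStageKernel that is not part of the posted code) is to launch one kernel per stage; kernel launches issued to the same stream execute in order, which gives grid-wide synchronization between stages:

// fftStageKernel (hypothetical) would perform a single butterfly stage. 
// Consecutive launches on the default stream run one after another, so each 
// stage sees the completed results of the previous one across the whole grid. 
for (int stage = 1; stage < llog; stage++) { 
    fftStageKernel<<<gridSize, BLOCK_SIZE>>>(dev_samples, length, stage, direction); 
} 
CU_CHECK(cudaDeviceSynchronize()); 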

0

I think you may need another __syncthreads(), here:

cuComplex c1 = dev_samples[bit_reverse(tid * 2) >> (32 - llog)]; 
cuComplex c2 = dev_samples[bit_reverse(tid * 2 + 1) >> (32 - llog)]; 

__syncthreads(); // <<< need this, otherwise possible race condition ? 

dev_samples[tid * 2] = cuCaddf(c1, c2); 
dev_samples[tid * 2 + 1] = cuCsubf(c1, c2); 
+0

That was an issue, thanks, but _the_ problem is still there – manu34

+1

I am guessing you may have other potential race conditions elsewhere in the code, but the access pattern is hard to follow, so it is hard to say for sure. –