2016-10-22 17 views
1

我正在使用 CUDA 7.5 的 cuFFT 來執行一些 FFT 和逆 FFT。使用 cufftExecC2R(.,.) 函數執行逆 FFT 時，我遇到了問題：使用 CUDA cuFFT 從複數轉換爲實數時，輸出結果是錯誤的。

其實,當我在cufftPlan1d(,)中使用batch_size = 1時,我得到了正確的結果。但是,當我增加批量大小時,結果不正確。

我正在粘貼一個示例代碼來說明這一點。我很快就創建了這個代碼,請忽略代碼的髒亂。

#include <cufft.h> 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <ctime> 
#include <iostream> 

// Host-side sample type: a float2 (x, y) pair. The buffers of this type are
// copied byte-for-byte into cufftComplex device buffers by iProcess().
typedef float2 Complex; 

// Demo driver, defined at the bottom of the file; called from main().
void iTest(int argc, char** argv); 

// Number of samples per transform (passed as the transform length to cufftPlan1d).
#define SIGNAL_SIZE 9 
// Number of transforms per plan (passed as the batch count to cufftPlan1d).
#define BATCH_SIZE 2 

int main(int argc, char** argv) {
    // Entry point: hand the command-line arguments straight to the demo driver.
    iTest(argc, argv);

    // The demo never signals failure through the exit status.
    return 0;
}

// Round-trip demo: copies SIGNAL_SIZE * BATCH_SIZE complex samples from `x` to
// the device, runs a batched forward C2C FFT, prints the spectrum, then runs a
// batched C2R transform on that spectrum and prints the result divided by
// SIGNAL_SIZE.
//
//   x - host buffer holding SIGNAL_SIZE * BATCH_SIZE Complex samples.
//   y - not referenced anywhere in this function.
//   n - not referenced anywhere in this function.
//
// NOTE(review): none of the cudaMalloc/cudaMemcpy return values are checked.
void iProcess(Complex *x, double *y, size_t n) { 

    // Upload the whole batched input signal to the device.
    cufftComplex *deviceData; 
    cudaMalloc(reinterpret_cast<void**>(&deviceData), 
       SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex)); 
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE, 
       cudaMemcpyHostToDevice); 

    // Plan BATCH_SIZE complex-to-complex transforms of length SIGNAL_SIZE.
    // NOTE(review): cufftStatus is a cufftResult but is compared against
    // cudaSuccess (a cudaError value); CUFFT_SUCCESS is presumably intended.
    cufftResult cufftStatus; 
    cufftHandle handle; 
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftPlan1d failed!"); 
    } 

    // Device buffer for the forward spectrum: SIGNAL_SIZE complex values per batch.
    cufftComplex *d_complex; 
    cudaMalloc(reinterpret_cast<void**>(&d_complex), 
       sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE); 

    // Forward transform, out of place (deviceData -> d_complex).
    // NOTE(review): the message below says "R2C" although the call is C2C;
    // execution continues even when the call fails.
    cufftStatus = cufftExecC2C(handle, deviceData, d_complex, CUFFT_FORWARD); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftExecR2C failed!"); 
    } 

    // Bring the spectrum back to the host so it can be printed.
    cufftComplex *hostOutputData = (cufftComplex*)malloc(
     (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex)); 

    cudaMemcpy(hostOutputData, d_complex, 
       SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE, 
       cudaMemcpyDeviceToHost); 

    // One line per bin: index, x component, y component.
    std::cout << "\nPrinting COMPLEX" << "\n"; 
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++) 
     printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y); 


    //! convert complex to real 

    // A second plan of type C2R with the same length and batch count.
    cufftHandle c2r_handle; 
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftPlan1d failed!"); 
    } 

    // NOTE(review): d_complex was filled by a C2C transform, i.e. SIGNAL_SIZE
    // complex values per batch. Per the cuFFT data-layout documentation a C2R
    // transform expects only the non-redundant half (SIGNAL_SIZE/2 + 1 values)
    // per batch, so the second and later batches are presumably read from the
    // wrong offset - confirm against the cuFFT docs. The status returned by
    // cufftExecC2R is stored but never inspected.
    cufftReal *d_odata; 
    cudaMalloc(reinterpret_cast<void**>(&d_odata), 
       sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE); 
    cufftStatus = cufftExecC2R(c2r_handle, d_complex, d_odata); 

    // Copy the real output back to a host stack array.
    cufftReal odata[SIGNAL_SIZE * BATCH_SIZE]; 
    cudaMemcpy(odata, d_odata, sizeof(cufftReal) * SIGNAL_SIZE * BATCH_SIZE, 
       cudaMemcpyDeviceToHost); 

    // Each value is divided by SIGNAL_SIZE before printing.
    std::cout << "\nPrinting REAL" << "\n"; 
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) { 
     std::cout << i << " \t" << odata[i]/(SIGNAL_SIZE) << "\n"; 
    } 


    // NOTE(review): only `handle` and `deviceData` are released here;
    // c2r_handle, d_complex, d_odata and hostOutputData are not released
    // in this function.
    cufftDestroy(handle); 
    cudaFree(deviceData); 
} 

// Demo driver: fills a host buffer with SIGNAL_SIZE * BATCH_SIZE samples whose
// x component is a pseudo-random value in [0, 1] and whose y component is 0,
// prints each x value, and passes the buffer to iProcess().
//
//   argc, argv - accepted for symmetry with main(); not used.
void iTest(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const unsigned int sampleCount = SIGNAL_SIZE * BATCH_SIZE;

    Complex* h_signal = reinterpret_cast<Complex*>(
        malloc(sizeof(Complex) * sampleCount));
    if (h_signal == NULL) {
        // Without the input buffer there is nothing to transform.
        fprintf(stderr, "iTest: failed to allocate the host signal buffer\n");
        return;
    }

    std::cout << "\nPrinting INPUT" << "\n";
    for (unsigned int i = 0; i < sampleCount; ++i) {
        h_signal[i].x = rand()/static_cast<float>(RAND_MAX);
        h_signal[i].y = 0;

        std::cout << i << "\t" << h_signal[i].x << "\n";
    }
    std::cout << "\n";

    // iProcess() takes an output array and a count it does not currently use;
    // zero-initialise the array so no indeterminate values are handed over.
    double y[SIGNAL_SIZE * BATCH_SIZE] = {0};
    iProcess(h_signal, y, 1);

    // The original never released the host signal buffer.
    free(h_signal);
}

我找不到我的代碼中的錯誤，也不知道自己缺少了哪些信息。（使用 BATCH_SIZE = 2 時的樣本輸出：image 2）

回答

5

（使用 BATCH_SIZE = 1 時的樣本輸出：Image 1）

你缺少的信息是：你沒有意識到，C2C 變換所期望的輸入數據格式與 C2R（或 R2C）變換所期望的輸入數據格式是不同的。

您應該首先閱讀 cuFFT 文檔中關於數據佈局的這一節（this section）。

注意,它說:

這些函數各自要求不同的輸入數據佈局

但你把對 C2C 變換而言正確的輸入數據，直接當作 C2R 變換的輸入數據傳了進去。這是行不通的。

依我看，最直接的解決方案是把所有工作都改用 C2C 變換類型。C2C 變換可以同時支持正向（例如「實數到複數」）和反向（例如「複數到實數」）。您正在使用的 C2R 變換類型同樣可以支持「複數到實數」，但是在其他條件完全相同的情況下，C2R 所使用的數據排列方式，與指定了反向路徑的 C2C 所使用的數據排列方式是不同的。你沒有考慮到這一點。

下面是您代碼的修改版本，作爲一個可運行的示例：它在正向和反向路徑上都使用 C2C，並且在批量大小爲 2 時能正確地還原輸入：

$ cat t19.cu 
#include <cufft.h> 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <ctime> 
#include <iostream> 

// Host-side sample type: a float2 (x, y) pair. Buffers of this type are copied
// unchanged into cufftComplex device buffers by iProcess().
typedef float2 Complex; 

// Demo driver, defined after iProcess(); called from main().
void iTest(int argc, char** argv); 

// Length of each individual transform (the nx argument of cufftPlan1d).
#define SIGNAL_SIZE 9 
// How many transforms one plan executes (the batch argument of cufftPlan1d).
#define BATCH_SIZE 2 

int main(int argc, char** argv) {
    // Run the round-trip FFT demo; the arguments are passed through untouched.
    iTest(argc, argv);

    // Exit status is always 0, whatever the demo printed.
    return 0;
}

void iProcess(Complex *x, double *y, size_t n) { 

    cufftComplex *deviceData; 
    cudaMalloc(reinterpret_cast<void**>(&deviceData), 
       SIGNAL_SIZE * BATCH_SIZE * sizeof(cufftComplex)); 
    cudaMemcpy(deviceData, x, SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE, 
       cudaMemcpyHostToDevice); 

    cufftResult cufftStatus; 
    cufftHandle handle; 
    cufftStatus = cufftPlan1d(&handle, SIGNAL_SIZE, CUFFT_C2C, BATCH_SIZE); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftPlan1d failed!"); 
    } 

    cufftComplex *d_complex; 
    cudaMalloc(reinterpret_cast<void**>(&d_complex), 
       sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE); 

    cufftStatus = cufftExecC2C(handle, deviceData, d_complex, CUFFT_FORWARD); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftExecR2C failed!"); 
    } 

    cufftComplex *hostOutputData = (cufftComplex*)malloc(
     (SIGNAL_SIZE) * BATCH_SIZE * sizeof(cufftComplex)); 

    cudaMemcpy(hostOutputData, d_complex, 
       SIGNAL_SIZE * sizeof(cufftComplex) * BATCH_SIZE, 
       cudaMemcpyDeviceToHost); 

    std::cout << "\nPrinting COMPLEX" << "\n"; 
    for (int j = 0; j < (SIGNAL_SIZE) * BATCH_SIZE; j++) 
     printf("%i \t %f \t %f\n", j, hostOutputData[j].x, hostOutputData[j].y); 


    //! convert complex to real 

/* cufftHandle c2r_handle; 
    cufftStatus = cufftPlan1d(&c2r_handle, SIGNAL_SIZE, CUFFT_C2R, BATCH_SIZE); 
    if (cufftStatus != cudaSuccess) { 
     printf("cufftPlan1d failed!"); 
    } 
*/ 
    cufftComplex *d_odata; 
    cudaMalloc(reinterpret_cast<void**>(&d_odata), 
       sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE); 
    cufftStatus = cufftExecC2C(handle, d_complex, d_odata, CUFFT_INVERSE); 

    cufftComplex odata[SIGNAL_SIZE * BATCH_SIZE]; 
    cudaMemcpy(odata, d_odata, sizeof(cufftComplex) * SIGNAL_SIZE * BATCH_SIZE, 
       cudaMemcpyDeviceToHost); 

    std::cout << "\nPrinting REAL" << "\n"; 
    for (int i = 0; i < SIGNAL_SIZE * BATCH_SIZE; i++) { 
     std::cout << i << " \t" << odata[i].x/(SIGNAL_SIZE) << "\n"; 
    } 


    cufftDestroy(handle); 
    cudaFree(deviceData); 
} 

// Demo driver: builds SIGNAL_SIZE * BATCH_SIZE input samples (x component is a
// pseudo-random value in [0, 1], y component is 0), prints every x value, and
// runs iProcess() on the buffer.
//
//   argc, argv - not used; present to mirror main()'s signature.
void iTest(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const unsigned int total = SIGNAL_SIZE * BATCH_SIZE;

    Complex* h_signal = reinterpret_cast<Complex*>(
        malloc(sizeof(Complex) * total));
    if (h_signal == NULL) {
        // Nothing can be transformed without the input buffer.
        fprintf(stderr, "iTest: could not allocate the host signal buffer\n");
        return;
    }

    std::cout << "\nPrinting INPUT" << "\n";
    for (unsigned int i = 0; i < total; ++i) {
        h_signal[i].x = rand()/static_cast<float>(RAND_MAX);
        h_signal[i].y = 0;

        std::cout << i << "\t" << h_signal[i].x << "\n";
    }
    std::cout << "\n";

    // iProcess() does not use this array or the count, but its signature
    // requires them; zero-fill so no indeterminate values are passed along.
    double y[SIGNAL_SIZE * BATCH_SIZE] = {0};
    iProcess(h_signal, y, 1);

    // Release the host buffer, which the original leaked.
    free(h_signal);
}
$ nvcc -arch=sm_61 -o t19 t19.cu -lcufft 
t19.cu: In function ‘void iProcess(Complex*, double*, size_t)’: 
t19.cu:34:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare] 
    if (cufftStatus != cudaSuccess) { 
           ^
t19.cu:43:32: warning: comparison between ‘cufftResult {aka enum cufftResult_t}’ and ‘enum cudaError’ [-Wenum-compare] 
    if (cufftStatus != cudaSuccess) { 
           ^
$ cuda-memcheck ./t19 
========= CUDA-MEMCHECK 

Printing INPUT 
0  0.840188 
1  0.394383 
2  0.783099 
3  0.79844 
4  0.911647 
5  0.197551 
6  0.335223 
7  0.76823 
8  0.277775 
9  0.55397 
10  0.477397 
11  0.628871 
12  0.364784 
13  0.513401 
14  0.95223 
15  0.916195 
16  0.635712 
17  0.717297 


Printing COMPLEX 
0  5.306536  0.000000 
1  0.015338  -0.734991 
2  -0.218001  0.740248 
3  0.307508  -0.706533 
4  1.022732  0.271765 
5  1.022732  -0.271765 
6  0.307508  0.706533 
7  -0.218001  -0.740248 
8  0.015338  0.734991 
9  5.759857  0.000000 
10  -0.328981  0.788566 
11  0.055356  -0.521014 
12  -0.127504  0.581872 
13  0.014066  0.123027 
14  0.014066  -0.123027 
15  -0.127504  -0.581872 
16  0.055356  0.521014 
17  -0.328981  -0.788566 

Printing REAL 
0  0.840188 
1  0.394383 
2  0.783099 
3  0.79844 
4  0.911647 
5  0.197551 
6  0.335223 
7  0.76823 
8  0.277775 
9  0.55397 
10  0.477397 
11  0.628871 
12  0.364784 
13  0.513401 
14  0.95223 
15  0.916195 
16  0.635712 
17  0.717297 
========= ERROR SUMMARY: 0 errors 
$ 
+0

我已經測試過了，它運行得很完美。謝謝！我認爲徹底閱讀你提供的文檔鏈接非常有用。我建議遇到同樣問題的人先去閱讀它。 –

相關問題