編輯 - 代碼塊包含數據。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
#include <cuda.h>
typedef enum signaltype {REAL, COMPLEX} signal;
typedef float2 Complex;
printData(Complex *a, int size, char *msg) {
if (msg == "") printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
normData(Complex *a, int size, float norm) {
for (int i = 0; i < size; i++) {
a[i].x /= norm;
a[i].y /= norm;
// flag = 1 for real signals.
randomFill(Complex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand()/(float) RAND_MAX;
h_signal[i].y = 0;
// FFT a signal that's on the _DEVICE_.
signalFFT(Complex *d_signal, int signal_size) {
// Handle type used to store and execute CUFFT plans.
// Essentially allocates the resouecwes and sort of interns
// them.
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan FFT\n");
// Execute the plan.
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS) {
printf ("Failed Executing FFT\n");
// Reverse of the signalFFT(.) function.
signalIFFT(Complex *d_signal, int signal_size) {
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan IFFT\n");
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS) {
printf ("Failed Executing FFT\n");
// Pointwise Multiplication Kernel.
__global__ void
pwProd(Complex *signal1, int size1, Complex *signal2, int size2) {
int threadsPerBlock, blockId, globalIdx;
threadsPerBlock = blockDim.x * blockDim.y;
blockId = blockIdx.x + (blockIdx.y * gridDim.x);
globalIdx = (blockId * threadsPerBlock) + threadIdx.x + (threadIdx.y * blockDim.x);
if (globalIdx <= size1) {
signal1[globalIdx].x = (signal1[globalIdx].x * signal2[globalIdx].x - signal1[globalIdx].y * signal2[globalIdx].y);
signal1[globalIdx].y = (signal1[globalIdx].x * signal2[globalIdx].y + signal1[globalIdx].y * signal2[globalIdx].x);
cudaConvolution(Complex *d_signal1, int size1, Complex *d_signal2,
int size2, dim3 blockSize, dim3 gridSize) {
signalFFT(d_signal1, size1);
signalFFT(d_signal2, size2);
pwProd<<<gridSize, blockSize>>>(d_signal1, size1, d_signal2, size2);
//signalIFFT(d_signal1, size1);
int allocateAndPad(Complex **a, int s1, Complex **b, int s2) {
int oldsize, newsize, i;
newsize = s1 + s2 - 1;
while (!((newsize != 0) && !(newsize & (newsize - 1)))) {
oldsize = s1;
*a = (Complex *) malloc(sizeof(Complex) * newsize);
for (i = oldsize; i < newsize; i++) {
(*a)[i].x = 0;
(*a)[i].y = 0;
oldsize = s2;
*b = (Complex *) malloc(sizeof(Complex) * newsize);
for (i = oldsize; i < newsize; i++) {
(*b)[i].x = 0;
(*b)[i].y = 0;
return newsize;
int main()
Complex *h1, *h2, *d1, *d2;
int s1, s2, newsize, i, dim;
int deviceCount;
cudaError_t e = cudaGetDeviceCount(&deviceCount);
if (e != cudaSuccess) {
return -1;
dim = 1;
s1 = 16;
s2 = 16;
for (i = 0; i < dim; i++) {
newsize = allocateAndPad(&h1, s1, &h2, s2);
/*h1 = (Complex *) malloc(sizeof(Complex) * s1);
h2 = (Complex *) malloc(sizeof(Complex) * s2);
newsize = 16;*/
randomFill(h1, s1, REAL);
randomFill(h2, s2, REAL);
// Kernel Block and Grid Size.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize(newsize/16 + 1, newsize/16 + 1, 1);
printData(h1, newsize, "H Signal 1");
printData(h2, newsize, "H Signal 2");
cudaMalloc(&d1, sizeof(Complex) * newsize);
cudaMalloc(&d2, sizeof(Complex) * newsize);
cudaMemcpy(d1, h1, sizeof(Complex) * newsize, cudaMemcpyHostToDevice);
cudaMemcpy(d2, h2, sizeof(Complex) * newsize, cudaMemcpyHostToDevice);
cudaConvolution(d1, newsize, d2, newsize, blockSize, gridSize);
// Explicit code run below,
/*signalFFT(d1, newsize);
cudaMemcpy(h1, d1, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost);
printData(h1, newsize, "1 FFT");
cudaMemcpy(d1, h1, sizeof(Complex) * newsize, cudaMemcpyHostToDevice);
signalFFT(d2, newsize);
cudaMemcpy(h2, d2, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost);
printData(h2, newsize, "2 FFT");
cudaMemcpy(d2, h2, sizeof(Complex) * newsize, cudaMemcpyHostToDevice);
pwProd<<<gridSize, blockSize>>>(d1, newsize, d2, newsize);
signalIFFT(d1, newsize);*/
cudaMemcpy(h1, d1, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost);
//normData(h1, newsize, newsize);
printData(h1, newsize, "PwProd");
free(h1); free(h2);
cudaFree(d1); cudaFree(d2);
return 0;
EDIT: Required Output Data
0.840188 0.000000
0.394383 0.000000
0.783099 0.000000
0.798440 0.000000
0.911647 0.000000
0.197551 0.000000
0.335223 0.000000
0.768230 0.000000
0.277775 0.000000
0.553970 0.000000
0.477397 0.000000
0.628871 0.000000
0.364784 0.000000
0.513401 0.000000
0.952230 0.000000
0.916195 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000 H Signal 2
0.635712 0.000000
0.717297 0.000000
0.141603 0.000000
0.606969 0.000000
0.016301 0.000000
0.242887 0.000000
0.137232 0.000000
0.804177 0.000000
0.156679 0.000000
0.400944 0.000000
0.129790 0.000000
0.108809 0.000000
0.998924 0.000000
0.218257 0.000000
0.512932 0.000000
0.839112 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000
0.000000 0.000000 PwProd
64.765198 0.000000
-20.097927 72.754028
1.797580 1.074046
-5.184547 7.412243
0.148326 0.121253
-3.457163 3.253345
0.834668 -0.752979
-0.414450 0.328347
-1.268492 0.297919
1.634082 -2.054814
0.542893 0.087469
0.244198 -1.392576
0.680159 -0.110084
0.938037 1.743742
1.318125 -2.269666
-1.448638 1.534995
-0.207152 -0.000000
-1.448638 -1.534995
1.318125 2.269666
0.938037 -1.743742
0.680159 0.110084
0.244198 1.392576
0.542893 -0.087469
1.634082 2.054814
-1.268492 -0.297919
-0.414450 -0.328347
0.834668 0.752980
-3.457164 -3.253347
0.148326 -0.121253
-5.184546 -7.412243
1.797580 -1.074046
-20.097923 -72.754013
我想你沒有注意[當我建議](http://stackoverflow.com/questions/16781653/cuda-inverse-fft-bug)你使用'cufftComplex',而且還建議你發佈實際數據以及您期望的數據。沒有人知道你的意思是「奇/怪」。 –
對不起。我現在添加了這些。 –
問題是,5.0工具包附帶的cuda示例似乎使用Complex沒有任何問題。我懷疑這是否是問題,但是一旦我的代碼正常運行,我就會改變它。 –