我在 CUDA 3 下一直可以成功使用 CUFFT 庫,但相同的代碼在 CUDA 4 下無法運行。使用 CUDA 4 時,執行 FFT 會得到一個運行時錯誤(CUDA_INVALID_VALUE)。這是一個前向的實數到複數(real-to-complex)一維變換。對比 CUDA 3 和 CUDA 4 的 CUFFT 文檔,我看到的唯一差異是新增了 FFTW 兼容性模式;我已將其設置爲原生(native)模式。問題標題:CUFFT 庫在 CUDA 3 下可以正常工作,但在 CUDA 4 下給出「無效值」(invalid value)運行時錯誤。
/*
 * Release the per-call device buffers and abort the MEX call with `msg`.
 *
 * mexErrMsgTxt() does not return to its caller, so any device memory that is
 * still owned by this call has to be freed before it is invoked. cudaFree()
 * accepts NULL, so buffers that were never allocated may be passed as NULL.
 */
static void fftcudaFail(float *dReal, cufftComplex *dComplex, const char *msg)
{
    cudaFree((void *)dReal);
    cudaFree((void *)dComplex);
    mexErrMsgTxt(msg);
}

/*
 * MEX entry point: window a real signal on the GPU, run a forward
 * real-to-complex 1-D FFT with an existing CUFFT plan, take the magnitude of
 * bins n1..n2 (1-based) and return them as a 1 x (n2-n1+1) single row vector.
 *
 * Inputs (prhs), 12 required:
 *   [0]  Nfft       transform length (number of real samples)
 *   [1]  blockSize  threads per block for the helper kernels
 *   [2]  Navg       accepted for interface compatibility, not used here
 *   [3]  iAvg       accepted for interface compatibility, not used here
 *   [4]  U          accepted for interface compatibility, not used here
 *   [5]  n1         first output bin (1-based)
 *   [6]  n2         last output bin (1-based, inclusive)
 *   [7]  hPlan      CUFFT plan handle, passed as a numeric scalar
 *   [8]  hReal      host input, single precision, at least Nfft elements
 *   [9]  window     address of the GPU window array, passed as a numeric scalar
 *   [10] avg        accepted for interface compatibility, not used here
 *   [11] sum        accepted for interface compatibility, not used here
 *
 * Output (plhs), exactly 1:
 *   [0]  pPxx       1 x (n2-n1+1) single-precision array
 *
 * Errors are reported through mexErrMsgTxt(); device buffers allocated by
 * this call are released before the error is raised.
 */
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
    int Nfft, N, n1, n2, Npsd;
    int nBlocks, blockSize;
    mwSize dims[2];              /* mxCreateNumericArray expects mwSize, not int */
    float *hReal;
    float *pPxx;
    float *dReal = NULL;
    float *dWindow;
    cufftHandle hPlan;
    cufftComplex *dComplex = NULL;
    cufftResult result;

    /* Validate the call signature before touching the GPU so that a bad call
       cannot leave device memory allocated. */
    if (nrhs != 12)
        mexErrMsgTxt("fftcuda: Function requires 12 inputs");
    if (nlhs != 1)
        mexErrMsgTxt("fftcuda: Function requires 1 output: float pPxx");

    Nfft      = (int)mxGetScalar(prhs[0]);
    blockSize = (int)mxGetScalar(prhs[1]);
    n1        = (int)mxGetScalar(prhs[5]);
    n2        = (int)mxGetScalar(prhs[6]);
    hPlan     = (cufftHandle)mxGetScalar(prhs[7]);
    hReal     = (float *)mxGetData(prhs[8]);
    /* Pointer to the GPU window array. Go through size_t rather than long:
       long is only 32 bits wide on 64-bit Windows and would truncate the
       address. */
    dWindow   = (float *)(size_t)mxGetScalar(prhs[9]);

    if (Nfft < 1 || blockSize < 1)
        mexErrMsgTxt("fftcuda: Nfft and blockSize must be positive");

    /* A real-to-complex transform of length Nfft yields Nfft/2 + 1 bins. */
    N = Nfft/2 + 1;

    /* &dComplex[n1-1] and the Npsd-element read below must stay inside the
       N-element complex buffer. */
    if (n1 < 1 || n2 < n1 || n2 > N)
        mexErrMsgTxt("fftcuda: n1 and n2 must satisfy 1 <= n1 <= n2 <= Nfft/2+1");
    Npsd = n2 - n1 + 1;

    /* hReal is read as Nfft floats by cudaMemcpy below. */
    if (!mxIsSingle(prhs[8]) || mxGetNumberOfElements(prhs[8]) < (size_t)Nfft)
        mexErrMsgTxt("fftcuda: input data must be a single array with at least Nfft elements");

    /* Create the output first: if MATLAB cannot allocate it, the MEX call is
       terminated and nothing has been allocated on the device yet. */
    dims[0] = 1;
    dims[1] = (mwSize)Npsd;
    plhs[0] = mxCreateNumericArray(2, dims, mxSINGLE_CLASS, mxREAL);
    pPxx = (float *)mxGetData(plhs[0]);

    /* Allocate working arrays on device */
    if (cudaMalloc((void **)&dReal, sizeof(float) * Nfft) != cudaSuccess)
        fftcudaFail(NULL, NULL, "fftcuda: cudaMalloc failed for the real working array");
    if (cudaMalloc((void **)&dComplex, sizeof(cufftComplex) * N) != cudaSuccess)
        fftcudaFail(dReal, NULL, "fftcuda: cudaMalloc failed for the complex working array");

    /* Copy input array to the device */
    if (cudaMemcpy((void *)dReal, (void *)hReal, sizeof(float) * Nfft,
                   cudaMemcpyHostToDevice) != cudaSuccess)
        fftcudaFail(dReal, dComplex, "fftcuda: host-to-device copy failed");

    /* Multiply input array by window. Ceiling division so the tail is covered
       when Nfft is not a multiple of blockSize.
       NOTE(review): assumes cudaMult ignores thread indices >= its length
       argument - confirm against the kernel definition. */
    nBlocks = (Nfft + blockSize - 1) / blockSize;
    cudaMult <<< nBlocks, blockSize >>> (dReal, dWindow, dReal, Nfft);
    if (cudaGetLastError() != cudaSuccess)
        fftcudaFail(dReal, dComplex, "fftcuda: cudaMult kernel launch failed");

    /* Execute FFT on device */
    result = cufftExecR2C(hPlan, (cufftReal *)dReal, dComplex);
    if (result != CUFFT_SUCCESS)
    {
        const char *msg;
        switch (result)
        {
        case CUFFT_SETUP_FAILED:
            msg = "CUFFT library failed to initialize.";
            break;
        case CUFFT_INVALID_PLAN:
            msg = "The hPlan parameter is not a valid handle.";
            break;
        case CUFFT_INVALID_VALUE:
            msg = "The idata or odata parameter is not valid.";
            break;
        case CUFFT_EXEC_FAILED:
            msg = "CUFFT failed to execute the transform on GPU.";
            break;
        default:
            /* Any other status is still a failure and must not be ignored. */
            msg = "fftcuda: cufftExecR2C returned an unexpected error.";
            break;
        }
        fftcudaFail(dReal, dComplex, msg);
    }

    /* Compute absolute value of bins n1..n2 into the start of dReal
       (Npsd <= N <= Nfft, so dReal is large enough).
       NOTE(review): assumes cudaAbs ignores thread indices >= its length
       argument - the original launch already over-provisioned blocks. */
    nBlocks = (Npsd + blockSize - 1) / blockSize;
    cudaAbs <<< nBlocks, blockSize >>> (&dComplex[n1 - 1], dReal, Npsd);
    if (cudaGetLastError() != cudaSuccess)
        fftcudaFail(dReal, dComplex, "fftcuda: cudaAbs kernel launch failed");

    /* Copy result back to host; this blocking copy also reports errors raised
       while the kernels were executing. */
    if (cudaMemcpy((void *)pPxx, (void *)dReal, sizeof(float) * Npsd,
                   cudaMemcpyDeviceToHost) != cudaSuccess)
        fftcudaFail(dReal, dComplex, "fftcuda: device-to-host copy failed");

    /* Free working arrays from gpu memory */
    cudaFree((void *)dReal);
    cudaFree((void *)dComplex);
}
你可能應該提一下,這一切都是在 Matlab 的 MEX 函數中進行的…… – talonmies