1D FFT加內核計算與managedCUDA

我想做FFT加內核計算。 FFT：managedCUDA庫內核計算：自己的內核1D FFT加內核計算與managedCUDA

C＃代碼

public void cuFFTreconstruct() { 
       CudaContext ctx = new CudaContext(0); 
       CudaKernel cuKernel = ctx.LoadKernel("kernel_Array.ptx", "cu_ArrayInversion"); 

       float[] fData = new float[Resolution * Resolution * 2]; 
       float[] result = new float[Resolution * Resolution * 2]; 
       CudaDeviceVariable<float> devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2); 
       CudaDeviceVariable<float> copy_devData = new CudaDeviceVariable<float>(Resolution * Resolution * 2); 

       int i, j; 
       Random rnd = new Random(); 
       double avrg = 0.0; 

       for (i = 0; i < Resolution; i++) 
       { 
        for (j = 0; j < Resolution; j++) 
        { 
         fData[(i * Resolution + j) * 2] = i + j * 2; 
         fData[(i * Resolution + j) * 2 + 1] = 0.0f; 
        } 
       } 

       devData.CopyToDevice(fData); 

       CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution * 2, cufftType.C2C, Resolution * 2); 
       plan1D.Exec(devData.DevicePointer, TransformDirection.Forward); 

       cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution/256, Resolution, 1); 
       cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1); 

       cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution); 

       devData.CopyToHost(result); 

       for (i = 0; i < Resolution; i++) 
       { 
        for (j = 0; j < Resolution; j++) 
        { 
         ResultData[i, j, 0] = result[(i * Resolution + j) * 2]; 
         ResultData[i, j, 1] = result[(i * Resolution + j) * 2 + 1]; 
        } 
       } 
       ctx.FreeMemory(devData.DevicePointer); 
       ctx.FreeMemory(copy_devData.DevicePointer); 
      }

內核代碼

//Includes for IntelliSense 
    #define _SIZE_T_DEFINED 
    #ifndef __CUDACC__ 
    #define __CUDACC__ 
    #endif 
    #ifndef __cplusplus 
    #define __cplusplus 
    #endif 


    #include <cuda.h> 
    #include <device_launch_parameters.h> 
    #include <texture_fetch_functions.h> 
    #include "float.h" 
    #include <builtin_types.h> 
    #include <vector_functions.h> 

    // Texture reference 
    texture<float2, 2> texref; 

    extern "C" 
    { 
     __global__ void cu_ArrayInversion(float* data_A, float* data_B, int Resolution) 
     { 
      int image_x = blockIdx.x * blockDim.x + threadIdx.x; 
      int image_y = blockIdx.y; 

      data_B[(Resolution * image_x + image_y) * 2] = data_A[(Resolution * image_y + image_x) * 2]; 
      data_B[(Resolution * image_x + image_y) * 2 + 1] = data_A[(Resolution * image_y + image_x) * 2 + 1]; 
     } 
    }

然而這一方案不能很好地工作。發生了以下錯誤：

ErrorLaunchFailed：設備在執行內核時發生異常。常見原因包括解除引用無效設備指針和訪問超出共享內存的限制。上下文不能使用，所以它必須被銷燬（並且應該創建一個新的）。如果程序要繼續使用CUDA，則此上下文中的所有現有設備內存分配均無效，並且必須進行重構。

來源

2015-11-13 test

謝謝你的消息。

主機代碼

using System; 
using System.Collections.Generic; 
using System.ComponentModel; 
using System.Data; 
using System.Drawing; 
using System.Linq; 
using System.Text; 
using System.Threading.Tasks; 
using System.Windows.Forms; 
using System.Drawing.Imaging; 
using ManagedCuda; 
using ManagedCuda.CudaFFT; 
using ManagedCuda.VectorTypes; 


namespace WFA_CUDA_FFT 
{ 
    public partial class CuFFTMain : Form 
    { 
     float[, ,] FFTData2D; 
     int Resolution; 

     const int cuda_blockNum = 256; 

     public CuFFTMain() 
     { 
      InitializeComponent(); 
      Resolution = 1024; 
     } 

     private void button1_Click(object sender, EventArgs e) 
     { 
      cuFFTreconstruct(); 
     } 
     public void cuFFTreconstruct() 
     { 
      CudaContext ctx = new CudaContext(0); 
      ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx"); 
      CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx); 
      float2[] fData = new float2[Resolution * Resolution]; 
      float2[] result = new float2[Resolution * Resolution]; 
      FFTData2D = new float[Resolution, Resolution, 2]; 
      CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution); 
      CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution); 

      int i, j; 
      Random rnd = new Random(); 
      double avrg = 0.0; 

      for (i = 0; i < Resolution; i++) 
      { 
       for (j = 0; j < Resolution; j++) 
       { 
        fData[i * Resolution + j].x = i + j * 2; 
        avrg += fData[i * Resolution + j].x; 
        fData[i * Resolution + j].y = 0.0f; 
       } 
      } 

      avrg = avrg/(double)(Resolution * Resolution); 

      for (i = 0; i < Resolution; i++) 
      { 
       for (j = 0; j < Resolution; j++) 
       { 
        fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg; 
       } 
      } 

      devData.CopyToDevice(fData); 

      CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution); 
      plan1D.Exec(devData.DevicePointer, TransformDirection.Forward); 

      cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution/cuda_blockNum, Resolution, 1); 
      cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1); 

      cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution); 

      copy_devData.CopyToHost(result); 

      for (i = 0; i < Resolution; i++) 
      { 
       for (j = 0; j < Resolution; j++) 
       { 
        FFTData2D[i, j, 0] = result[i * Resolution + j].x; 
        FFTData2D[i, j, 1] = result[i * Resolution + j].y; 
       } 
      } 

      //Clean up 
      devData.Dispose(); 
      copy_devData.Dispose(); 
      plan1D.Dispose(); 
      CudaContext.ProfilerStop(); 
      ctx.Dispose(); 
     } 
    } 
}

內核代碼

//Includes for IntelliSense 
#define _SIZE_T_DEFINED 
#ifndef __CUDACC__ 
#define __CUDACC__ 
#endif 
#ifndef __cplusplus 
#define __cplusplus 
#endif 


#include <cuda.h> 
#include <device_launch_parameters.h> 
#include <texture_fetch_functions.h> 
#include "float.h" 
#include <builtin_types.h> 
#include <vector_functions.h> 
#include <vector> 

// Texture reference 
texture<float2, 2> texref; 

extern "C" 
{ 
    // Device code 

    __global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution) 
    { 
     int image_x = blockIdx.x * blockDim.x + threadIdx.x; 
     int image_y = blockIdx.y; 

     data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].x; 
     data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].y; 
    } 
}

首先我.Net4.5編譯。此程序無法正常工作，並顯示錯誤（System.BadImageFormatException）。但是，當FFT函數註釋掉時，內核程序會運行。

第二我從.Net 4.5轉向.NET 4.0。 FFT函數可以工作，但內核不會運行並顯示錯誤。

我的電腦是Windows 8.1 Pro和我使用Visual Studio 2013年

來源

2015-11-19 14:30:51 test

FFT計劃將元素的數量（即複數的數量）作爲參數。因此，請在計劃的構造函數的第一個參數中刪除* 2。而批次數的兩倍也沒有意義...

此外，我會使用float2或cuFloatComplex類型（在ManagedCuda.VectorTypes）來表示複數而不是兩個原始浮點數。爲了釋放內存，請使用CudaDeviceVariable的Dispose方法。否則，它稍後將在GC內部調用。

主機代碼將然後是這個樣子：

int Resolution = 512; 
CudaContext ctx = new CudaContext(0); 
CudaKernel cuKernel = ctx.LoadKernel("kernel.ptx", "cu_ArrayInversion"); 

//float2 or cuFloatComplex 
float2[] fData = new float2[Resolution * Resolution]; 
float2[] result = new float2[Resolution * Resolution]; 
CudaDeviceVariable<float2> devData = new CudaDeviceVariable<float2>(Resolution * Resolution); 
CudaDeviceVariable<float2> copy_devData = new CudaDeviceVariable<float2>(Resolution * Resolution); 

int i, j; 
Random rnd = new Random(); 
double avrg = 0.0; 

for (i = 0; i < Resolution; i++) 
{ 
for (j = 0; j < Resolution; j++) 
{ 
    fData[(i * Resolution + j)].x = i + j * 2; 
    fData[(i * Resolution + j)].y = 0.0f; 
} 
} 

devData.CopyToDevice(fData); 

//Only Resolution times in X and Resolution batches 
CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution); 
plan1D.Exec(devData.DevicePointer, TransformDirection.Forward); 

cuKernel.GridDimensions = new ManagedCuda.VectorTypes.dim3(Resolution/256, Resolution, 1); 
cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(256, 1, 1); 

cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution); 

devData.CopyToHost(result); 

for (i = 0; i < Resolution; i++) 
{ 
    for (j = 0; j < Resolution; j++) 
    { 
     //ResultData[i, j, 0] = result[(i * Resolution + j)].x; 
     //ResultData[i, j, 1] = result[(i * Resolution + j)].y; 
    } 
} 

//And better free memory using Dispose() 
//ctx.FreeMemory is only meant for raw device pointers obtained from somewhere else... 
devData.Dispose(); 
copy_devData.Dispose(); 
plan1D.Dispose(); 
//For Cuda Memory checker and profiler: 
CudaContext.ProfilerStop(); 
ctx.Dispose();

來源

2015-11-13 23:06:30 kunzmi

也請發表您的更新主機代碼或再次檢查是否符合上面的代碼。如果我讓這兩個內核與我在這裏發佈的主機代碼一起運行，那麼一切運行良好。 Cuda內存檢查器找不到任何內容並且沒有錯誤信息。 – kunzmi

謝謝你的評論。我檢查了我的代碼。但是我找不到錯誤。我的程序是通過參考網站：（安裝程序：https：//managedcuda.codeplex.com/documentatio，dll：https：//github.com/kunzmi/managedCuda，示例代碼：https：//github.com/ kunzmi/managedCuda）。當我使用managedCUDA製作2D cuda和1D cuda時，這些程序運行良好，我可以獲得良好的FFT結果。 – test

請準確地發佈您的主機代碼，因爲您可以重現您的問題。再說一遍：我上面發佈的主機代碼與內核一起運行沒有問題，所以一定有區別。 – kunzmi

謝謝你這個建議。

我嘗試了建議的代碼。但是，錯誤仍然存在。（錯誤：ErrorLaunchFailed：執行內核時在設備上發生異常，常見原因包括解引用無效的設備指針和訪問超出界限的共享內存。上下文無法使用，因此必須銷燬（並且新的應該被創建）。從該上下文中的所有現有的設備的內存分配無效，並且如果節目是繼續使用CUDA必須被重建。）

要使用FLOAT2，我改變Cu的代碼如下

extern "C" 
{ 

__global__ void cu_ArrayInversion(float2* data_A, float2* data_B, int Resolution) 
    { 
    int image_x = blockIdx.x * blockDim.x + threadIdx.x; 
    int image_y = blockIdx.y; 

    data_B[(Resolution * image_x + image_y)].x = data_A[(Resolution * image_y + image_x)].x; 
    data_B[(Resolution * image_x + image_y)].y = data_A[(Resolution * image_y + image_x)].y; 
}

當程序執行「cuKernel.Run」時，進程停止。

PTX文件

.version 4.3 
.target sm_20 
.address_size 32 

    // .globl cu_ArrayInversion 
.global .texref texref; 

.visible .entry cu_ArrayInversion(
    .param .u32 cu_ArrayInversion_param_0, 
    .param .u32 cu_ArrayInversion_param_1, 
    .param .u32 cu_ArrayInversion_param_2 
) 
{ 
    .reg .f32 %f<5>; 
    .reg .b32 %r<17>; 


    ld.param.u32 %r1, [cu_ArrayInversion_param_0]; 
    ld.param.u32 %r2, [cu_ArrayInversion_param_1]; 
    ld.param.u32 %r3, [cu_ArrayInversion_param_2]; 
    cvta.to.global.u32 %r4, %r2; 
    cvta.to.global.u32 %r5, %r1; 
    mov.u32  %r6, %ctaid.x; 
    mov.u32  %r7, %ntid.x; 
    mov.u32  %r8, %tid.x; 
    mad.lo.s32 %r9, %r7, %r6, %r8; 
    mov.u32  %r10, %ctaid.y; 
    mad.lo.s32 %r11, %r10, %r3, %r9; 
    shl.b32  %r12, %r11, 3; 
    add.s32  %r13, %r5, %r12; 
    mad.lo.s32 %r14, %r9, %r3, %r10; 
    shl.b32  %r15, %r14, 3; 
    add.s32  %r16, %r4, %r15; 
    ld.global.v2.f32 {%f1, %f2}, [%r13]; 
    st.global.v2.f32 [%r16], {%f1, %f2}; 
    ret; 
}

來源

2015-11-17 14:08:26 test

1D FFT加內核計算與managedCUDA

回答

相關問題