2013-11-27 75 views
0

我無法調試爲其設置斷點的「全球」函數行。我使用NSight菜單中的「啓動CUDA調試」選項進行調試。使用NSight Visual Studio 2010進行CUDA調試

我NSight插件安裝成功了VS 2010,我能夠調試我的其他項目(示例項目NSight調試器中傳來)

我的代碼是在這裏(這是一個有點長,但一般重複同樣的功能) :

#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include "device_launch_parameters.h" 
#include <cuda_runtime.h> 
#include <cufft.h> 
#include <helper_cuda.h> 
#include "book.h" 

#define N (131072) 

__global__ void conjugate(float2 *a) { 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N) { 
     a[idx] = cuConjf(a[idx]); 
    } 
} 

__global__ void multWithReference(float2 *signal, float2 *reference) { 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    if (idx < N) { 
     signal[idx].x = signal[idx].x * reference[idx].x; 
     signal[idx].y = signal[idx].y * reference[idx].y; 
    } 
} 

__global__ void shift(float2 *signal, size_t shiftamount, float2* shifted) { 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    *(shifted+((idx+shiftamount)%131072)) = *(signal+idx); 
} 

__global__ void fftshift(float2 *u_d) 
{ 
    int i = blockDim.x * blockIdx.x + threadIdx.x; 

    if(i < 131072) 
    { 
     double a = 1-2*(i&1); 
     u_d[i].x *= a; 
     u_d[i].y *= a; 
    } 
} 


static inline cufftHandle createFFTPlan(cudaStream_t* stream) 
{ 
    cufftHandle plan; 

    if (cudaGetLastError() != cudaSuccess){ 
     fprintf(stderr, "Cuda error: Failed to allocate\n"); 
    } 
    if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){ 
     fprintf(stderr, "CUFFT error: Plan creation failed"); 
    } 
    if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){ 
     fprintf(stderr, "CUFFT error: Plan stream association failed"); 
    } 

    return plan; 
} 

int main(void) { 

    cudaDeviceProp prop; 
    int whichDevice; 
    HANDLE_ERROR(cudaGetDevice(&whichDevice)); 
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice)); 
    if (!prop.deviceOverlap) { 
     printf("Device will not handle overlaps, so no speed up from streams\n"); 
     return 0; 
    } 

    cudaEvent_t  start, stop; 
    float   elapsedTime; 

    cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7; 
    float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7; 
    float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7; 

    // start the timers 
    HANDLE_ERROR(cudaEventCreate(&start)); 
    HANDLE_ERROR(cudaEventCreate(&stop)); 

    // initialize the streams 
    HANDLE_ERROR(cudaStreamCreate(&stream0)); 
    HANDLE_ERROR(cudaStreamCreate(&stream1)); 
    HANDLE_ERROR(cudaStreamCreate(&stream2)); 
    HANDLE_ERROR(cudaStreamCreate(&stream3)); 
    HANDLE_ERROR(cudaStreamCreate(&stream4)); 
    HANDLE_ERROR(cudaStreamCreate(&stream5)); 
    HANDLE_ERROR(cudaStreamCreate(&stream6)); 
    HANDLE_ERROR(cudaStreamCreate(&stream7)); 

    // allocate the memory on the GPU 
    HANDLE_ERROR(cudaMalloc((void**)&dev_ref, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_0, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_1, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_2, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_3, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_4, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_5, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_6, 
           N * sizeof(float2))); 
    HANDLE_ERROR(cudaMalloc((void**)&dev_7, 
           N * sizeof(float2))); 

    // allocate host locked memory, used to stream 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_ref, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_0, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_1, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_2, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_3, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_4, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_5, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_6, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaHostAlloc((void**)&host_7, 
           N * sizeof(float2), 
           cudaHostAllocDefault)); 

    // Open signal file 
    FILE *fp; 
    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_ref, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_0, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_1, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_2, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_3, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_4, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_5, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_6, sizeof(float2), 131072, fp); 
    fclose(fp); 

    if(NULL == (fp = fopen("testSignal4.bin","r"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 
    fread(host_7, sizeof(float2), 131072, fp); 
    fclose(fp); 

    // create FFT plans 
    cufftHandle plan0 = createFFTPlan(&stream0); 
    cufftHandle plan1 = createFFTPlan(&stream1); 
    cufftHandle plan2 = createFFTPlan(&stream2); 
    cufftHandle plan3 = createFFTPlan(&stream3); 
    cufftHandle plan4 = createFFTPlan(&stream4); 
    cufftHandle plan5 = createFFTPlan(&stream5); 
    cufftHandle plan6 = createFFTPlan(&stream6); 
    cufftHandle plan7 = createFFTPlan(&stream7); 

    float2* shifted0; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted0, 
          N * sizeof(float2))); 
    float2* shifted1; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted1, 
          N * sizeof(float2))); 
    float2* shifted2; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted2, 
          N * sizeof(float2))); 
    float2* shifted3; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted3, 
          N * sizeof(float2))); 
    float2* shifted4; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted4, 
          N * sizeof(float2))); 
    float2* shifted5; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted5, 
          N * sizeof(float2))); 
    float2* shifted6; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted6, 
          N * sizeof(float2))); 
    float2* shifted7; 
    HANDLE_ERROR(cudaMalloc((void**)&shifted7, 
          N * sizeof(float2))); 

    HANDLE_ERROR(cudaEventRecord(start, 0)); 

    // enqueue copies of a in stream0 and stream1 
    HANDLE_ERROR(cudaMemcpyAsync(dev_ref, host_ref, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream2)); 

    HANDLE_ERROR(cudaMemcpyAsync(dev_0, host_0, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream0)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_1, host_1, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream1)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_2, host_2, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream2)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_3, host_3, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream3)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_4, host_4, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream4)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_5, host_5, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream5)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_6, host_6, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream6)); 
    HANDLE_ERROR(cudaMemcpyAsync(dev_7, host_7, 
            sizeof(float2), 
            cudaMemcpyHostToDevice, 
            stream7)); 

    for(int i = 0; i < 100; i++){ 

     shift<<<131072,131072,0>>>(dev_0, i, shifted0); 
     shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1); 
     shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2); 
     shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3); 
     shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4); 
     shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5); 
     shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6); 
     shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7); 

     conjugate<<<131072/256,131072,0,stream0>>>(shifted0); 
     conjugate<<<131072/256,131072,0,stream1>>>(shifted1); 
     conjugate<<<131072/256,131072,0,stream2>>>(shifted2); 
     conjugate<<<131072/256,131072,0,stream3>>>(shifted3); 
     conjugate<<<131072/256,131072,0,stream4>>>(shifted4); 
     conjugate<<<131072/256,131072,0,stream5>>>(shifted5); 
     conjugate<<<131072/256,131072,0,stream6>>>(shifted6); 
     conjugate<<<131072/256,131072,0,stream7>>>(shifted7); 

     multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref); 
     multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref); 

     if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 
     if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){ 
      fprintf(stderr, "CUFFT error: ExecC2C Forward failed"); 
     } 

     fftshift<<<131072,131072,0,stream0>>>(shifted0); 
     fftshift<<<131072,131072,0,stream1>>>(shifted1); 
     fftshift<<<131072,131072,0,stream2>>>(shifted2); 
     fftshift<<<131072,131072,0,stream3>>>(shifted3); 
     fftshift<<<131072,131072,0,stream4>>>(shifted4); 
     fftshift<<<131072,131072,0,stream5>>>(shifted5); 
     fftshift<<<131072,131072,0,stream6>>>(shifted6); 
     fftshift<<<131072,131072,0,stream7>>>(shifted7); 

    } 

    if (cudaThreadSynchronize() != cudaSuccess){ 
     fprintf(stderr, "Cuda error: Failed to synchronize\n"); 
    } 

    float2 *host_last = (float2 *)malloc(8*131072); 

    // enqueue copies of c from device to locked memory 
    HANDLE_ERROR(cudaMemcpyAsync(host_last, shifted0, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream0)); 

    // enqueue copies of c from device to locked memory 
    HANDLE_ERROR(cudaMemcpyAsync(host_0, shifted0, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream0)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_1, shifted1, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream1)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_2, shifted2, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream2)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_3, shifted3, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream3)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_4, shifted4, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream4)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_5, shifted5, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream5)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_6, shifted6, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream6)); 
    HANDLE_ERROR(cudaMemcpyAsync(host_7, shifted7, 
            sizeof(float2), 
            cudaMemcpyDeviceToHost, 
            stream7)); 
    // Streamleri senkronize et 
    HANDLE_ERROR(cudaStreamSynchronize(stream0)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream1)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream2)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream3)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream4)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream5)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream6)); 
    HANDLE_ERROR(cudaStreamSynchronize(stream7)); 


    // Stop timer 
    HANDLE_ERROR(cudaEventRecord(stop, 0)); 
    HANDLE_ERROR(cudaEventSynchronize(stop)); 
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime, 
             start, stop)); 
    printf("Time taken: %3.1f ms\n", elapsedTime); 

    FILE *fp2; 
    if(NULL == (fp2 = fopen("result.bin","wb+"))){ 
     printf("can not open file..."); 
     exit(1); 
    } 

    fwrite(host_last, sizeof(float2), 131072, fp2); 

    printf("signal written \n"); 
    fflush(stdout); 
    fclose(fp2); 

    // cleanup the streams and memory 
    HANDLE_ERROR(cudaFreeHost(host_0)); 
    HANDLE_ERROR(cudaFreeHost(host_1)); 
    HANDLE_ERROR(cudaFreeHost(host_2)); 
    HANDLE_ERROR(cudaFreeHost(host_3)); 
    HANDLE_ERROR(cudaFreeHost(host_4)); 
    HANDLE_ERROR(cudaFreeHost(host_5)); 
    HANDLE_ERROR(cudaFreeHost(host_6)); 
    HANDLE_ERROR(cudaFreeHost(host_7)); 

    HANDLE_ERROR(cudaFree(dev_0)); 
    HANDLE_ERROR(cudaFree(dev_1)); 
    HANDLE_ERROR(cudaFree(dev_2)); 
    HANDLE_ERROR(cudaFree(dev_3)); 
    HANDLE_ERROR(cudaFree(dev_4)); 
    HANDLE_ERROR(cudaFree(dev_5)); 
    HANDLE_ERROR(cudaFree(dev_6)); 
    HANDLE_ERROR(cudaFree(dev_7)); 

    cufftDestroy(plan0); 
    cufftDestroy(plan1); 
    cufftDestroy(plan2); 
    cufftDestroy(plan3); 
    cufftDestroy(plan4); 
    cufftDestroy(plan5); 
    cufftDestroy(plan6); 
    cufftDestroy(plan7); 

    HANDLE_ERROR(cudaStreamDestroy(stream0)); 
    HANDLE_ERROR(cudaStreamDestroy(stream1)); 
    HANDLE_ERROR(cudaStreamDestroy(stream2)); 
    HANDLE_ERROR(cudaStreamDestroy(stream3)); 
    HANDLE_ERROR(cudaStreamDestroy(stream4)); 
    HANDLE_ERROR(cudaStreamDestroy(stream5)); 
    HANDLE_ERROR(cudaStreamDestroy(stream6)); 
    HANDLE_ERROR(cudaStreamDestroy(stream7)); 

    printf("hit [enter] to exit..."); 
    fflush(stdout); 
    getchar(); 

    return 0; 
} 
重現該問題需要

二進制文件是這個環節中:

Binary file

當我運行在發佈exe文件「CUDA-MEMCHECK」我得到以下結果:

memcheck result

+0

我不得不將'#include '和'#include「book.h」'行註釋掉,並在此[post]中添加錯誤檢查(http://stackoverflow.com/questions/14038589/what-the-can-can-way-check-for-errors-using-the-cuda-runtime-api)來成功編譯你的代碼。但是,代碼加載的二進制文件目前缺失。有沒有可能重現你的問題? – JackOLantern

+0

感謝您的更新,我添加了二進制文件。 – uahakan

回答

1

當Nsight VSE調試GPU的代碼,你需要開始通過Nsight菜單調試(「開始CUDA調試」 )。有關更多信息,請參閱this walkthrough

編輯

根據您所提供的附加信息,尤其是cuda-memcheck輸出,好像你的內核實際上沒有被啓動。錯誤9是cudaErrorInvalidConfiguration,指示啓動配置(塊,線程/塊,smem /塊)與設備不兼容。

cudaErrorInvalidConfiguration = 9

這表明在內核啓動的請求不能由當前的設備來滿足資源。每個塊請求比設備支持更多的共享內存將觸發此錯誤,因爲請求太多的線程或塊。有關更多設備限制,請參閱cudaDeviceProp。

事實上,你想發動131072個線程/塊這是遠高於限制(見details併爲specific limits編程指南)。您應該啓動較小的塊並相應地增加塊的數量。

正如Robert Crovella所說,您應該始終確保您有適當的錯誤檢查。

+0

我已經在使用該選項進行調試,我可以調試其他項目。我無法在我的帖子中調試代碼。 – uahakan

+0

你試圖調試一個「Debug」或「Release」項目嗎?如果使用'cuda-memcheck'在Visual Studio之外運行可執行文件,會發生什麼情況? –

+0

我正在嘗試調試「調試」項目。程序使用Visual Studio調試完成,但是當我運行cuda-memcheck時出現錯誤,並將結果圖像添加到我的文章中。 – uahakan

相關問題