使用 Visual Studio 2010、Windows 7 與 Nsight 2.1 時,__global__ 函式中的斷點不會被命中
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// incrementArray.cu
#include <stdio.h>
#include <assert.h>
// Host-side reference implementation: adds 1.0f, in place, to each of the
// first N elements of a. Does nothing when N <= 0.
void incrementArrayOnHost(float *a, int N)
{
    int k = 0;
    while (k < N) {
        a[k] += 1.f;
        ++k;
    }
}
// Kernel: each thread adds 1.0f to at most one element of a.
//   a - array to update in place (indexed by the thread's flat index)
//   N - number of elements; threads whose index is >= N leave a untouched
__global__ void incrementArrayOnDevice(float *a, int N)
{
// Flat 1D index of this thread, built from the x components only.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// j and i exist only as breakpoint/watch targets: their values are never
// read again and never stored to a, so they do not affect the result.
int j = idx;
int i = 2;
i = i+j; //->breakpoint here
// Guard: only threads with idx < N perform the read-modify-write.
if (idx<N) a[idx] = a[idx]+1.f; //->breakpoint here
}
// Reports a failed CUDA runtime call on stderr.
// Returns 0 when err is cudaSuccess, 1 otherwise, so callers can branch on it.
static int cudaFailed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) return 0;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return 1;
}

// Increments a small array on the host and on the device and compares the
// two results element by element.
// Returns 0 when every step succeeds and the results match, 1 otherwise.
// Every CUDA call and the kernel launch are checked, and the comparison is an
// explicit test rather than assert() so it still runs when NDEBUG is defined.
int main(void)
{
    // All locals are declared up front so that no `goto cleanup` jumps over
    // an initialization (which C++ does not allow).
    float *a_h = NULL, *b_h = NULL; // pointers to host memory
    float *a_d = NULL;              // pointer to device memory
    const int N = 10;
    const int blockSize = 4;
    // Ceiling division: enough blocks to cover N when it is not a multiple
    // of blockSize.
    const int nBlocks = (N + blockSize - 1) / blockSize;
    const size_t size = N * sizeof(float);
    int i;
    int status = 1; // stays nonzero until every check has passed

    // allocate arrays on host
    a_h = (float *)malloc(size);
    b_h = (float *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)size);
        goto cleanup;
    }

    // allocate array on device
    if (cudaFailed(cudaMalloc((void **)&a_d, size), "cudaMalloc")) goto cleanup;

    // initialization of host data
    for (i = 0; i < N; i++) a_h[i] = (float)i;

    // copy data from host to device
    if (cudaFailed(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice),
                   "host-to-device copy")) goto cleanup;

    // do calculation on host
    incrementArrayOnHost(a_h, N);

    // do calculation on device
    incrementArrayOnDevice <<< nBlocks, blockSize >>> (a_d, N);
    // A launch does not return an error code: configuration errors show up
    // in cudaGetLastError(), execution faults at the next synchronization.
    if (cudaFailed(cudaGetLastError(), "kernel launch")) goto cleanup;
    if (cudaFailed(cudaDeviceSynchronize(), "kernel execution")) goto cleanup;

    // Retrieve result from device and store in b_h
    if (cudaFailed(cudaMemcpy(b_h, a_d, size, cudaMemcpyDeviceToHost),
                   "device-to-host copy")) goto cleanup;

    // check results
    for (i = 0; i < N; i++) {
        if (a_h[i] != b_h[i]) {
            fprintf(stderr, "Mismatch at index %d: host %f, device %f\n",
                    i, a_h[i], b_h[i]);
            goto cleanup;
        }
    }
    status = 0;

cleanup:
    // free(NULL) and cudaFree(NULL) are no-ops, so this is safe on every path.
    free(a_h);
    free(b_h);
    cudaFree(a_d);
    return status;
}
我試着在上面列出的 `__global__ void incrementArrayOnDevice(float *a, int N)` 內插入斷點,但它們都沒有被命中。當我在 Visual Studio 中運行調試(F5)時,我嘗試單步進入 incrementArrayOnDevice,但調試器會跳過整個內核代碼部分。
我也試圖在變量 i 和 j 上添加監視,但出現錯誤「CXX0017: Error: symbol "i" not found」。
這種情況正常嗎?有人可以在自己的電腦上試試,告訴我是否能命中斷點嗎?如果可以,那我這邊可能出了什麼問題?請幫忙! :(
您嘗試設置斷點的代碼在編譯器輸出中並不存在。CUDA 編譯器會非常積極地去除對內核寫入內存的結果沒有任何貢獻的「死」代碼。如果你讓 `a[idx]` 依賴於 `i`,這段代碼就會被編譯器保留,你應該就能在調試過程中檢查它。 – talonmies 2012-02-10 14:01:41
你是在做本地調試還是遠程調試?我認爲 _F5_ 在 Visual Studio 中默認啓動的是本地(CPU)調試,而「Nsight -> 啓動 CUDA 調試」則會依照啓動選項中的設定來啓動調試。 – pQB 2012-02-10 16:05:47