我無法調試爲其設置斷點的「全球」函數行。我使用NSight菜單中的「啓動CUDA調試」選項進行調試。使用NSight Visual Studio 2010進行CUDA調試
我NSight插件安裝成功了VS 2010,我能夠調試我的其他項目(示例項目NSight調試器中傳來)
我的代碼是在這裏(這是一個有點長,但一般重複同樣的功能) :
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "device_launch_parameters.h"
#include <cuda_runtime.h>
#include <cufft.h>
#include <helper_cuda.h>
#include "book.h"
#define N (131072)
__global__ void conjugate(float2 *a) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
a[idx] = cuConjf(a[idx]);
}
}
__global__ void multWithReference(float2 *signal, float2 *reference) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < N) {
signal[idx].x = signal[idx].x * reference[idx].x;
signal[idx].y = signal[idx].y * reference[idx].y;
}
}
__global__ void shift(float2 *signal, size_t shiftamount, float2* shifted) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
*(shifted+((idx+shiftamount)%131072)) = *(signal+idx);
}
__global__ void fftshift(float2 *u_d)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < 131072)
{
double a = 1-2*(i&1);
u_d[i].x *= a;
u_d[i].y *= a;
}
}
static inline cufftHandle createFFTPlan(cudaStream_t* stream)
{
cufftHandle plan;
if (cudaGetLastError() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to allocate\n");
}
if (cufftPlan1d(&plan, 131072, CUFFT_C2C,1) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan creation failed");
}
if (cufftSetStream(plan, *stream) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: Plan stream association failed");
}
return plan;
}
int main(void) {
cudaDeviceProp prop;
int whichDevice;
HANDLE_ERROR(cudaGetDevice(&whichDevice));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
if (!prop.deviceOverlap) {
printf("Device will not handle overlaps, so no speed up from streams\n");
return 0;
}
cudaEvent_t start, stop;
float elapsedTime;
cudaStream_t stream0, stream1, stream2, stream3, stream4, stream5, stream6, stream7;
float2* host_ref, *host_0, *host_1, *host_2, *host_3, *host_4, *host_5, *host_6, *host_7;
float2* dev_ref, *dev_0, *dev_1, *dev_2, *dev_3, *dev_4, *dev_5, *dev_6, *dev_7;
// start the timers
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
// initialize the streams
HANDLE_ERROR(cudaStreamCreate(&stream0));
HANDLE_ERROR(cudaStreamCreate(&stream1));
HANDLE_ERROR(cudaStreamCreate(&stream2));
HANDLE_ERROR(cudaStreamCreate(&stream3));
HANDLE_ERROR(cudaStreamCreate(&stream4));
HANDLE_ERROR(cudaStreamCreate(&stream5));
HANDLE_ERROR(cudaStreamCreate(&stream6));
HANDLE_ERROR(cudaStreamCreate(&stream7));
// allocate the memory on the GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_ref,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_0,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_1,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_2,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_3,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_4,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_5,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_6,
N * sizeof(float2)));
HANDLE_ERROR(cudaMalloc((void**)&dev_7,
N * sizeof(float2)));
// allocate host locked memory, used to stream
HANDLE_ERROR(cudaHostAlloc((void**)&host_ref,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_0,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_1,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_2,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_3,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_4,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_5,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_6,
N * sizeof(float2),
cudaHostAllocDefault));
HANDLE_ERROR(cudaHostAlloc((void**)&host_7,
N * sizeof(float2),
cudaHostAllocDefault));
// Open signal file
FILE *fp;
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_ref, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_0, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_1, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_2, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_3, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_4, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_5, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_6, sizeof(float2), 131072, fp);
fclose(fp);
if(NULL == (fp = fopen("testSignal4.bin","r"))){
printf("can not open file...");
exit(1);
}
fread(host_7, sizeof(float2), 131072, fp);
fclose(fp);
// create FFT plans
cufftHandle plan0 = createFFTPlan(&stream0);
cufftHandle plan1 = createFFTPlan(&stream1);
cufftHandle plan2 = createFFTPlan(&stream2);
cufftHandle plan3 = createFFTPlan(&stream3);
cufftHandle plan4 = createFFTPlan(&stream4);
cufftHandle plan5 = createFFTPlan(&stream5);
cufftHandle plan6 = createFFTPlan(&stream6);
cufftHandle plan7 = createFFTPlan(&stream7);
float2* shifted0;
HANDLE_ERROR(cudaMalloc((void**)&shifted0,
N * sizeof(float2)));
float2* shifted1;
HANDLE_ERROR(cudaMalloc((void**)&shifted1,
N * sizeof(float2)));
float2* shifted2;
HANDLE_ERROR(cudaMalloc((void**)&shifted2,
N * sizeof(float2)));
float2* shifted3;
HANDLE_ERROR(cudaMalloc((void**)&shifted3,
N * sizeof(float2)));
float2* shifted4;
HANDLE_ERROR(cudaMalloc((void**)&shifted4,
N * sizeof(float2)));
float2* shifted5;
HANDLE_ERROR(cudaMalloc((void**)&shifted5,
N * sizeof(float2)));
float2* shifted6;
HANDLE_ERROR(cudaMalloc((void**)&shifted6,
N * sizeof(float2)));
float2* shifted7;
HANDLE_ERROR(cudaMalloc((void**)&shifted7,
N * sizeof(float2)));
HANDLE_ERROR(cudaEventRecord(start, 0));
// enqueue copies of a in stream0 and stream1
HANDLE_ERROR(cudaMemcpyAsync(dev_ref, host_ref,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2));
HANDLE_ERROR(cudaMemcpyAsync(dev_0, host_0,
sizeof(float2),
cudaMemcpyHostToDevice,
stream0));
HANDLE_ERROR(cudaMemcpyAsync(dev_1, host_1,
sizeof(float2),
cudaMemcpyHostToDevice,
stream1));
HANDLE_ERROR(cudaMemcpyAsync(dev_2, host_2,
sizeof(float2),
cudaMemcpyHostToDevice,
stream2));
HANDLE_ERROR(cudaMemcpyAsync(dev_3, host_3,
sizeof(float2),
cudaMemcpyHostToDevice,
stream3));
HANDLE_ERROR(cudaMemcpyAsync(dev_4, host_4,
sizeof(float2),
cudaMemcpyHostToDevice,
stream4));
HANDLE_ERROR(cudaMemcpyAsync(dev_5, host_5,
sizeof(float2),
cudaMemcpyHostToDevice,
stream5));
HANDLE_ERROR(cudaMemcpyAsync(dev_6, host_6,
sizeof(float2),
cudaMemcpyHostToDevice,
stream6));
HANDLE_ERROR(cudaMemcpyAsync(dev_7, host_7,
sizeof(float2),
cudaMemcpyHostToDevice,
stream7));
for(int i = 0; i < 100; i++){
shift<<<131072,131072,0>>>(dev_0, i, shifted0);
shift<<<131072,131072,0,stream1>>>(dev_1, i, shifted1);
shift<<<131072,131072,0,stream2>>>(dev_2, i, shifted2);
shift<<<131072,131072,0,stream3>>>(dev_3, i, shifted3);
shift<<<131072,131072,0,stream4>>>(dev_4, i, shifted4);
shift<<<131072,131072,0,stream5>>>(dev_5, i, shifted5);
shift<<<131072,131072,0,stream6>>>(dev_6, i, shifted6);
shift<<<131072,131072,0,stream7>>>(dev_7, i, shifted7);
conjugate<<<131072/256,131072,0,stream0>>>(shifted0);
conjugate<<<131072/256,131072,0,stream1>>>(shifted1);
conjugate<<<131072/256,131072,0,stream2>>>(shifted2);
conjugate<<<131072/256,131072,0,stream3>>>(shifted3);
conjugate<<<131072/256,131072,0,stream4>>>(shifted4);
conjugate<<<131072/256,131072,0,stream5>>>(shifted5);
conjugate<<<131072/256,131072,0,stream6>>>(shifted6);
conjugate<<<131072/256,131072,0,stream7>>>(shifted7);
multWithReference<<<131072/256,131072,0,stream0>>>(shifted0,dev_ref);
multWithReference<<<131072/256,131072,0,stream1>>>(shifted1,dev_ref);
multWithReference<<<131072/256,131072,0,stream2>>>(shifted2,dev_ref);
multWithReference<<<131072/256,131072,0,stream3>>>(shifted3,dev_ref);
multWithReference<<<131072/256,131072,0,stream4>>>(shifted4,dev_ref);
multWithReference<<<131072/256,131072,0,stream5>>>(shifted5,dev_ref);
multWithReference<<<131072/256,131072,0,stream6>>>(shifted6,dev_ref);
multWithReference<<<131072/256,131072,0,stream7>>>(shifted7,dev_ref);
if (cufftExecC2C(plan0, shifted0, shifted0, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan1, shifted1, shifted1, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan2, shifted2, shifted2, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan3, shifted3, shifted3, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan4, shifted4, shifted4, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan5, shifted5, shifted5, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan6, shifted6, shifted6, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
if (cufftExecC2C(plan7, shifted7, shifted7, CUFFT_FORWARD) != CUFFT_SUCCESS){
fprintf(stderr, "CUFFT error: ExecC2C Forward failed");
}
fftshift<<<131072,131072,0,stream0>>>(shifted0);
fftshift<<<131072,131072,0,stream1>>>(shifted1);
fftshift<<<131072,131072,0,stream2>>>(shifted2);
fftshift<<<131072,131072,0,stream3>>>(shifted3);
fftshift<<<131072,131072,0,stream4>>>(shifted4);
fftshift<<<131072,131072,0,stream5>>>(shifted5);
fftshift<<<131072,131072,0,stream6>>>(shifted6);
fftshift<<<131072,131072,0,stream7>>>(shifted7);
}
if (cudaThreadSynchronize() != cudaSuccess){
fprintf(stderr, "Cuda error: Failed to synchronize\n");
}
float2 *host_last = (float2 *)malloc(8*131072);
// enqueue copies of c from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_last, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0));
// enqueue copies of c from device to locked memory
HANDLE_ERROR(cudaMemcpyAsync(host_0, shifted0,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream0));
HANDLE_ERROR(cudaMemcpyAsync(host_1, shifted1,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream1));
HANDLE_ERROR(cudaMemcpyAsync(host_2, shifted2,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream2));
HANDLE_ERROR(cudaMemcpyAsync(host_3, shifted3,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream3));
HANDLE_ERROR(cudaMemcpyAsync(host_4, shifted4,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream4));
HANDLE_ERROR(cudaMemcpyAsync(host_5, shifted5,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream5));
HANDLE_ERROR(cudaMemcpyAsync(host_6, shifted6,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream6));
HANDLE_ERROR(cudaMemcpyAsync(host_7, shifted7,
sizeof(float2),
cudaMemcpyDeviceToHost,
stream7));
// Streamleri senkronize et
HANDLE_ERROR(cudaStreamSynchronize(stream0));
HANDLE_ERROR(cudaStreamSynchronize(stream1));
HANDLE_ERROR(cudaStreamSynchronize(stream2));
HANDLE_ERROR(cudaStreamSynchronize(stream3));
HANDLE_ERROR(cudaStreamSynchronize(stream4));
HANDLE_ERROR(cudaStreamSynchronize(stream5));
HANDLE_ERROR(cudaStreamSynchronize(stream6));
HANDLE_ERROR(cudaStreamSynchronize(stream7));
// Stop timer
HANDLE_ERROR(cudaEventRecord(stop, 0));
HANDLE_ERROR(cudaEventSynchronize(stop));
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
start, stop));
printf("Time taken: %3.1f ms\n", elapsedTime);
FILE *fp2;
if(NULL == (fp2 = fopen("result.bin","wb+"))){
printf("can not open file...");
exit(1);
}
fwrite(host_last, sizeof(float2), 131072, fp2);
printf("signal written \n");
fflush(stdout);
fclose(fp2);
// cleanup the streams and memory
HANDLE_ERROR(cudaFreeHost(host_0));
HANDLE_ERROR(cudaFreeHost(host_1));
HANDLE_ERROR(cudaFreeHost(host_2));
HANDLE_ERROR(cudaFreeHost(host_3));
HANDLE_ERROR(cudaFreeHost(host_4));
HANDLE_ERROR(cudaFreeHost(host_5));
HANDLE_ERROR(cudaFreeHost(host_6));
HANDLE_ERROR(cudaFreeHost(host_7));
HANDLE_ERROR(cudaFree(dev_0));
HANDLE_ERROR(cudaFree(dev_1));
HANDLE_ERROR(cudaFree(dev_2));
HANDLE_ERROR(cudaFree(dev_3));
HANDLE_ERROR(cudaFree(dev_4));
HANDLE_ERROR(cudaFree(dev_5));
HANDLE_ERROR(cudaFree(dev_6));
HANDLE_ERROR(cudaFree(dev_7));
cufftDestroy(plan0);
cufftDestroy(plan1);
cufftDestroy(plan2);
cufftDestroy(plan3);
cufftDestroy(plan4);
cufftDestroy(plan5);
cufftDestroy(plan6);
cufftDestroy(plan7);
HANDLE_ERROR(cudaStreamDestroy(stream0));
HANDLE_ERROR(cudaStreamDestroy(stream1));
HANDLE_ERROR(cudaStreamDestroy(stream2));
HANDLE_ERROR(cudaStreamDestroy(stream3));
HANDLE_ERROR(cudaStreamDestroy(stream4));
HANDLE_ERROR(cudaStreamDestroy(stream5));
HANDLE_ERROR(cudaStreamDestroy(stream6));
HANDLE_ERROR(cudaStreamDestroy(stream7));
printf("hit [enter] to exit...");
fflush(stdout);
getchar();
return 0;
}
重現該問題需要
二進制文件是這個環節中:
當我運行在發佈exe文件「CUDA-MEMCHECK」我得到以下結果:
我不得不將'#include'和'#include「book.h」'行註釋掉,並在此[post]中添加錯誤檢查(http://stackoverflow.com/questions/14038589/what-the-can-can-way-check-for-errors-using-the-cuda-runtime-api)來成功編譯你的代碼。但是,代碼加載的二進制文件目前缺失。有沒有可能重現你的問題? –
JackOLantern
感謝您的更新,我添加了二進制文件。 – uahakan