I am trying to explore the concurrent kernel execution behaviour of my NVIDIA Quadro 4000, which has compute capability 2.0, but the CUDA kernels do not execute concurrently.
I use 2 different streams, each of which runs the same sequence:
- Copy H2D two chunks of pinned memory
- Run the kernel
- Copy D2H into two different chunks of pinned memory
The kernels of the two streams are identical, and each has an execution time of 190 ms.
In the Visual Profiler (version 5.0) I expected the kernels of both streams to start executing simultaneously; however, they overlap by only 20 ms. Here is the code sample:
//create the streams
cudaStream_t stream0,stream1;
CHK_ERR(cudaStreamCreate(&stream0));
CHK_ERR(cudaStreamCreate(&stream1));
//allocate the memory on the GPU for stream0
CHK_ERR(cudaMalloc((void **)&def_img0, width*height*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&ref_img0, width*height*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&outY_img0,width_size_for_out*height_size_for_out*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&outX_img0,width_size_for_out*height_size_for_out*sizeof(char)));
//allocate the memory on the GPU for stream1
CHK_ERR(cudaMalloc((void **)&def_img1, width*height*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&ref_img1, width*height*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&outY_img1,width_size_for_out*height_size_for_out*sizeof(char)));
CHK_ERR(cudaMalloc((void **)&outX_img1,width_size_for_out*height_size_for_out*sizeof(char)));
//allocate page-locked memory for stream0
CHK_ERR(cudaHostAlloc((void**)&host01, width*height*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host02, width*height*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host03, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host04, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
//allocate page-locked memory for stream1
CHK_ERR(cudaHostAlloc((void**)&host11, width*height*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host12, width*height*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host13, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
CHK_ERR(cudaHostAlloc((void**)&host14, width_size_for_out*height_size_for_out*sizeof(char), cudaHostAllocDefault));
//fill the pinned buffers with the input images
memcpy(host01,in1,width*height*sizeof(char));
memcpy(host02,in2,width*height*sizeof(char));
memcpy(host11,in1,width*height*sizeof(char));
memcpy(host12,in2,width*height*sizeof(char));
//events for timing the whole sequence
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//launch configuration: each block processes a tile 4x the block dimension on each axis
dim3 dimBlock(CUDA_BLOCK_DIM, CUDA_BLOCK_DIM);
dim3 Grid((width-SEARCH_RADIUS*2-1)/(dimBlock.x*4)+1, (height-SEARCH_RADIUS*2-1)/(dimBlock.y*4)+1);
cudaEventRecord(start,0);
// --------------------
// Copy images to device
// --------------------
//enqueue copies of def stream0 and stream1
CHK_ERR(cudaMemcpyAsync(def_img0, host01,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
CHK_ERR(cudaMemcpyAsync(def_img1, host11,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
//enqueue copies of ref stream0 and stream1
CHK_ERR(cudaMemcpyAsync(ref_img0, host02,width*height*sizeof(char), cudaMemcpyHostToDevice,stream0));
CHK_ERR(cudaMemcpyAsync(ref_img1, host12,width*height*sizeof(char), cudaMemcpyHostToDevice,stream1));
//wait until both H2D copies have completed before launching the kernels
CHK_ERR(cudaStreamSynchronize(stream0));
CHK_ERR(cudaStreamSynchronize(stream1));
// --------------------
// Launch the kernels
// --------------------
//enqueue the kernel in stream0 and stream1
TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream0>>>(def_img0+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img0,outX_img0,outY_img0,width,width_size_for_out)),"exhaustiveSearchKernel stream0");
TIME_KERNEL((exhaustiveSearchKernel<CUDA_BLOCK_DIM*4,CUDA_BLOCK_DIM*4,SEARCH_RADIUS><<< Grid, dimBlock,0,stream1>>>(def_img1+SEARCH_RADIUS*width+SEARCH_RADIUS,ref_img1,outX_img1,outY_img1,width,width_size_for_out)),"exhaustiveSearchKernel stream1");
//Copy result back
CHK_ERR(cudaMemcpyAsync(host03, outX_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
CHK_ERR(cudaMemcpyAsync(host13, outX_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
CHK_ERR(cudaMemcpyAsync(host04, outY_img0, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream0));
CHK_ERR(cudaMemcpyAsync(host14, outY_img1, width_size_for_out*height_size_for_out*sizeof(char), cudaMemcpyDeviceToHost,stream1));
CHK_ERR(cudaStreamSynchronize(stream0));
CHK_ERR(cudaStreamSynchronize(stream1));
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Elapsed time=%f ms\n",time);
memcpy(outX,host03,width_size_for_out*height_size_for_out*sizeof(char));
memcpy(outY,host04,width_size_for_out*height_size_for_out*sizeof(char));
cudaEventDestroy(start);
cudaEventDestroy(stop);
CHK_ERR(cudaStreamDestroy(stream0));
CHK_ERR(cudaStreamDestroy(stream1));
CHK_ERR(cudaDeviceReset());
}
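As a sanity check (not part of the run above; a minimal sketch assuming device 0 and the same CHK_ERR macro), the device can be queried to confirm it reports concurrent kernel support:

cudaDeviceProp prop;
CHK_ERR(cudaGetDeviceProperties(&prop, 0));
printf("Compute capability: %d.%d, concurrentKernels: %d\n", prop.major, prop.minor, prop.concurrentKernels);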
You haven't really asked a question here. – talonmies
Please provide the sequence of API calls used and the launch configuration of the kernels. On a compute capability 2.x device, the compute work distributor will distribute all of the work for the first kernel before distributing any work for the second kernel. –
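To illustrate that point, here is a hypothetical micro-experiment (a sketch, not from this thread; the spin kernel and its sizes are illustrative). Two long-running one-block kernels in separate streams leave the compute work distributor room to schedule both at once, so they should overlap almost completely on a CC 2.x device, whereas grids large enough to fill the machine will mostly serialize:

#include <cstdio>
#include <cuda_runtime.h>

//busy-wait kernel: spins on the SM clock for roughly the given number of cycles
__global__ void spin(long long cycles)
{
    long long start = clock64();
    while (clock64() - start < cycles) { }
}

int main()
{
    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);
    //one block each: both kernels fit on the device simultaneously and can overlap
    spin<<<1, 32, 0, s0>>>(100000000LL);
    spin<<<1, 32, 0, s1>>>(100000000LL);
    cudaDeviceSynchronize();
    cudaStreamDestroy(s0);
    cudaStreamDestroy(s1);
    printf("done\n");
    return 0;
}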
You still haven't mentioned the launch configuration of the kernels (block and grid sizes), and this still isn't really a question.... – talonmies