我正在分析一個被多次啓動的內核所花費的總時間,想知道下面這段代碼測得的是否就是各個流上內核的總耗時,還是需要把返回的時間乘以啓動次數。(標題:測量使用流時內核花費的總時間)
// Measures the total time spent in the GPU kernel launches, summed over every
// loop iteration, and prints it in milliseconds.
//
// The events are re-recorded on each iteration, so a single
// cudaEventElapsedTime() call after the loop would only report the interval
// of the LAST iteration (and could fail with cudaErrorNotReady, because
// nothing waited for `stop` to complete). Instead, each iteration's interval
// is read back after synchronizing on `stop` and added to a running total.
//
// NOTE(review): the events are recorded on stream 0. This brackets the two
// kernels only if stream0/stream1 are ordinary blocking streams and the
// legacy default stream is in use -- confirm how the streams are created.
// NOTE(review): the loop assumes SIZE is a multiple of 2*N and N a multiple
// of 512; a partial tail would not be handled -- confirm against the caller.
cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));

// Running total of kernel time across all iterations, in milliseconds.
float elapsedTime = 0.0f;

for(x=0; x<SIZE; x+=N*2){
    // Stage this iteration's input chunks and the lookup array on both streams.
    gpuErrchk(cudaMemcpyAsync(data_d0, data_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(data_d1, data_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1));
    gpuErrchk(cudaMemcpyAsync(array_d0, array_h, wrap->size*sizeof(node_r), cudaMemcpyHostToDevice, stream0));
    gpuErrchk(cudaMemcpyAsync(array_d1, array_h, wrap->size*sizeof(node_r), cudaMemcpyHostToDevice, stream1));

    // Bracket only the two kernel launches with the timing events.
    gpuErrchk(cudaEventRecord(start, 0));
    GPU<<<N/512,512,0,stream0>>>(array_d0, data_d0, out_d0);
    GPU<<<N/512,512,0,stream1>>>(array_d1, data_d1, out_d1);
    // Kernel launches do not return a status; pick up launch-configuration errors here.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaEventRecord(stop, 0));

    // Queue the result copies before blocking the host, so the device has
    // work lined up as soon as the kernels finish.
    gpuErrchk(cudaMemcpyAsync(out_h+x, out_d0 , N * sizeof(int), cudaMemcpyDeviceToHost, stream0));
    gpuErrchk(cudaMemcpyAsync(out_h+x+N, out_d1 ,N * sizeof(int), cudaMemcpyDeviceToHost, stream1));

    // The events are reused next iteration, so this iteration's interval must
    // be read now; `stop` has to complete before it can be queried.
    gpuErrchk(cudaEventSynchronize(stop));
    float iterationMs = 0.0f;
    gpuErrchk(cudaEventElapsedTime(&iterationMs, start, stop));
    elapsedTime += iterationMs;
}

gpuErrchk(cudaEventDestroy(start));
gpuErrchk(cudaEventDestroy(stop));
printf("Time %f ms\n", elapsedTime);