I will try to provide an explanation of why you do not see your two kernels overlap in execution. To this end, I have constructed the code reported below, which uses your two kernels and monitors which Streaming Multiprocessor (SM) each block runs on. I am using CUDA 6.5 (Release Candidate) on a GT540M card, which has only 2 SMs, so it provides a simple playground to work with. The blockSize choice is delegated to the new CUDA 6.5 cudaOccupancyMaxPotentialBlockSize facility.
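(As an aside, if you want to check how many SMs your own device exposes, a minimal sketch along these lines will print the count; device 0 is assumed here:)

#include <stdio.h>

int main()
{
    // --- Query the properties of device 0 and print its number of SMs
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device %s has %d SMs\n", prop.name, prop.multiProcessorCount);
    return 0;
}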
THE CODE
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
//#define DEBUG_MODE
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**************************************************/
/* STREAMING MULTIPROCESSOR IDENTIFICATION NUMBER */
/**************************************************/
__device__ unsigned int get_smid(void) {
unsigned int ret;
asm("mov.u32 %0, %smid;" : "=r"(ret));
return ret;
}
/************/
/* KERNEL 1 */
/************/
__global__ void kernel_1(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N) {
data[i] = data[i]/value;
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_1(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i]/value;
// }
//}
/************/
/* KERNEL 2 */
/************/
__global__ void kernel_2(float * __restrict__ data, const float value, int *sm, int N)
{
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i < N) {
data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
if (threadIdx.x==0) sm[blockIdx.x]=get_smid();
}
}
//__global__ void kernel_2(float* data, float value, int N)
//{
// int start = blockIdx.x * blockDim.x + threadIdx.x;
// for (int i = start; i < N; i += blockDim.x * gridDim.x)
// {
// data[i] = data[i] * (value + sin(double(i)) * 1./cos(double(i)));
// }
//}
/********/
/* MAIN */
/********/
int main()
{
const int N = 10000;
const float value = 5.55f;
const int rep_num = 20;
// --- CPU memory allocations
float *h_data1 = (float*) malloc(N*sizeof(float));
float *h_data2 = (float*) malloc(N*sizeof(float));
float *h_data1_ref = (float*) malloc(N*sizeof(float));
float *h_data2_ref = (float*) malloc(N*sizeof(float));
// --- CPU data initializations
srand(time(NULL));
for (int i=0; i<N; i++) {
h_data1[i] = rand()/(float)RAND_MAX;
h_data2[i] = rand()/(float)RAND_MAX;
}
// --- GPU memory allocations
float *d_data1, *d_data2;
gpuErrchk(cudaMalloc((void**)&d_data1, N*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_data2, N*sizeof(float)));
// --- CPU -> GPU memory transfers
gpuErrchk(cudaMemcpy(d_data1, h_data1, N*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice));
// --- Reference results computed on the CPU
for (int i=0; i<N; i++) {
h_data1_ref[i] = h_data1[i]/value;
h_data2_ref[i] = h_data2[i] * (value + sin(double(i)) * 1./cos(double(i)));
}
// --- Stream creations
cudaStream_t stream1, stream2;
gpuErrchk(cudaStreamCreate(&stream1));
gpuErrchk(cudaStreamCreate(&stream2));
// --- Launch parameters configuration
int blockSize1, blockSize2, minGridSize1, minGridSize2, gridSize1, gridSize2;
cudaOccupancyMaxPotentialBlockSize(&minGridSize1, &blockSize1, kernel_1, 0, N);
cudaOccupancyMaxPotentialBlockSize(&minGridSize2, &blockSize2, kernel_2, 0, N);
gridSize1 = (N + blockSize1 - 1)/blockSize1;
gridSize2 = (N + blockSize2 - 1)/blockSize2;
// --- Allocating space for SM IDs
int *h_sm_11 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_12 = (int*) malloc(gridSize1*sizeof(int));
int *h_sm_21 = (int*) malloc(gridSize2*sizeof(int));
int *h_sm_22 = (int*) malloc(gridSize2*sizeof(int));
int *d_sm_11, *d_sm_12, *d_sm_21, *d_sm_22;
gpuErrchk(cudaMalloc((void**)&d_sm_11, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_12, gridSize1*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_21, gridSize2*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_sm_22, gridSize2*sizeof(int)));
// --- Timing individual kernels
float time;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 1 - elapsed time: %3.3f ms \n", time/rep_num);
cudaEventRecord(start, 0);
for (int i=0; i<rep_num; i++) kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Kernel 2 - elapsed time: %3.3f ms \n", time/rep_num);
// --- No stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1>>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data1, d_data1, N*sizeof(float), cudaMemcpyDeviceToHost));
// --- Results check
for (int i=0; i<N; i++) {
if (h_data1[i] != h_data1_ref[i]) {
printf("Kernel1 - Error at i = %i; Host = %f; Device = %f\n", i, h_data1_ref[i], h_data1[i]);
return 1;
}
}
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1>>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(d_data2, h_data2, N*sizeof(float), cudaMemcpyHostToDevice)); // --- Restore d_data2 so the kernel_2 check below matches the reference
#endif
kernel_2<<<gridSize2, blockSize2>>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaMemcpy(h_data2, d_data2, N*sizeof(float), cudaMemcpyDeviceToHost));
for (int i=0; i<N; i++) {
if (h_data2[i] != h_data2_ref[i]) {
printf("Kernel2 - Error at i = %i; Host = %f; Device = %f\n", i, h_data2_ref[i], h_data2[i]);
return 1;
}
}
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("No stream - elapsed time: %3.3f ms \n", time);
// --- Stream case
cudaEventRecord(start, 0);
kernel_1<<<gridSize1, blockSize1, 0, stream1 >>>(d_data1, value, d_sm_11, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_1<<<gridSize1, blockSize1, 0, stream2 >>>(d_data2, value, d_sm_12, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream1 >>>(d_data1, value, d_sm_21, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
kernel_2<<<gridSize2, blockSize2, 0, stream2 >>>(d_data2, value, d_sm_22, N);
#ifdef DEBUG_MODE
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
#endif
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Stream - elapsed time: %3.3f ms \n", time);
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
printf("Test passed!\n");
gpuErrchk(cudaMemcpy(h_sm_11, d_sm_11, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_12, d_sm_12, gridSize1*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_21, d_sm_21, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_sm_22, d_sm_22, gridSize2*sizeof(int), cudaMemcpyDeviceToHost));
printf("Kernel 1: gridSize = %i; blockSize = %i\n", gridSize1, blockSize1);
printf("Kernel 2: gridSize = %i; blockSize = %i\n", gridSize2, blockSize2);
for (int i=0; i<gridSize1; i++) {
printf("Kernel 1 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_11[i]);
printf("Kernel 1 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_12[i]);
}
for (int i=0; i<gridSize2; i++) {
printf("Kernel 2 - Data 1: blockNumber = %i; SMID = %d\n", i, h_sm_21[i]);
printf("Kernel 2 - Data 2: blockNumber = %i; SMID = %d\n", i, h_sm_22[i]);
}
cudaDeviceReset();
return 0;
}
KERNEL TIMINGS FOR N = 100 AND N = 10000

N = 100
kernel_1    0.003 ms
kernel_2    0.005 ms

N = 10000
kernel_1    0.011 ms
kernel_2    0.053 ms

Accordingly, kernel 2 is more computationally demanding than kernel 1.
RESULTS FOR N = 100

In this case, each kernel is launched with one block only, and this is the timeline:

As you can see, overlap occurs. By looking at the above results, the scheduler delivers the single blocks of the two calls to kernel 1 in parallel to the two available SMs and then does the same for kernel 2. This seems to be the main reason why overlap occurs.
RESULTS FOR N = 10000
Kernel 1: gridSize = 14; blockSize = 768
Kernel 2: gridSize = 10; blockSize = 1024
Kernel 1 - Data 1: blockNumber = 0; SMID = 0
Kernel 1 - Data 2: blockNumber = 0; SMID = 1
Kernel 1 - Data 1: blockNumber = 1; SMID = 1
Kernel 1 - Data 2: blockNumber = 1; SMID = 0
Kernel 1 - Data 1: blockNumber = 2; SMID = 0
Kernel 1 - Data 2: blockNumber = 2; SMID = 1
Kernel 1 - Data 1: blockNumber = 3; SMID = 1
Kernel 1 - Data 2: blockNumber = 3; SMID = 0
Kernel 1 - Data 1: blockNumber = 4; SMID = 0
Kernel 1 - Data 2: blockNumber = 4; SMID = 1
Kernel 1 - Data 1: blockNumber = 5; SMID = 1
Kernel 1 - Data 2: blockNumber = 5; SMID = 0
Kernel 1 - Data 1: blockNumber = 6; SMID = 0
Kernel 1 - Data 2: blockNumber = 6; SMID = 0
Kernel 1 - Data 1: blockNumber = 7; SMID = 1
Kernel 1 - Data 2: blockNumber = 7; SMID = 1
Kernel 1 - Data 1: blockNumber = 8; SMID = 0
Kernel 1 - Data 2: blockNumber = 8; SMID = 1
Kernel 1 - Data 1: blockNumber = 9; SMID = 1
Kernel 1 - Data 2: blockNumber = 9; SMID = 0
Kernel 1 - Data 1: blockNumber = 10; SMID = 0
Kernel 1 - Data 2: blockNumber = 10; SMID = 0
Kernel 1 - Data 1: blockNumber = 11; SMID = 1
Kernel 1 - Data 2: blockNumber = 11; SMID = 1
Kernel 1 - Data 1: blockNumber = 12; SMID = 0
Kernel 1 - Data 2: blockNumber = 12; SMID = 1
Kernel 1 - Data 1: blockNumber = 13; SMID = 1
Kernel 1 - Data 2: blockNumber = 13; SMID = 0
Kernel 2 - Data 1: blockNumber = 0; SMID = 0
Kernel 2 - Data 2: blockNumber = 0; SMID = 0
Kernel 2 - Data 1: blockNumber = 1; SMID = 1
Kernel 2 - Data 2: blockNumber = 1; SMID = 1
Kernel 2 - Data 1: blockNumber = 2; SMID = 1
Kernel 2 - Data 2: blockNumber = 2; SMID = 0
Kernel 2 - Data 1: blockNumber = 3; SMID = 0
Kernel 2 - Data 2: blockNumber = 3; SMID = 1
Kernel 2 - Data 1: blockNumber = 4; SMID = 1
Kernel 2 - Data 2: blockNumber = 4; SMID = 0
Kernel 2 - Data 1: blockNumber = 5; SMID = 0
Kernel 2 - Data 2: blockNumber = 5; SMID = 1
Kernel 2 - Data 1: blockNumber = 6; SMID = 1
Kernel 2 - Data 2: blockNumber = 6; SMID = 0
Kernel 2 - Data 1: blockNumber = 7; SMID = 0
Kernel 2 - Data 2: blockNumber = 7; SMID = 1
Kernel 2 - Data 1: blockNumber = 8; SMID = 1
Kernel 2 - Data 2: blockNumber = 8; SMID = 0
Kernel 2 - Data 1: blockNumber = 9; SMID = 0
Kernel 2 - Data 2: blockNumber = 9; SMID = 1
This is the timeline:

In this case, no overlap occurs. According to the above results, this does not mean that the two SMs are not simultaneously exploited, but (I believe) that, due to the larger number of blocks to be launched, assigning two blocks of different kernels or two blocks of the same kernel makes little difference in terms of performance, and thus the scheduler chooses the second option.

I have verified that the behavior stays the same when more work is done per thread.
The current code example launches more than 2^19 warps. kernel_diva and kernel_jokea perform very little processing. Devices of compute capability < 3.5 will dispatch all of the work from the first kernel before dispatching the second. Given the short processing time, you are likely not seeing any overlap. If you reduce the gridDim to (1,1,1) and increase the work per thread by 1000x (just add a for loop), do you see overlap between the two kernels? Your kernel performance will likely improve greatly if each thread processes multiple data elements, reducing the launch and index-calculation overhead. –
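(A minimal sketch of the experiment suggested in this comment; the kernel name heavy_kernel and the 1000-iteration inner loop are illustrative assumptions, not code from the question:)

__global__ void heavy_kernel(float * __restrict__ data, const float value, int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N) {
        float d = data[i];
        // --- Inflate the per-thread work ~1000x so the kernel stays resident
        //     long enough for a kernel launched in another stream to overlap
        for (int k = 0; k < 1000; k++) d = d / value + 1.0f;
        data[i] = d;
    }
}

// --- Launched with a single block per stream, e.g.:
// heavy_kernel<<<1, 256, 0, stream1>>>(d_data1, value, N);
// heavy_kernel<<<1, 256, 0, stream2>>>(d_data2, value, N);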
Thanks for your comment. Up to now, I believed that one thread should take care of only one array slot. I have recently found some notes breaking that assumption, including http://llpanorama.wordpress.com/2008/06/11/threads-and-blocks-and-grids-oh-my/. I will look carefully into them and come back here if I obtain significant results. Thank you again! –
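(For reference, the pattern those notes describe, one thread covering several array slots through a grid-stride loop, is the same one appearing in the commented-out kernels in the answer above; a minimal sketch:)

__global__ void kernel_1_strided(float * __restrict__ data, const float value, int N)
{
    // --- Grid-stride loop: each thread steps through the array in increments of
    //     the total number of launched threads, so any grid size covers all N elements
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x)
        data[i] = data[i] / value;
}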
The wordpress article contains a few inaccurate items. If you want a lower-level understanding of the GPU, I would recommend watching the GTC 2013 session Performance Optimization: Programming Guidelines and GPU Architecture Details Behind Them [vid](http://nvidia.fullviewmedia.com/gtc2013/0321-210H-S3466.html) [pdf](http://on-demand.gputechconf.com/gtc/2013/presentations/S3466-Programming-Guidelines-GPU-Architecture.pdf) –