2016-06-15 106 views
-3

我試圖用 cudaStream 開發一個 Sobel 濾波的例子(問題標題:cudaStream 的奇怪行爲)。程序如下:

/*
 * SobelStream: timing experiment for a multi-stream Sobel filter.
 *
 * The image is treated as NB_STREAM horizontal bands of blockh = HEIGHT/NB_STREAM
 * rows. Work is issued in three passes over the streams (all uploads, then all
 * kernels, then all downloads), each band on its own stream:
 *   1. host -> device upload of the band into the CUDA array bound to the
 *      texture `Image`;
 *   2. a SobelKernel launch for the band;
 *   3. device -> host download of the band's output rows.
 * All streams are then synchronized and the elapsed time between two events is
 * printed as "dt_ms".
 *
 * Takes no parameters and returns nothing; the filtered image is left in
 * u8_ptDataOutHost and is not consumed by this function.
 *
 * Names used here but not declared in this function (defined elsewhere):
 * WIDTH, HEIGHT, NB_STREAM, BLOC_X, BLOC_Y, Stream, Array_PatchsMaxDevice,
 * Image, ClearKernel, SobelKernel, checkCudaErrors.
 *
 * NOTE(review): nothing allocated or created here (host/device buffers, CUDA
 * array, texture binding, streams, events) is released before returning.
 * NOTE(review): assumes the loaded image is exactly WIDTH x HEIGHT, 1 byte per
 * pixel, with rows stored contiguously — the result of imread is not checked.
 */
void SobelStream(void) 
{ 

    // Flag 0 requests a single-channel (grayscale) load in OpenCV.
    // NOTE(review): no check that the load succeeded (data could be NULL).
    cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0); 


    // NOTE(review): u8_PtImageHost / u8_PtImageDevice are allocated below but
    // only referenced from commented-out code in this function.
    u_int8_t *u8_PtImageHost; 
    u_int8_t *u8_PtImageDevice; 

    // Output image: device buffer written by the kernels, host buffer that
    // receives the downloaded rows.
    u_int8_t *u8_ptDataOutHost; 
    u_int8_t *u8_ptDataOutDevice; 

    // Per-stream flag; written in the loops below but never read in this function.
    u_int8_t u8_Used[NB_STREAM]; 

    // Plain malloc gives pageable host memory; this buffer is the destination of
    // the cudaMemcpyAsync downloads further down.
    u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t)); 
    checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t))); 

    u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t)); 
    checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t))); 


    // Allocate a WIDTH x HEIGHT unsigned-char CUDA array and bind it to the
    // texture reference `Image` (presumably what SobelKernel samples — the
    // kernel body is not visible here).
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>(); 
    checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT)); 
    checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice)); 


    // Full-image launch configuration (ceil-divided so the grid covers the image).
    dim3 threads(BLOC_X,BLOC_Y); 
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y)); 

    // Launched on the default stream, before timing starts.
    // NOTE(review): no cudaGetLastError() check after this launch.
    ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT); 


    // Rows per band (integer division: any remainder rows are handled only via
    // the min() clamp on localHeight below).
    int blockh = HEIGHT/NB_STREAM; 


    // `Stream` is not declared in this function; it is (re)assigned here.
    Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t)); 

    for (int i = 0; i < NB_STREAM; i++) 
    { 
     checkCudaErrors(cudaStreamCreate(&(Stream[i]))); 
    } 

// for(int i=0;i<NB_STREAM;i++) 
// { 
//  cudaSetDevice(0); 
//  cudaStreamCreate(&Stream[i]); 
// } 


    // Timing events. NOTE(review): return codes of the event calls are ignored.
    cudaEvent_t Start; 
    cudaEvent_t Stop; 
    cudaEventCreate(&Start); 
    cudaEventCreate(&Stop); 

    // Start is recorded on stream 0 (the default stream).
    cudaEventRecord(Start, 0); 


    ////////////////////////////////////////////////////////// 
    // Pass 1: upload each band into the CUDA array on its own stream.
    for(int i=0;i<NB_STREAM;i++) 
    { 
     if(i == 0) 
     { 
      int localHeight = blockh; 
      // Arguments: dst array, x offset (bytes), y offset (rows), src, source
      // pitch (bytes), width (bytes), height (rows), kind, stream.
      // The source is the cv::Mat's own buffer (presumably pageable memory).
      checkCudaErrors(cudaMemcpy2DToArrayAsync(Array_PatchsMaxDevice, 
                 0, 
                 0, 
                 imageGrayL2.data,//u8_PtImageDevice, 
                 WIDTH, 
                 WIDTH, 
                 blockh, 
                 cudaMemcpyHostToDevice , 
                 Stream[i])); 

      // Band 0 also gets a kernel launch here, with a grid sized to the band.
      // NOTE(review): pass 2 launches SobelKernel for band 0 again with a
      // single block and the same arguments — looks like a leftover; confirm
      // which launch is intended.
      dim3 threads(BLOC_X,BLOC_Y); 
      dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHeight/BLOC_Y)); 
      SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1); 
      checkCudaErrors(cudaGetLastError()); 

      u8_Used[i] = 1; 

     }else{ 


      // Elements per band in the source image; multiplied by i below to get
      // the band's starting offset.
      int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM ); 
      // First row of band i in the destination array.
      int hoffset = HEIGHT/NB_STREAM *i; 
      // Equals blockh*i - 1; unused in this pass.
      int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1); 
      // Clamp the band height to the rows remaining in the image.
      int localHeight = min(HEIGHT - (blockh*i),blockh); 

      //printf("hoffset: %d hoffsetkernel %d localHeight %d rest %d ioffsetImage %d \n",hoffset,hoffsetkernel,localHeight,HEIGHT - (blockh +1 +blockh*(i-1)),ioffsetImage*i/WIDTH); 

      checkCudaErrors(cudaMemcpy2DToArrayAsync(Array_PatchsMaxDevice, 
                 0, 
                 hoffset, 
                 &imageGrayL2.data[ioffsetImage*i],//&u8_PtImageDevice[ioffset*i], 
          WIDTH, 
          WIDTH, 
          localHeight, 
          cudaMemcpyHostToDevice , 
          Stream[i])); 


      u8_Used[i] = 1; 
      // Simplifies to HEIGHT - blockh*i - 1 <= 0: stop once band i starts at
      // or beyond the last image row.
      if(HEIGHT - (blockh +1 +blockh*(i-1))<=0) 
      { 
       break; 
      } 
     } 
    } 



    /////////////////////////////////////////// 
    // Pass 2: launch SobelKernel for each band on the band's stream, using a
    // single BLOC_X x BLOC_Y block per launch.
    // NOTE(review): band i starts at row blockh*i - 1, a row uploaded by stream
    // i-1; no event/stream dependency is set up between streams here — confirm
    // whether the kernel can read rows that another stream is still uploading.
    for(int i=0;i<NB_STREAM;i++) 
    { 
     if(i == 0) 
     { 
      int localHeight = blockh; 


      dim3 threads(BLOC_X,BLOC_Y); 
      dim3 blocks(1,1); 
      SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1); 
      checkCudaErrors(cudaGetLastError()); 

      u8_Used[i] = 1; 

     }else{ 


      // ioffsetImage and hoffset are computed but unused in this pass.
      int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM ); 
      int hoffset = HEIGHT/NB_STREAM *i; 
      // First output row handled for band i: blockh*i - 1.
      int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1); 
      int localHeight = min(HEIGHT - (blockh*i),blockh); 


      dim3 threads(BLOC_X,BLOC_Y); 
      dim3 blocks(1,1); 

      SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHeight); 
      checkCudaErrors(cudaGetLastError()); 

      u8_Used[i] = 1; 
      // Same termination test as in pass 1.
      if(HEIGHT - (blockh +1 +blockh*(i-1))<=0) 
      { 
       break; 
      } 
     } 
    } 


    /////////////////////////////////////////////////////// 
    // Pass 3: download each band's output rows on the band's stream.
    // The host destination comes from malloc (pageable memory).
    for(int i=0;i<NB_STREAM;i++) 
    { 
     if(i == 0) 
     { 
      int localHeight = blockh; 
      // Band 0 copies rows 0 .. blockh-2 (localHeight-1 rows).
      checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHeight-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i])); 
      u8_Used[i] = 1; 

     }else{ 

      // ioffsetImage and hoffset are computed but unused in this pass.
      int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM ); 
      int hoffset = HEIGHT/NB_STREAM *i; 
      int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1); 
      int localHeight = min(HEIGHT - (blockh*i),blockh); 

      // Band i copies localHeight rows starting at row blockh*i - 1, i.e.
      // directly after the rows copied for the previous band.
      checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHeight*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i])); 

      u8_Used[i] = 1; 
      // Same termination test as in pass 1.
      if(HEIGHT - (blockh +1 +blockh*(i-1))<=0) 
      { 
       break; 
      } 
     } 
    } 


    // Wait for every stream to drain before stopping the timer.
    // NOTE(review): return codes are ignored here and for the event calls below.
    for(int i=0;i<NB_STREAM;i++) 
    { 
     cudaStreamSynchronize(Stream[i]); 
    } 

    // Stop is recorded on stream 0 (the default stream).
    cudaEventRecord(Stop, 0); 

    cudaEventSynchronize(Start); 
    cudaEventSynchronize(Stop); 


    // Elapsed time between the two events, in milliseconds.
    float dt_ms; 
    cudaEventElapsedTime(&dt_ms, Start, Stop); 

    printf("dt_ms %f \n",dt_ms); 

} 

我在執行程序時遇到了非常奇怪的行爲。我決定對這個例子進行性能分析(profiling),得到的結果如下:

enter image description here

我不明白,似乎每個流都在等待對方。 有人可以幫助我嗎?

+1

問題具體是什麼?哪一部分不符合預期? – kangshiyin

回答

2

首先,今後提問時請提供完整的代碼。我還參考了您在此處(here)的交叉發帖,以補充一些細節,例如內核的大小。

你有兩個問題需要解決:

首先,任何時候要使用 cudaMemcpyAsync,您很可能都希望使用固定(pinned,即頁鎖定)的主機內存分配。如果使用由 malloc 創建的分配,那麼就異步併發執行而言,cudaMemcpyAsync 不會表現出您預期的行爲。編程指南(programming guide)中指出:

如果副本中涉及到主機內存,它必須被頁面鎖定。

因此,對你的代碼要做的第一個改動,是把下面的代碼:

u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t)); 
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t)); 

改成這樣:

checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault)); 
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault)); 

僅憑這一處改動,根據我的測試,執行時間就從約 21ms 下降到約 7ms。原因在於,如果不做這個改動,各項操作之間沒有任何重疊:

enter image description here

做了這個改動之後,拷貝操作之間(H->D 與 D->H)可以相互重疊,也可以與內核執行重疊:

enter image description here

要實現內核的併發執行,你面臨的第二個問題是:你的內核太大了(塊/線程太多):

// Image dimensions used by the code under discussion.
#define WIDTH 6400 
#define HEIGHT 4800 
// Number of CUDA streams (and horizontal image bands).
#define NB_STREAM 10 

// Thread-block dimensions: 32 x 32 = 1024 threads per block.
#define BLOC_X 32 
#define BLOC_Y 32 

    dim3 threads(BLOC_X,BLOC_Y); 
    // Ceil-divide the image by the block size: 6400/32 = 200 by 4800/32 = 150,
    // i.e. 30000 blocks for a full-image launch.
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y)); 

我的建議是:如果這就是你需要運行的內核規模,那麼努力爭取內核重疊可能沒有多大好處——每個內核啓動的塊已經足以「填滿」GPU,也就是說你已經暴露了足夠的並行性來讓 GPU 保持繁忙。但如果你確實非常想親眼看到內核併發,可以讓每個內核使用更少的塊,同時讓每個內核的執行時間更長。我們可以只啓動 1 個塊,讓這個塊中的線程完成整個圖像濾波來做到這一點。