
Weird behavior of a __shared__ variable in CUDA

Consider the following code, which runs in 9us on a K20:

__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 

    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ unsigned int partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 

    /** Trying to avoid loops, that's why this code */ 
    localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ; 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    partHist[ tx ] = 0; 

    int k = 0 ; 
    for (int i = 0 ; i < 256 ; i ++) { 
     k++; 
    } 

} 

Now the following code, with the shared-variable access, takes 72us:

__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 

    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ unsigned int partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 

    /** Trying to avoid loops, that's why this code */ 
    localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ; 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    partHist[ tx ] = 0; 




    for (int i = 0 ; i < 256 ; i ++) { 
     partHist[ tx ]++; 
    } 

} 

Why do the shared accesses make such a huge difference? I understand that shared-memory access is costlier than register access, but if you look at the code above, the line

TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

also uses a shared variable. How come the manipulation of localDispersedHistogram takes less time, and only the partHist access is taking an insane amount of time?

Help.

Update: My apologies:

My kernel configuration is <<< 256, 256 >>>

Full code:

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <stdint.h> 
#include <conio.h> 


#define SIZE_OF_OUTPUT_ARRAY 256 * 256 * 256 
#define SIZE_OF_INPUT_ARRAY 256 * 256 

#define TURN_ON(DATA,POSITION) (DATA|=(1<<(POSITION))) 


__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 
    #if 1 
    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ long long partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 


    for (int j = 0 ; j < 32 ; j++) { 
     localDispersedHistogram[ tx ] [ j ] = 0; 
    } 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    //partHist[ tx ] = 0; 

    int sum = 0 ; 


    for (int i = 0 ; i < 256 ; i ++) { 
     sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & (1 << (tx % 8))) >> (tx % 8) ; 
    } 
    partHist[ tx ] = sum; 

    atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

    #endif 

} 



int main() 
{ 

    #if 1 
    printf(" Code Name, Sad buddy 17 "); 

    unsigned char *inputPointer = (unsigned char *) malloc (SIZE_OF_INPUT_ARRAY); 

    for (int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i ++) { 

     int t = rand() % 256 ; 
     //int t = 0; 
     inputPointer [ i ] = t; 

    } 

    unsigned char *device_inputPointer; 
    int *device_outputPointer; 

    cudaMalloc((void**)&device_inputPointer, SIZE_OF_INPUT_ARRAY); 
    cudaMemcpy(device_inputPointer, inputPointer , SIZE_OF_INPUT_ARRAY, cudaMemcpyHostToDevice); 
    cudaMalloc((void**)&device_outputPointer, 256 * sizeof (int)); 
    cudaMemset(device_outputPointer,0,256 * sizeof (int)); 

    histogram <<< 256 , 256 >>> (device_inputPointer , device_outputPointer ); 


    unsigned int *output = (unsigned int *)malloc (256 * sizeof(int)); 

    cudaMemcpy(output, device_outputPointer , 256 * sizeof(int), cudaMemcpyDeviceToHost); 





    unsigned int CPUHist [ 256 ] ; 
    unsigned int GPUHist [ 256 ] ; 

    for (int i = 0 ; i < 256 ;i ++) { 
     CPUHist[ i ] = 0; 
     GPUHist [ i ] = 0; 

     //printf(" %d " , inputPointer[ i ]); 
    } 


    for (int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i++) { 
     CPUHist[ inputPointer [ i ] ] ++; 
    } 





    int flag = 0 ; 
    for (int i = 0 ; i < 256 ;i ++) { 
     printf(" %d GPUHist %d CPUHist\n" , output[ i ] , CPUHist[i]); 

     if (output[ i ] != CPUHist[i] ) { 
      flag = 1 ; 
     } 
    } 

    printf("\n\n======================\n\n"); 

    if (flag) { 
     printf("TEST CASE FAIL "); 
    } 
    else { 
     printf("TEST CASE Pass"); 
    } 

    printf("\n\n======================\n\n"); 


    cudaDeviceReset(); 

    #endif 
    getch(); 
    return 0; 
} 

How did you set up your thread blocks? Maybe it's a problem with bank conflicts? – hubs


Do you take the timing from just one run, or do you repeat the same benchmark several times and take the average? –


What is `TURN_ON`, and what does it do? – talonmies

Answer


Since you haven't actually posted both complete cases for comparison, I'm inferring your two cases from your first posting and your update.

When you have code like this:

int sum = 0 ; 



int k = 0 ; 
for (int i = 0 ; i < 256 ; i ++) { 
    k++; 
} 

partHist[ tx ] = sum; 

atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

(or even if your k variable is replaced by sum, it doesn't matter) the compiler can figure out what partHist[tx] will always end up being without actually running any of the preceding code. So it can optimize the preceding code away (i.e. delete it) and still get the same result, and that is what it does. The code that actually executes is therefore very short, and you get the ~9us timing result.

On the other hand, when your code is as posted:

int sum = 0 ; 


for (int i = 0 ; i < 256 ; i ++) { 
    sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & (1 << (tx % 8))) >> (tx % 8) ; 
} 
partHist[ tx ] = sum; 

atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

then this code depends on the preceding code to determine its result, and the compiler cannot optimize it away.
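
The same effect can be reproduced in isolation with a pair of toy kernels. The following is only an illustrative sketch (the kernel names and parameters are mine, not taken from your code): in the first kernel nothing the loop computes is observable outside the kernel, so the compiler is free to delete the loop entirely; in the second, the result is stored and depends on run-time data, so the 256 loads and adds actually have to be executed.

__global__ void loop_is_dead(int *out)
{
    int k = 0;
    // k is never used afterwards, so the compiler removes this loop as dead code
    for (int i = 0; i < 256; i++) {
        k++;
    }
    out[threadIdx.x] = 0;   // does not depend on the loop above
}

__global__ void loop_is_live(const int *in, int *out)
{
    int k = 0;
    // the sum depends on run-time data and is stored below,
    // so the loop cannot be optimized away
    for (int i = 0; i < 256; i++) {
        k += in[i];
    }
    out[threadIdx.x] = k;
}

Note that merely storing the counter (out[threadIdx.x] = k; with k++ in the loop) is still not enough to force the work: the trip count is a compile-time constant, so the compiler can fold the loop into a single store of 256. Only a loop whose result depends on run-time values, like your localDispersedHistogram reduction, keeps the full cost.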

You can probably get additional confirmation of this by compiling without optimization (nvcc -G ...), or by using cuobjdump -sass mycode to dump the assembly generated in each case; you will find major differences in the kernel code at the assembly level, due to the compiler optimizations.

Whenever a relatively small change to the code produces a huge change in execution time, we should always suspect side effects of compiler optimization.
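
As one of the comments suggests, it is also worth timing each version over several launches and averaging, rather than trusting a single measurement. Here is a minimal sketch using CUDA events, assuming the histogram kernel and device pointers from your posted code (the run count is arbitrary):

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

const int runs = 10;
float totalMs = 0.0f;

for (int r = 0; r < runs; r++) {
    cudaEventRecord(start);
    histogram <<< 256, 256 >>> (device_inputPointer, device_outputPointer);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);        // wait for this launch to finish

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    totalMs += ms;
}

printf("average kernel time: %f ms\n", totalMs / runs);

cudaEventDestroy(start);
cudaEventDestroy(stop);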