
Strange behavior of a __shared__ variable in CUDA

Consider the following code, which runs in about 9 µs on a K20:

__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 

    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ unsigned int partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 

    /**Trying to avoid loops thats why this code */ 
    localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ; 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    partHist[ tx ] = 0; 

    int k = 0 ; 
    for (int i = 0 ; i < 256 ; i ++) { 
     k++; 
    } 

} 

Now the following code, where the loop accesses a shared variable, takes 72 µs:

__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 

    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ unsigned int partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 

    /**Trying to avoid loops thats why this code */ 
    localDispersedHistogram [ tx ] [ tx % 32 ] = 0 ; 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    partHist[ tx ] = 0; 




    for (int i = 0 ; i < 256 ; i ++) { 
     partHist[ tx ]++; 
    } 

} 

Why does the shared-memory access make such a huge difference? I understand that shared-memory access is more expensive than register access, but if you look at the code above, the line

TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

also uses a shared variable. How come the manipulation of localDispersedHistogram takes so little time, while only the partHist accesses are taking this crazy amount of time?

Help.

Update: My apologies.

My kernel configuration is <<< 256, 256 >>>.

Full code:

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <stdint.h> 
#include <conio.h> 


#define SIZE_OF_OUTPUT_ARRAY 256 * 256 * 256 
#define SIZE_OF_INPUT_ARRAY 256 * 256 

#define TURN_ON(DATA,POSITION) (DATA|=(1<<(POSITION))) 


__global__ void histogram(unsigned char *inputPointer, int *outputPointer) 
{ 
    #if 1 
    __shared__ unsigned char localDispersedHistogram[ 256 ] [ 32 ]; 
    __shared__ long long partHist[ 256 ] ; 

    int i ; 
    int tx = threadIdx.x; 
    int pixelOffset = (blockIdx.x * blockDim.x) + threadIdx.x; 
    uint8_t val = inputPointer[ pixelOffset ]; 

    uint8_t data = val/ 8 ; 
    uint8_t position = val % 8 ; 


    for (int j = 0 ; j < 32 ; j++) { 
     localDispersedHistogram[ tx ] [ j ] = 0; 
    } 

    __syncthreads(); 

    TURN_ON(localDispersedHistogram [ tx ] [ data ] , position); 

    __syncthreads(); 

    //partHist[ tx ] = 0; 

    int sum = 0 ; 


    for (int i = 0 ; i < 256 ; i ++) { 
     sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & (1 << (tx % 8))) >> (tx % 8) ; 
    } 
    partHist[ tx ] = sum; 

    atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

    #endif 

} 



int main() 
{ 

    #if 1 
    printf(" Code Name, Sad buddy 17 "); 

    unsigned char *inputPointer = (unsigned char *) malloc (SIZE_OF_INPUT_ARRAY); 

    for (int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i ++) { 

     int t = rand() % 256 ; 
     //int t = 0; 
     inputPointer [ i ] = t; 

    } 

    unsigned char *device_inputPointer; 
    int *device_outputPointer; 

    cudaMalloc((void**)&device_inputPointer, SIZE_OF_INPUT_ARRAY); 
    cudaMemcpy(device_inputPointer, inputPointer , SIZE_OF_INPUT_ARRAY, cudaMemcpyHostToDevice); 
    cudaMalloc((void**)&device_outputPointer, 256 * sizeof (int)); 
    cudaMemset(device_outputPointer,0,256 * sizeof (int)); 

    histogram <<< 256 , 256 >>> (device_inputPointer , device_outputPointer ); 


    unsigned int *output = (unsigned int *)malloc (256 * sizeof(int)); 

    cudaMemcpy(output, device_outputPointer , 256 * sizeof(int), cudaMemcpyDeviceToHost); 





    unsigned int CPUHist [ 256 ] ; 
    unsigned int GPUHist [ 256 ] ; 

    for (int i = 0 ; i < 256 ;i ++) { 
     CPUHist[ i ] = 0; 
     GPUHist [ i ] = 0; 

     //printf(" %d " , inputPointer[ i ]); 
    } 


    for (int i = 0 ; i < SIZE_OF_INPUT_ARRAY ; i++) { 
     CPUHist[ inputPointer [ i ] ] ++; 
    } 





    int flag = 0 ; 
    for (int i = 0 ; i < 256 ;i ++) { 
     printf(" %d GPUHist %d CPUHist\n" , output[ i ] , CPUHist[i]); 

     if (output[ i ] != CPUHist[i] ) { 
      flag = 1 ; 
     } 
    } 

    printf("\n\n======================\n\n"); 

    if (flag) { 
     printf("TEST CASE FAIL "); 
    } 
    else { 
     printf("TEST CASE Pass"); 
    } 

    printf("\n\n======================\n\n"); 


    cudaDeviceReset(); 

    #endif 
    getch(); 
    return 0; 
} 

How have you set up your thread blocks? Maybe it's a problem with bank conflicts? – hubs


Did you take the timing measurement only once, or did you run the same benchmark several times and average the results? –


What is "TURN_ON" and what does it do? – talonmies

Answer


Since you haven't actually posted both complete cases for comparison, I am inferring your two cases from your first posting and your update.

When you have code like this:

int sum = 0 ; 



int k = 0 ; 
for (int i = 0 ; i < 256 ; i ++) { 
    k++; 
} 

partHist[ tx ] = sum; 

atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

(or even if your k variable is replaced by sum, it doesn't matter) the compiler can figure out what partHist[tx] will always end up being without actually running any of the preceding code. It can therefore optimize that code away (i.e., delete it), still get the same result, and that is what it does. Execution is consequently very short, and you get the ~9 µs timing result.
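If you want the loop in the first version to survive purely for timing experiments, one common trick is to stop it from being constant-foldable, for example by qualifying the counter as volatile and writing the result out. The sketch below is only an illustration of that idea (the kernel name and the reduced parameter list are mine, not part of the original code):

__global__ void dummy_loop_kept(int *outputPointer)
{
    int tx = threadIdx.x;

    // 'volatile' forces every update of k to actually be performed,
    // so the compiler cannot fold the loop down to k = 256.
    volatile int k = 0;

    for (int i = 0; i < 256; i++) {
        k++;
    }

    // Writing k to global memory makes the result observable outside the
    // kernel, so the loop cannot be removed as dead code either.
    outputPointer[tx] = k;
}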

On the other hand, when your code is as posted:

int sum = 0 ; 


for (int i = 0 ; i < 256 ; i ++) { 
    sum += (localDispersedHistogram [ i ] [ tx/ 8 ] & (1 << (tx % 8))) >> (tx % 8) ; 
} 
partHist[ tx ] = sum; 

atomicAdd(&outputPointer[ tx ] , partHist[ tx ] ); 

then this code depends on the preceding code to determine its result, and the compiler cannot optimize it away.

You can probably get additional confirmation of this either by compiling with optimizations disabled (nvcc -G ...) or by using cuobjdump -sass mycode to dump the generated assembly in each case; you will find major differences in the kernel code at the assembly level as a result of compiler optimization.
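For example (the file and executable names here are only placeholders, and sm_35 simply matches the K20 mentioned in the question), you could dump and compare the optimized and the -G builds side by side:

nvcc -arch=sm_35 -o hist_opt histogram.cu        # normal, optimized build
nvcc -G -arch=sm_35 -o hist_dbg histogram.cu     # device-debug build, optimizations off
cuobjdump -sass hist_opt > hist_opt.sass
cuobjdump -sass hist_dbg > hist_dbg.sass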

Any time a relatively small change to the code produces a huge change in execution time, we should always suspect a side effect of compiler optimization.