CUDA中的粒子細胞計數（一維和二維直方圖生成）

該__global__函數用於遞增數並計算某些細胞中有多少個粒子。CUDA中的粒子細胞計數（一維和二維直方圖生成）

__global__ void Set_Nc_GPU_0831(int *nc,int *index,SP DSMC) 
{ 
    int tidx; 
    tidx=threadIdx.x+blockDim.x*blockIdx.x; 

    atomicAdd(&nc[index[tidx]],1); 
}

我正在使用慢的原子操作。所以我想用一些其他函數或算法替換原子函數。

有沒有其他方法可以修改這個簡單的__global__函數？

來源

2012-08-31 user1638280

的典型解決方案是使用部分陣列的共享內存，這樣就可以做到快速積累沒有原子操作，然後在最後你可以總結所有這些部分陣列。當然，這裏假定'nc'將會適合共享內存，但是你沒有告訴我們關於你的數據結構的大小。 –

我認爲審查並行減少將有助於你。（包含代碼片段） - http://people.maths.ox.ac.uk/gilesm/cuda/lecs/lec4.pdf – Sayan

這是一個較晚的答案，用於從未答覆的列表中刪除此問題。

您已經認識到，計算在特定單元格內有多少個粒子落入等同於構建直方圖。直方圖的構建是一個深入研究的問題。 Shane Cook的書（CUDA編程）包含了關於這個主題的一個很好的討論。此外，CUDA示例包含一個直方圖示例。此外，histogram construction by CUDA Thrust也是可能的。最後，CUDA Programming Blog包含更多的見解。

在下面，我提供了一個代碼，比較5個不同版本的直方圖計算：

CPU; GPU與原子（基本上你的方法）;
GPU與原子共享內存與部分直方圖的最終總和（基本上，由Paul R提出的方法）;
GPU使用CUDA Thrust。

如果您運行數據的10MB的對開普勒K20C的典型案例的代碼，你會得到以下計時：

CPU = 83ms;
GPU with atomics = 15.8ms;
GPU與原子共享內存= 17.7ms;
GPU by CUDA Thrust = 40ms。

正如你所看到的，令人驚訝的是，你的「蠻力」解決方案是最快的。由於最新架構（您的帖子日期爲2012年8月，開普勒尚未發佈，至少在歐洲發佈），這可能是合理的，原子操作速度非常快。

下面是代碼：

#include <thrust/device_vector.h> 
#include <thrust/sort.h> 
#include <thrust/generate.h> 
#include <thrust/adjacent_difference.h> 
#include <thrust/binary_search.h> 

#define SIZE (100*1024*1024) // 100 MB 

/**********************************************/ 
/* FUNCTION TO GENERATE RANDOM UNSIGNED CHARS */ 
/**********************************************/ 
unsigned char* big_random_block(int size) { 
    unsigned char *data = (unsigned char*)malloc(size); 
    for (int i=0; i<size; i++) 
     data[i] = rand(); 
    return data; 
} 

/***************************************/ 
/* GPU HISTOGRAM CALCULATION VERSION 1 */ 
/***************************************/ 
__global__ void histo_kernel1(unsigned char *buffer, long size, unsigned int *histo) { 

    // --- The number of threads does not cover all the data size 
    int i = threadIdx.x + blockIdx.x * blockDim.x; 
    int stride = blockDim.x * gridDim.x; 
    while (i < size) { 
     atomicAdd(&histo[buffer[i]], 1); 
     i += stride; 
    } 
} 

/***************************************/ 
/* GPU HISTOGRAM CALCULATION VERSION 2 */ 
/***************************************/ 
__global__ void histo_kernel2(unsigned char *buffer, long size, unsigned int *histo) { 

    // --- Allocating and initializing shared memory to store partial histograms 
    __shared__ unsigned int temp[256]; 
    temp[threadIdx.x] = 0; 
    __syncthreads(); 

    // --- The number of threads does not cover all the data size 
    int i = threadIdx.x + blockIdx.x * blockDim.x; 
    int offset = blockDim.x * gridDim.x; 
    while (i < size) 
    { 
     atomicAdd(&temp[buffer[i]], 1); 
     i += offset; 
    } 
    __syncthreads(); 

    // --- Summing histograms 
    atomicAdd(&(histo[threadIdx.x]), temp[threadIdx.x]); 
} 

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) exit(code); 
    } 
} 

/********/ 
/* MAIN */ 
/********/ 
void main(){ 

    // --- Generating an array of SIZE unsigned chars 
    unsigned char *buffer = (unsigned char*)big_random_block(SIZE); 

    /********************/ 
    /* CPU COMPUTATIONS */ 
    /********************/ 

    // --- Allocating host memory space and initializing the host-side histogram 
    unsigned int histo[256]; 
    for (int i=0; i<256; i++) histo [i] = 0; 

    clock_t start_CPU, stop_CPU; 

    // --- Histogram calculation on the host 
    start_CPU = clock(); 
    for (int i=0; i<SIZE; i++) histo [buffer[i]]++; 
    stop_CPU = clock(); 
    float elapsedTime = (float)(stop_CPU - start_CPU)/(float)CLOCKS_PER_SEC * 1000.0f; 
    printf("Time to generate (CPU): %3.1f ms\n", elapsedTime); 

    // --- Indirect check of the result 
    long histoCount = 0; 
    for (int i=0; i<256; i++) { histoCount += histo[i]; } 
    printf("Histogram Sum: %ld\n", histoCount); 

    /********************/ 
    /* GPU COMPUTATIONS */ 
    /********************/ 

    // --- Initializing the device-side data 
    unsigned char *dev_buffer; 
    gpuErrchk(cudaMalloc((void**)&dev_buffer,SIZE)); 
    gpuErrchk(cudaMemcpy(dev_buffer, buffer, SIZE, cudaMemcpyHostToDevice)); 

    // --- Allocating device memory space for the device-side histogram 
    unsigned int *dev_histo; 
    gpuErrchk(cudaMalloc((void**)&dev_histo,256*sizeof(long))); 

    // --- GPU timing 
    cudaEvent_t start, stop; 
    gpuErrchk(cudaEventCreate(&start)); 
    gpuErrchk(cudaEventCreate(&stop)); 

    // --- ATOMICS 
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing 
    gpuErrchk(cudaEventRecord(start,0)); 
    gpuErrchk(cudaMemset(dev_histo,0,256*sizeof(int))); 
    cudaDeviceProp prop; 
    gpuErrchk(cudaGetDeviceProperties(&prop,0)); 
    int blocks = prop.multiProcessorCount; 
    histo_kernel1<<<blocks*2,256>>>(dev_buffer, SIZE, dev_histo); 

    gpuErrchk(cudaMemcpy(histo,dev_histo,256*sizeof(int),cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaEventRecord(stop,0)); 
    gpuErrchk(cudaEventSynchronize(stop)); 
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop)); 
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime); 

    histoCount = 0; 
    for (int i=0; i<256; i++) { 
     histoCount += histo[i]; 
    } 
    printf("Histogram Sum: %ld\n", histoCount); 

    // --- Check the correctness of the results via the host 
    for (int i=0; i<SIZE; i++) histo[buffer[i]]--; 
    for (int i=0; i<256; i++) { 
     if (histo[i] != 0) printf("Failure at %d! Off by %d\n", i, histo[i]); 
} 

    // --- ATOMICS IN SHARED MEMORY 
    // --- Histogram calculation on the device - 2x the number of multiprocessors gives best timing 
    gpuErrchk(cudaEventRecord(start,0)); 
    gpuErrchk(cudaMemset(dev_histo,0,256*sizeof(int))); 
    gpuErrchk(cudaGetDeviceProperties(&prop,0)); 
    blocks = prop.multiProcessorCount; 
    histo_kernel2<<<blocks*2,256>>>(dev_buffer, SIZE, dev_histo); 

    gpuErrchk(cudaMemcpy(histo,dev_histo,256*sizeof(int),cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaEventRecord(stop,0)); 
    gpuErrchk(cudaEventSynchronize(stop)); 
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop)); 
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime); 

    histoCount = 0; 
    for (int i=0; i<256; i++) { 
     histoCount += histo[i]; 
    } 
    printf("Histogram Sum: %ld\n", histoCount); 

    // --- Check the correctness of the results via the host 
    for (int i=0; i<SIZE; i++) histo[buffer[i]]--; 
    for (int i=0; i<256; i++) { 
     if (histo[i] != 0) printf("Failure at %d! Off by %d\n", i, histo[i]); 
    } 

    // --- CUDA THRUST 

    gpuErrchk(cudaEventRecord(start,0)); 

    // --- Wrapping dev_buffer raw pointer with a device_ptr and initializing a device_vector with it 
    thrust::device_ptr<unsigned char> dev_ptr(dev_buffer); 
    thrust::device_vector<unsigned char> dev_buffer_thrust(dev_ptr, dev_ptr + SIZE); 

    // --- Sorting data to bring equal elements together 
    thrust::sort(dev_buffer_thrust.begin(), dev_buffer_thrust.end()); 

    // - The number of histogram bins is equal to the maximum value plus one 
    int num_bins = dev_buffer_thrust.back() + 1; 

    // --- Resize histogram storage 
    thrust::device_vector<int> d_histogram; 
    d_histogram.resize(num_bins); 

    // --- Find the end of each bin of values 
    thrust::counting_iterator<int> search_begin(0); 
    thrust::upper_bound(dev_buffer_thrust.begin(), dev_buffer_thrust.end(), 
        search_begin, search_begin + num_bins, 
        d_histogram.begin()); 

    // --- Compute the histogram by taking differences of the cumulative histogram 
    thrust::adjacent_difference(d_histogram.begin(), d_histogram.end(), 
          d_histogram.begin()); 

    thrust::host_vector<int> h_histogram(d_histogram); 
    gpuErrchk(cudaEventRecord(stop,0)); 
    gpuErrchk(cudaEventSynchronize(stop)); 
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop)); 
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime); 

    histoCount = 0; 
    for (int i=0; i<256; i++) { 
     histoCount += h_histogram[i]; 
    } 
    printf("Histogram Sum: %ld\n", histoCount); 

    // --- Check the correctness of the results via the host 
    for (int i=0; i<SIZE; i++) h_histogram[buffer[i]]--; 
    for (int i=0; i<256; i++) { 
     if (h_histogram[i] != 0) printf("Failure at %d! Off by %d\n", i, h_histogram[i]); 
    } 

    gpuErrchk(cudaEventDestroy(start)); 
    gpuErrchk(cudaEventDestroy(stop)); 
    gpuErrchk(cudaFree(dev_histo)); 
    gpuErrchk(cudaFree(dev_buffer)); 

    free(buffer); 

    getchar(); 

}

來源

2014-06-01 22:06:59 JackOLantern

：你好，並感謝您的代碼。我想知道你是否可以提供案例（histo_kernel2），我們使用2維的線程和blocks.int y = threadIdx.y + blockIdx.y * blockDim.y; int x = threadIdx.x + blockIdx.x * blockDim.x;。然後，偏移量是？還有，我們將使用temp [globalID]而不是temp [threadIdx.x]？並且在while語句中？ while（globalID George

@George我會盡力爲這個週末提供2D案例。對不起，但現在有點困難:-) – JackOLantern

：當然沒問題！非常感謝您的幫助和時間，真的！ – George

繼喬治的意見，這個答案我與其中的粒子不上線奠定了2D情況進行處理，但在飛機上的一部分。

實際上，2D情況只需要對上面提供的代碼進行微小的修改。唯一要做的是定義粒子的x和y座標以及2D，256 x 256直方圖。

下面的代碼提供了一個完整的工作示例。

#include <thrust/device_vector.h> 
#include <thrust/sort.h> 
#include <thrust/generate.h> 
#include <thrust/adjacent_difference.h> 
#include <thrust/binary_search.h> 

#define SIZE (100*1024*1024) // 100 MB 

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (abort) exit(code); 
    } 
} 

/**********************************************/ 
/* FUNCTION TO GENERATE RANDOM UNSIGNED CHARS */ 
/**********************************************/ 
unsigned char* big_random_block(int size) { 
    unsigned char *data = (unsigned char*)malloc(size); 
    for (int i=0; i<size; i++) 
     data[i] = rand(); 
    return data; 
} 

/********************************/ 
/* GPU HISTOGRAM CALCULATION 2D */ 
/********************************/ 
__global__ void histo_kernel2(unsigned char *dev_x_coord, unsigned char *dev_y_coord, unsigned int *histo, unsigned int size) { 

    // --- The number of threads does not cover all the data size 
    int i = threadIdx.x + blockIdx.x * blockDim.x; 
    int offset = blockDim.x * gridDim.x; 
    while (i < size) 
    { 
     atomicAdd(&histo[dev_y_coord[i] * 256 + dev_x_coord[i]], 1); 
     i += offset; 
    } 
} 

/********/ 
/* MAIN */ 
/********/ 
void main(){ 

    // --- Generating x- and y- coordinates of the particles 
    unsigned char *x_coord = (unsigned char*)big_random_block(SIZE); 
    unsigned char *y_coord = (unsigned char*)big_random_block(SIZE); 

    /********************/ 
    /* CPU COMPUTATIONS */ 
    /********************/ 

    // --- Allocating host memory space and initializing the host-side histogram 
    unsigned int *histo = (unsigned int*)malloc(256 * 256 * sizeof(unsigned int)); 
    for (int i=0; i < 256 * 256; i++) histo [i] = 0; 

    clock_t start_CPU, stop_CPU; 

    // --- Histogram calculation on the host 
    start_CPU = clock(); 
    for (int i=0; i < SIZE; i++) histo[y_coord[i] * 256 + x_coord[i]]++; 
    stop_CPU = clock(); 
    float elapsedTime = (float)(stop_CPU - start_CPU)/(float)CLOCKS_PER_SEC * 1000.0f; 
    printf("Time to generate (CPU): %3.1f ms\n", elapsedTime); 

    // --- Indirect check of the result 
    long histoCount = 0; 
    for (int i=0; i < 256 * 256; i++) { histoCount += histo[i]; } 
    printf("Histogram Sum: %ld\n", histoCount); 

    /********************/ 
    /* GPU COMPUTATIONS */ 
    /********************/ 

    // --- Initializing the device-side data 
    unsigned char *dev_x_coord, *dev_y_coord; 
    gpuErrchk(cudaMalloc((void**)&dev_x_coord,SIZE)); 
    gpuErrchk(cudaMalloc((void**)&dev_y_coord,SIZE)); 
    gpuErrchk(cudaMemcpy(dev_x_coord, x_coord, SIZE, cudaMemcpyHostToDevice)); 
    gpuErrchk(cudaMemcpy(dev_y_coord, y_coord, SIZE, cudaMemcpyHostToDevice)); 

    // --- Allocating device memory space for the device-side histogram 
    unsigned int *dev_histo; 
    gpuErrchk(cudaMalloc((void**)&dev_histo,256*256*sizeof(unsigned int))); 

    // --- GPU timing 
    cudaEvent_t start, stop; 
    gpuErrchk(cudaEventCreate(&start)); 
    gpuErrchk(cudaEventCreate(&stop)); 

    // --- ATOMICS 
    gpuErrchk(cudaEventRecord(start,0)); 
    gpuErrchk(cudaMemset(dev_histo,0,256*256*sizeof(unsigned int))); 
    cudaDeviceProp prop; 
    gpuErrchk(cudaGetDeviceProperties(&prop,0)); 

    int blocks = prop.multiProcessorCount; 
    histo_kernel2<<<blocks*2,256>>>(dev_x_coord, dev_y_coord, dev_histo, SIZE); 
    gpuErrchk(cudaPeekAtLastError()); 
    gpuErrchk(cudaDeviceSynchronize()); 

    gpuErrchk(cudaMemcpy(histo, dev_histo, 256 * 256 * sizeof(unsigned int),cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaEventRecord(stop,0)); 
    gpuErrchk(cudaEventSynchronize(stop)); 
    gpuErrchk(cudaEventElapsedTime(&elapsedTime,start,stop)); 
    printf("Time to generate (GPU): %3.1f ms\n", elapsedTime); 

    histoCount = 0; 
    for (int i=0; i<256 * 256; i++) { 
     histoCount += histo[i]; 
    } 
    printf("Histogram Sum: %ld\n", histoCount); 

    // --- Check the correctness of the results via the host 
    for (int i=0; i<SIZE; i++) histo[y_coord[i] * 256 + x_coord[i]]--; 
    for (int i=0; i<256*256; i++) { 
     if (histo[i] != 0) printf("Failure at %d! Off by %d\n", i, histo[i]); 
    } 

}

來源

2014-11-30 22:20:53 JackOLantern

：非常感謝！使用dev_x_coord和dev_y.I想到我以前的評論，這是可能的嗎？我的評論是否正確？再次感謝！ – George

CUDA中的粒子細胞計數（一維和二維直方圖生成）

回答

相關問題