我正在 CUDA 上執行一些陣列操作/計算(透過 Cudafy.NET 函式庫,不過我對 CUDA/C++ 的做法同樣感興趣),並且需要計算陣列中的最小值和最大值。其中一個核心(kernel)看起來是這樣的:(標題:在 CUDA 中返回陣列的最小和最大元素)

    /// <summary>
    /// Updates a single interior cell of <paramref name="ez"/> in place from its
    /// previous value and differences of neighbouring <paramref name="hx"/> /
    /// <paramref name="hy"/> cells.
    /// </summary>
    /// <param name="thread">GPU thread handle; only <c>blockIdx.x</c> and <c>blockIdx.y</c> are read to pick the cell.</param>
    /// <param name="time">Not used by this kernel.</param>
    /// <param name="ca">Coefficient applied to the previous value of the cell.</param>
    /// <param name="cb">Coefficient applied to the hx and hy differences.</param>
    /// <param name="hx">Grid read at [i, j] and [i - 1, j].</param>
    /// <param name="hy">Grid read at [i, j - 1] and [i, j].</param>
    /// <param name="ez">Grid that is updated in place at [i, j].</param>
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez)
    {
        // The cell is selected by the block index alone (threadIdx is not consulted).
        var i = thread.blockIdx.x;
        var j = thread.blockIdx.y;

        // Only cells strictly inside the ez grid are touched; the first and last
        // row/column are left unchanged, which also keeps i - 1 and j - 1 in range.
        if (i > 0 && i < ez.GetLength(0) - 1 && j > 0 && j < ez.GetLength(1) - 1)
        {
            ez[i, j] =
                ca * ez[i, j]
                + cb * (hx[i, j] - hx[i - 1, j])
                + cb * (hy[i, j - 1] - hy[i, j]);
        }
    }


    /// <summary>
    /// Variant of the ez update that additionally reports a minimum and maximum
    /// through <paramref name="min"/> and <paramref name="max"/>.
    /// </summary>
    /// <remarks>
    /// This is the asker's sketch of the desired behaviour, not a working parallel
    /// reduction: every invocation first resets <paramref name="min"/> /
    /// <paramref name="max"/> and then compares them only against its own cell, so
    /// the outputs never combine values from different cells.
    /// NOTE(review): whether Cudafy accepts <c>out</c> parameters on a kernel is not
    /// shown here - confirm before relying on this signature.
    /// </remarks>
    /// <param name="thread">GPU thread handle; only <c>blockIdx.x</c> and <c>blockIdx.y</c> are read to pick the cell.</param>
    /// <param name="time">Not used by this kernel.</param>
    /// <param name="ca">Coefficient applied to the previous value of the cell.</param>
    /// <param name="cb">Coefficient applied to the hx and hy differences.</param>
    /// <param name="hx">Grid read at [i, j] and [i - 1, j].</param>
    /// <param name="hy">Grid read at [i, j - 1] and [i, j].</param>
    /// <param name="ez">Grid that is updated in place at [i, j].</param>
    /// <param name="min">Set to <c>float.MaxValue</c>, then lowered to the updated cell value for interior cells.</param>
    /// <param name="max">Set to <c>float.MinValue</c>, then raised to the updated cell value for interior cells.</param>
    public static void UpdateEz(GThread thread, float time, float ca, float cb, float[,] hx, float[,] hy, float[,] ez, out float min, out float max)
    {
        var i = thread.blockIdx.x;
        var j = thread.blockIdx.y;

        // Start from the identity elements so the first comparison always wins.
        min = float.MaxValue;
        max = float.MinValue;

        // Only cells strictly inside the ez grid are updated and measured.
        if (i > 0 && i < ez.GetLength(0) - 1 && j > 0 && j < ez.GetLength(1) - 1)
        {
            ez[i, j] =
                ca * ez[i, j]
                + cb * (hx[i, j] - hx[i - 1, j])
                + cb * (hy[i, j - 1] - hy[i, j]);

            min = Math.Min(ez[i, j], min);
            max = Math.Max(ez[i, j], max);
        }
    }




傳統上,最小值和最大值是透過歸約(reduction)操作求得的。我對 Cudafy 不太熟悉,但這看起來並不像歸約。 – alrikai


@alrikai 我很樂意大幅改寫我的程式碼來解決這個問題。我看過 map/reduce 之類的方法,但對具體實作還有些模糊。先不管 Cudafy 的部分:直接在 CUDA/C++ 中你會怎麼做? –


你可以使用'thrust'或'npp'。 – sgarizvi





#define MAX_NEG ... //placeholder: replace "..." with a value no larger than any input element; cu_max_reduce uses it to pad shared memory

/**
 * Per-block maximum reduction.
 *
 * Each block loads up to BLKSZ consecutive elements of d_data into shared
 * memory (two per thread), pads the remainder with MAX_NEG, reduces them to a
 * single maximum and writes it to max_val[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D block, with blockDim.x == BLKSZ / 2;
 *   - BLKSZ is a power of two (the tree reduction halves the stride each step);
 *   - gridDim.x * BLKSZ >= d_len so that every element is covered;
 *   - max_val has room for gridDim.x elements;
 *   - static shared memory: BLKSZ * sizeof(T) bytes.
 *
 * @tparam T       element type; must be comparable with '>' and assignable from MAX_NEG
 * @tparam BLKSZ   number of input elements handled per block
 * @param d_data   device pointer to the input elements
 * @param d_len    number of valid elements in d_data
 * @param max_val  device pointer receiving one maximum per block
 */
template <typename T, int BLKSZ> __global__
void cu_max_reduce(const T* d_data, const int d_len, T* max_val)
{
    __shared__ T smem[BLKSZ];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    //starting index for each block to begin loading the input data into shared memory
    const int bid_sidx = bid*BLKSZ;

    //load the input data to smem, with padding if needed. each thread handles 2 elements
    #pragma unroll
    for (int i = 0; i < 2; i++)
    {
        //get the index for the thread to load into shared memory
        const int tid_idx = 2*tid + i;
        const int ld_idx = bid_sidx + tid_idx;

        //never write past the shared array, even if launched with too many threads
        if (tid_idx < BLKSZ)
        {
            if (ld_idx < d_len)
                smem[tid_idx] = d_data[ld_idx];
            else
                smem[tid_idx] = MAX_NEG; //pad so the tail cannot win the max
        }
    }
    //all loads must be visible before any thread starts comparing
    __syncthreads();

    //run the reduction per-block
    for (unsigned int stride = BLKSZ/2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
        {
            const T lhs = smem[tid];
            const T rhs = smem[tid + stride];
            smem[tid] = (lhs > rhs) ? lhs : rhs;
        }
        //barrier sits outside the branch so every thread in the block reaches it
        __syncthreads();
    }

    //write the per-block result out from shared memory to global memory
    if (tid == 0)
        max_val[bid] = smem[0];
}

/**
 * Returns the maximum of the data_len elements stored at device pointer d_data.
 *
 * Runs cu_max_reduce repeatedly: the first pass reduces the input to one value
 * per block, and each following pass reduces the previous pass's output, until a
 * single block remains. That final value is copied back to the host.
 *
 * @param d_data    device pointer to the input data (not modified)
 * @param data_len  number of elements at d_data
 * @return the maximum element; MAX_NEG if data_len <= 0, d_data is null, or any
 *         CUDA call fails
 */
template <typename T> __host__
T cu_find_max(const T* d_data, const int data_len)
{
    const int thread_per_block = 16;
    const int elem_per_thread = 2;
    const int BLKSZ = elem_per_thread*thread_per_block; //number of elements to process per block

    T result = MAX_NEG;
    if (d_data == nullptr || data_len <= 0)
        return result;

    const dim3 block_dim(thread_per_block, 1, 1);

    const T* d_in = d_data;   //input of the current pass
    T* d_prev = nullptr;      //output buffer of the latest pass (owned here)
    int len = data_len;
    cudaError_t err = cudaSuccess;

    for (;;)
    {
        //integer ceil-div; written this way so it cannot overflow near INT_MAX
        const int blocks_per_grid = len/BLKSZ + ((len % BLKSZ) != 0 ? 1 : 0);
        const bool last_pass = (blocks_per_grid == 1);
        const dim3 grid_dim(blocks_per_grid, 1, 1);

        T *d_max = nullptr;
        err = cudaMalloc((void **)&d_max, sizeof(T)*blocks_per_grid);
        if (err == cudaSuccess)
        {
            cu_max_reduce <T, BLKSZ> <<<grid_dim, block_dim>>> (d_in, len, d_max);
            err = cudaGetLastError(); //launch-configuration errors surface here
        }
        if (err == cudaSuccess && last_pass)
        {
            //blocking copy on the default stream: waits for the kernel to finish
            err = cudaMemcpy(&result, d_max, sizeof(T), cudaMemcpyDeviceToHost);
        }

        //release the previous pass's buffer (no-op on the first pass, where it is null)
        cudaFree(d_prev);
        d_prev = d_max;

        if (err != cudaSuccess || last_pass)
            break;

        //feed this pass's per-block maxima into the next pass
        d_in = d_max;
        len = blocks_per_grid;
    }

    cudaFree(d_prev);

    if (err != cudaSuccess)
        result = MAX_NEG;
    return result;
}


這會找出每個區塊(block)的最大值。您可以在其輸出上再次執行它(例如,以 d_max 作為輸入資料,並更新啟動參數)以求得全域最大值——如果資料集太大,就必須像這樣以多遍(multi-pass)方式執行(在這個例子中是指超過 2 * 4096 個元素,因為我們讓每個執行緒處理 2 個元素;不過您也可以讓每個執行緒處理更多元素來提高這個上限)。




您可以使用divide and conquer方法開發自己的最小/最大算法。




#include <stdio.h>

#include <cuda_runtime_api.h>

// Forward slashes: backslash include paths only resolve on Windows toolchains.
#include <thrust/pair.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>

/**
 * Copies a small host array to the device, finds its minimum and maximum with
 * thrust::minmax_element, and prints both values.
 *
 * @return 0 on success, 1 if the device allocation or the host-to-device copy fails
 */
int main()
{
    const int N = 5;

    const float h_a[N] = { 3.0f, 21.0f, -2.0f, 4.0f, 5.0f };

    float *d_a = NULL;
    if (cudaMalloc((void **)&d_a, N * sizeof(float)) != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    if (cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice) != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed\n");
        cudaFree(d_a);
        return 1;
    }

    // Wrap the raw device pointer so Thrust dispatches the search to the device.
    const thrust::device_ptr<float> first = thrust::device_pointer_cast(d_a);
    const thrust::pair<thrust::device_ptr<float>, thrust::device_ptr<float> > extrema =
        thrust::minmax_element(first, first + N);

    // Dereferencing a device_ptr copies the single element back to the host.
    const float minel = *extrema.first;
    const float maxel = *extrema.second;

    printf("minelement %f - maxelement %f\n", minel, maxel);

    cudaFree(d_a);

    return 0;
}