Finding the minimum value of a 1-D array and its index using CUDA's __shfl_down function

I am writing a function that uses CUDA to find the minimum value of a 1-D array together with the index at which that value occurs.

I started from reduction code that finds the sum of the values in a 1-D array. The code works for the sum, but I cannot get it working for the minimum. If any CUDA gurus are around, the code is attached below; please point out what I am doing wrong.

The actual function is below. In the test example the array size is 1024, so the shuffle-reduction path is taken. The problem is that the per-block outputs in g_oIdxs (which gives the indices) and g_odata (which gives the minimum values) are wrong when compared against plain sequential CPU code.

Also, when I print g_odata on the host, the values are all zero (0).

Thanks in advance!

#include <stdio.h> 
#include <stdlib.h> 
#include <cuda.h> 
#include <math.h> 
#include <unistd.h> 
#include <sys/time.h> 

#if __DEVICE_EMULATION__ 
#define DEBUG_SYNC __syncthreads(); 
#else 
#define DEBUG_SYNC 
#endif 

#ifndef MIN 
#define MIN(x,y) ((x < y) ? x : y) 
#endif 

#ifndef MIN_IDX 
#define MIN_IDX(x,y, idx_x, idx_y) ((x < y) ? idx_x : idx_y) 
#endif 
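// Note: when these two macros maintain a running (value, index) pair, the 
// MIN_IDX update must happen before MIN overwrites the running minimum -- 
// otherwise the comparison sees the already-updated value and keeps a 
// stale index. The kernels below follow that order. 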

#if (__CUDA_ARCH__ < 200) 
#define int_mult(x,y) __mul24(x,y) 
#else 
#define int_mult(x,y) x*y 
#endif 

#define inf 0x7f800000 

bool isPow2(unsigned int x) 
{ 
    return ((x&(x-1))==0); 
} 

unsigned int nextPow2(unsigned int x) 
{ 
    --x; 
    x |= x >> 1; 
    x |= x >> 2; 
    x |= x >> 4; 
    x |= x >> 8; 
    x |= x >> 16; 
    return ++x; 
} 
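// e.g. nextPow2(1000) == 1024, nextPow2(1024) == 1024, nextPow2(1) == 1 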

// Utility class used to avoid linker errors with extern 
// unsized shared memory arrays with templated type 
template<class T> 
struct SharedMemory { 
    __device__ inline operator T *() { 
     extern __shared__ int __smem[]; 
     return (T *) __smem; 
    } 

    __device__ inline operator const T *() const { 
     extern __shared__ int __smem[]; 
     return (T *) __smem; 
    } 
}; 

// specialize for double to avoid unaligned memory 
// access compile errors 
template<> 
struct SharedMemory<double> { 
    __device__ inline operator double *() { 
     extern __shared__ double __smem_d[]; 
     return (double *) __smem_d; 
    } 

    __device__ inline operator const double *() const { 
     extern __shared__ double __smem_d[]; 
     return (double *) __smem_d; 
    } 
}; 

/* 
This version adds multiple elements per thread sequentially. This reduces the overall 
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). 
(Brent's Theorem optimization) 

Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory. 
In other words if blockSize <= 32, allocate 64*sizeof(T) bytes. 
If blockSize > 32, allocate blockSize*sizeof(T) bytes. 
*/ 
template<class T, unsigned int blockSize, bool nIsPow2> 
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) { 
    T *sdata = SharedMemory<T>(); 

    // perform first level of reduction, 
    // reading from global memory, writing to shared memory 
    unsigned int tid = threadIdx.x; 
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; 
    unsigned int gridSize = blockSize * 2 * gridDim.x; 

    T mySum = 0; 

    // we reduce multiple elements per thread. The number is determined by the 
    // number of active thread blocks (via gridDim). More blocks will result 
    // in a larger gridSize and therefore fewer elements per thread 
    while (i < n) { 
     mySum += g_idata[i]; 

     // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays 
     if (nIsPow2 || i + blockSize < n) 
      mySum += g_idata[i + blockSize]; 

     i += gridSize; 
    } 

    // each thread puts its local sum into shared memory 
    sdata[tid] = mySum; 
    __syncthreads(); 

    // do reduction in shared mem 
    if ((blockSize >= 512) && (tid < 256)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 256]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 256) && (tid < 128)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 128]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 128) && (tid < 64)) { 
     sdata[tid] = mySum = mySum + sdata[tid + 64]; 
    } 

    __syncthreads(); 

#if (__CUDA_ARCH__ >= 300) 
    if (tid < 32) { 
     // Fetch final intermediate sum from 2nd warp 
     if (blockSize >= 64) 
      mySum += sdata[tid + 32]; 
     // Reduce final warp using shuffle 
     for (int offset = warpSize/2; offset > 0; offset /= 2) { 
      mySum += __shfl_down(mySum, offset); 
     } 
    } 
#else 
    // fully unroll reduction within a single warp 
    if ((blockSize >= 64) && (tid < 32)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 32]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 32) && (tid < 16)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 16]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 16) && (tid < 8)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 8]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 8) && (tid < 4)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 4]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 4) && (tid < 2)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 2]; 
    } 

    __syncthreads(); 

    if ((blockSize >= 2) && (tid < 1)) 
    { 
     sdata[tid] = mySum = mySum + sdata[tid + 1]; 
    } 

    __syncthreads(); 
#endif 

    // write result for this block to global mem 
    if (tid == 0) 
     g_odata[blockIdx.x] = mySum; 
} 


/* 
This version reduces (by minimum) multiple elements per thread sequentially. This reduces the overall 
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n). 
(Brent's Theorem optimization) 

Note: this kernel stores both the values and their indices in shared memory, 
so the dynamic allocation must cover blockSize elements of T plus blockSize 
ints (see the answer below -- the launch code as posted under-allocates). 
*/ 
template<class T, unsigned int blockSize, bool nIsPow2> 
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata, int *g_oIdxs, unsigned int n) { 
    T *sdata = SharedMemory<T>(); 

    int *sdataIdx = SharedMemory<int>(); 

    // perform first level of reduction, 
    // reading from global memory, writing to shared memory 
    unsigned int tid = threadIdx.x; 
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x; 
    unsigned int gridSize = blockSize * 2 * gridDim.x; 


    T myMin = 99999; 
    int myMinIdx = -1; 
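    // NOTE: 99999 is a fragile sentinel -- the reduction silently fails if 
    // every input exceeds it. FLT_MAX (or the `inf` bit pattern defined 
    // above, reinterpreted as a float) would be a safer identity value. 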
    // we reduce multiple elements per thread. The number is determined by the 
    // number of active thread blocks (via gridDim). More blocks will result 
    // in a larger gridSize and therefore fewer elements per thread 
    while (i < n) { 
     myMinIdx = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx); 
     myMin = MIN(g_idata[i], myMin); 

     // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays 
     if (nIsPow2 || i + blockSize < n){ 
      //myMin += g_idata[i + blockSize]; 
      myMinIdx = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx); 
      myMin = MIN(g_idata[i + blockSize], myMin); 
     } 

     i += gridSize; 
    } 

    // each thread puts its local minimum and its index into shared memory 
    sdata[tid] = myMin; 
    sdataIdx[tid] = myMinIdx; 
    __syncthreads(); 

    // do reduction in shared mem 
    if ((blockSize >= 512) && (tid < 256)) { 
     //sdata[tid] = mySum = mySum + sdata[tid + 256]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 256], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 256) && (tid < 128)) { 
     //sdata[tid] = myMin = myMin + sdata[tid + 128]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 128], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 128) && (tid < 64)) { 
     //sdata[tid] = myMin = myMin + sdata[tid + 64]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 64], myMin); 
    } 

    __syncthreads(); 

#if (__CUDA_ARCH__ >= 300) 
    if (tid < 32) { 
     // Fetch final intermediate minimum from 2nd warp 
     if (blockSize >= 64){ 
     //myMin += sdata[tid + 32]; 
      myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx); 
      myMin = MIN(sdata[tid + 32], myMin); 
     } 
     // Reduce final warp using shuffle 
     for (int offset = warpSize/2; offset > 0; offset /= 2) { 
      //myMin += __shfl_down(myMin, offset); 
      int tempMyMinIdx = __shfl_down(myMinIdx, offset); 
      float tempMyMin = __shfl_down(myMin, offset); 
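      // (notes on the two shuffles above: the explicit `float` assumes 
      // T == float; for the double instantiation it would truncate -- 
      // declaring tempMyMin as T keeps the kernel generic, assuming a 
      // __shfl_down overload exists for T. Also, on CUDA 9.0 and later 
      // __shfl_down is deprecated in favor of 
      // __shfl_down_sync(0xffffffff, var, offset), which takes an 
      // explicit lane mask.) 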

      myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx); 
      myMin = MIN(tempMyMin, myMin); 
     } 

    } 
#else 
    // fully unroll reduction within a single warp 
    if ((blockSize >= 64) && (tid < 32)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 32]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 32], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 32) && (tid < 16)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 16]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 16], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 16) && (tid < 8)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 8]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 8], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 8) && (tid < 4)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 4]; 

     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 4], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 4) && (tid < 2)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 2]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 2], myMin); 
    } 

    __syncthreads(); 

    if ((blockSize >= 2) && (tid < 1)) 
    { 
     //sdata[tid] = myMin = myMin + sdata[tid + 1]; 
     sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx); 
     sdata[tid] = myMin = MIN(sdata[tid + 1], myMin); 
    } 

    __syncthreads(); 
#endif 

    __syncthreads(); 
    // write result for this block to global mem 
    if (tid == 0){ 
     g_odata[blockIdx.x] = myMin; 
     g_oIdxs[blockIdx.x] = myMinIdx; 
    } 
} 

//////////////////////////////////////////////////////////////////////////////// 
// Compute the number of threads and blocks to use for the given reduction kernel 
// For the kernels >= 3, we set threads/block to the minimum of maxThreads and 
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel 
// 6, we observe the maximum specified number of blocks, because each thread in 
// that kernel can process a variable number of elements. 
//////////////////////////////////////////////////////////////////////////////// 
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, 
     int maxThreads, int &blocks, int &threads) { 

    //get device capability, to avoid block/grid size exceed the upper bound 
    cudaDeviceProp prop; 
    int device; 
    cudaGetDevice(&device); 
    cudaGetDeviceProperties(&prop, device); 

    if (whichKernel < 3) { 
     threads = (n < maxThreads) ? nextPow2(n) : maxThreads; 
     blocks = (n + threads - 1)/threads; 
    } else { 
     threads = (n < maxThreads * 2) ? nextPow2((n + 1)/2) : maxThreads; 
     blocks = (n + (threads * 2 - 1))/(threads * 2); 
    } 

    if ((float) threads * blocks 
      > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) { 
     printf("n is too large, please choose a smaller number!\n"); 
    } 

    if (blocks > prop.maxGridSize[0]) { 
     printf(
       "Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n", 
       blocks, prop.maxGridSize[0], threads * 2, threads); 

     blocks /= 2; 
     threads *= 2; 
    } 

    if (whichKernel == 6) { 
     blocks = MIN(maxBlocks, blocks); 
    } 
} 
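// Worked example for the test below (whichKernel = 6, n = 1024, 
// maxThreads = 256, maxBlocks = 64): n >= maxThreads*2, so threads = 256 
// and blocks = (1024 + 511)/512 = 2 -- which is why the output later shows 
// two per-block results. 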

//////////////////////////////////////////////////////////////////////////////// 
// Wrapper function for kernel launch 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata, 
     T *d_odata) { 
    dim3 dimBlock(threads, 1, 1); 
    dim3 dimGrid(blocks, 1, 1); 

    // when there is only one warp per block, we need to allocate two warps 
    // worth of shared memory so that we don't index shared memory out of bounds 
    int smemSize = 
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 

    if (isPow2(size)) { 
     switch (threads) { 
     case 512: 
      reduce6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 256: 
      reduce6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 128: 
      reduce6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 64: 
      reduce6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 32: 
      reduce6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 16: 
      reduce6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 8: 
      reduce6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 4: 
      reduce6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 2: 
      reduce6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 1: 
      reduce6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 
     } 
    } else { 
     switch (threads) { 
     case 512: 
      reduce6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 256: 
      reduce6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 128: 
      reduce6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 64: 
      reduce6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 32: 
      reduce6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 16: 
      reduce6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 8: 
      reduce6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 4: 
      reduce6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 2: 
      reduce6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 

     case 1: 
      reduce6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, 
        d_odata, size); 
      break; 
     } 
    } 

} 


//////////////////////////////////////////////////////////////////////////////// 
// Wrapper function for kernel launch 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata, 
     T *d_odata, int *idxs, int *oIdxs) { 
    dim3 dimBlock(threads, 1, 1); 
    dim3 dimGrid(blocks, 1, 1); 

    // when there is only one warp per block, we need to allocate two warps 
    // worth of shared memory so that we don't index shared memory out of bounds 
    int smemSize = 
      (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 
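    // NOTE: as posted, this sizes the dynamic allocation for the minimum 
    // values only; the per-thread indices that reduceMin6 also stores in 
    // shared memory are not accounted for (this is the bug diagnosed in 
    // the answer below). 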

    if (isPow2(size)) { 
     switch (threads) { 
     case 512: 
      reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 256: 
      reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 128: 
      reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 64: 
      reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 32: 
      reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 16: 
      reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 8: 
      reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 4: 
      reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 2: 
      reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 1: 
      reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 
     } 
    } else { 
     switch (threads) { 
     case 512: 
      reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 256: 
      reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 128: 
      reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 64: 
      reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 32: 
      reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 16: 
      reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 8: 
      reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 4: 
      reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 2: 
      reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 

     case 1: 
      reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs, 
        d_odata, oIdxs, size); 
      break; 
     } 
    } 

} 

//////////////////////////////////////////////////////////////////////////////// 
//! Compute min reduction on CPU (sequential reference implementation) 
//! 
//! @param data  pointer to input data 
//! @param size  number of input data elements 
//! @param min  output: the minimum value 
//! @param idx  output: the index of the minimum value 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
void reduceMINCPU(T *data, int size, T *min, int *idx) 
{ 
    *min = data[0]; 
    int min_idx = 0; 

    for (int i = 1; i < size; i++) 
    { 
     T y = data[i]; 
     T t = MIN(*min, y); 
     min_idx = MIN_IDX(*min, y, min_idx, i); 
     (*min) = t; 
    } 

    *idx = min_idx; 

    return; 
} 

//////////////////////////////////////////////////////////////////////////////// 
//! Compute sum reduction on CPU 
//! We use Kahan summation for an accurate sum of large arrays. 
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm 
//! 
//! @param data  pointer to input data 
//! @param size  number of input data elements 
//////////////////////////////////////////////////////////////////////////////// 
template<class T> 
T reduceCPU(T *data, int size) 
{ 
    T sum = data[0]; 
    T c = (T)0.0; 

    for (int i = 1; i < size; i++) 
    { 
     T y = data[i] - c; 
     T t = sum + y; 
     c = (t - sum) - y; 
     sum = t; 
    } 

    return sum; 
} 

// Instantiate the reduction function for 3 types 
template void 
reduce<int>(int size, int threads, int blocks, int whichKernel, int *d_idata, 
     int *d_odata); 

template void 
reduce<float>(int size, int threads, int blocks, int whichKernel, 
     float *d_idata, float *d_odata); 

template void 
reduce<double>(int size, int threads, int blocks, int whichKernel, 
     double *d_idata, double *d_odata); 

// Instantiate the reduction function for 3 types 
template void 
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata, 
     int *d_odata, int *idxs, int *oIdxs); 

template void 
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata, 
     float *d_odata, int *idxs, int *oIdxs); 

template void 
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata, 
     double *d_odata, int *idxs, int *oIdxs); 

unsigned long long int my_min_max_test(int num_els) { 

    // timers (declared, but timing is not implemented in this trimmed test) 

    unsigned long long int start = 0; 
    unsigned long long int delta = 0; 

    int maxThreads = 256; // number of threads per block 
    int whichKernel = 6; 
    int maxBlocks = 64; 

    int testIterations = 100; 



    float* d_in = NULL; 
    float* d_out = NULL; 
    int *d_idxs = NULL; 
    int *d_oIdxs = NULL; 

    printf("%d elements\n", num_els); 
    printf("%d threads (max)\n", maxThreads); 

    int numBlocks = 0; 
    int numThreads = 0; 
    getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks, 
      numThreads); 



    // in[1024] = 34.0f; 
    // in[333] = 55.0f; 
    // in[23523] = -42.0f; 

// cudaMalloc((void**) &d_in, size); 
// cudaMalloc((void**) &d_out, size); 
// cudaMalloc((void**) &d_idxs, num_els * sizeof(int)); 

    cudaMalloc((void **) &d_in, num_els * sizeof(float)); 
    cudaMalloc((void **) &d_idxs, num_els * sizeof(int)); 
    cudaMalloc((void **) &d_out, numBlocks * sizeof(float)); 
    cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int)); 

    float* in = (float*) malloc(num_els * sizeof(float)); 
    int *idxs = (int*) malloc(num_els * sizeof(int)); 
    float* out = (float*) malloc(numBlocks * sizeof(float)); 
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int)); 

    for (int i = 0; i < num_els; i++) { 
     in[i] = (double) rand()/(double) RAND_MAX; 
     idxs[i] = i; 
    } 

    for (int i = 0; i < num_els; i++) { 

     printf("\n [%d] = %.4f", idxs[i], in[i]); 
    } 

    // copy data directly to device memory 
    cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice); 
    cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice); 
    cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice); 
    cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice); 

    // warm-up 
// reduce<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out); 
// 
// cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); 
// 
// for(int i=0; i< numBlocks; i++) 
// printf("\nFinal Result[BLK:%d]: %f", i, out[i]); 

// printf("\n Reduce CPU : %f", reduceCPU<float>(in, num_els)); 

    reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs); 

    cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); 
    cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost); 

    for(int i=0; i< numBlocks; i++) 
     printf("\n Reduce MIN GPU idx: %d value: %f", oIdxs[i], out[i]); 


    int min_idx; 
    float min; 
    reduceMINCPU<float>(in, num_els, &min, &min_idx); 

    printf("\n\n Reduce MIN CPU idx: %d value: %f", min_idx, min); 

    cudaFree(d_in); 
    cudaFree(d_out); 
    cudaFree(d_idxs); 
    cudaFree(d_oIdxs); 

    free(in); 
    free(idxs); 
    free(out); 
    free(oIdxs); 

    //system("pause"); 

    return delta; 

} 

int main(int argc, char* argv[]) { 

    printf(" GTS250 @ 70.6 GB/s - Finding min and max"); 
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n"); 

//#pragma unroll 
//for(int i = 1024*1024; i <= 32*1024*1024; i=i*2) 
//{ 
// my_min_max_test(i); 
//} 

    printf("\n Non-base 2 tests! \n"); 
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n"); 

    my_min_max_test(1024); 



// just some large numbers.... 
//my_min_max_test(14*1024*1024+38); 
//my_min_max_test(14*1024*1024+55); 
//my_min_max_test(18*1024*1024+1232); 
//my_min_max_test(7*1024*1024+94854); 

//for(int i = 0; i < 4; i++) 
//{ 
// 
// float ratio = float(rand())/float(RAND_MAX); 
// ratio = ratio >= 0 ? ratio : -ratio; 
// int big_num = ratio*18*1e6; 
// 
// my_min_max_test(big_num); 
//} 

    return 0; 
} 

Please always post an [MCVE]; do you have to use "raw CUDA", or could you also use another approach (such as Thrust)? –


This code will be part of a larger algorithm that I am parallelizing, so fine-grained parallelism is important. The code is from the CUDA 7.5 code samples, just changed from a sum to a minimum. – Vinay

Answer


This is not a valid way of using dynamically allocated shared memory:

T *sdata = SharedMemory<T>(); 

int *sdataIdx = SharedMemory<int>(); 

Both of these pointers (sdata and sdataIdx) end up pointing to the same location. (The documentation discusses how to handle this correctly.)

Even though you now want twice as much shared memory (to store the minimum plus the index), you have not increased the dynamic allocation size from the earlier sum-reduction-only code:

int smemSize = 
     (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 

In your case, threads is 256, so you are allocating enough storage (threads*sizeof(T)) for the minimums, but none for the indices.

We can "fix" these issues by making the following changes:

T *sdata = SharedMemory<T>(); 

int *sdataIdx = ((int *)sdata) + blockSize; 

and:

int smemSize = 
     (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T); 
smemSize += threads*sizeof(int); 

With these changes, your code produces the expected output:

$ cuda-memcheck ./t1182 
========= CUDA-MEMCHECK 
GTS250 @ 70.6 GB/s - Finding min and max 
N [GB/s] [perc] [usec] test 

Non-base 2 tests! 

N [GB/s] [perc] [usec] test 
1024 elements 
256 threads (max) 

Reduce MIN GPU idx: 484 value: 0.001125 
Reduce MIN GPU idx: 972 value: 0.002828 

Reduce MIN CPU idx: 484 value: 0.001125 
========= ERROR SUMMARY: 0 errors 
$ 

(I commented out the loop that prints all 1024 initial values.)
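To make the aliasing issue concrete, here is a minimal self-contained sketch of the same pattern -- declaring one extern shared buffer and carving the differently-typed regions out of it by hand. The kernel name partitionDemo and its parameters are made up for illustration, not part of the code above:

// Sketch: one dynamic shared allocation partitioned into a float region 
// followed by an int region. Alignment works out because sizeof(float) == 
// sizeof(int); mixing wider types (e.g. double) would need padding. 
// Assumes blockDim.x is a power of two and a single block. 
__global__ void partitionDemo(const float *in, float *outVal, int *outIdx, int n) 
{ 
    extern __shared__ float smem[];            // single dynamic allocation 
    float *vals = smem;                        // first blockDim.x floats 
    int *idxs = (int *) (smem + blockDim.x);   // next blockDim.x ints 

    int tid = threadIdx.x; 
    vals[tid] = (tid < n) ? in[tid] : 3.4e38f; // pad with a huge value 
    idxs[tid] = tid; 
    __syncthreads(); 

    // plain tree reduction over the block, moving value and index together 
    for (int s = blockDim.x/2; s > 0; s >>= 1) { 
     if (tid < s && vals[tid + s] < vals[tid]) { 
      vals[tid] = vals[tid + s]; 
      idxs[tid] = idxs[tid + s]; 
     } 
     __syncthreads(); 
    } 

    if (tid == 0) { 
     *outVal = vals[0]; 
     *outIdx = idxs[0]; 
    } 
} 

// The launch must size the allocation to cover BOTH regions, mirroring the 
// smemSize += threads*sizeof(int) fix above: 
// partitionDemo<<<1, threads, threads*(sizeof(float) + sizeof(int))>>>(d_in, d_outVal, d_outIdx, n); 

Offsetting by hand from a single extern array is the approach the documentation recommends, because every extern __shared__ declaration in a kernel starts at the same address -- which is exactly why the two SharedMemory<>() calls in the question alias.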


Thank you so much! When I read your answer I was at the debugging stage where I had realized the problem was somewhere after reading from global memory and assigning the values to shared memory, but it would have taken much longer to find the actual bug. – Vinay
