I implemented a minimum reduction using CUDA 8, following this great explanation, and modified it:

__inline__ __device__ int warpReduceMin(int val) 
{ 
    for (int offset = warpSize/2; offset > 0; offset /= 2) 
    { 
     int tmpVal = __shfl_down(val, offset); 
     if (tmpVal < val) 
     { 
      val = tmpVal; 
     } 
    } 
    return val; 
} 

__inline__ __device__ int blockReduceMin(int val) 
{ 

    static __shared__ int shared[32]; // Shared mem for 32 partial mins 
    int lane = threadIdx.x % warpSize; 
    int wid = threadIdx.x/warpSize; 

    val = warpReduceMin(val);  // Each warp performs partial reduction 

    if (lane == 0) 
    { 
     shared[wid] = val; // Write reduced value to shared memory 
    } 

    __syncthreads();    // Wait for all partial reductions 

    //read from shared memory only if that warp existed 
    val = (threadIdx.x < blockDim.x/warpSize) ? shared[lane] : INT_MAX; 

    if (wid == 0) 
    { 
     val = warpReduceMin(val); //Final reduce within first warp 
    } 

    return val; 
} 

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) { 
    int minVal = INT_MAX; 
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; 
     i < N; 
     i += blockDim.x * gridDim.x) 
    { 
     minVal = min(minVal, in[i]); 
    } 
    minVal = blockReduceMin(minVal); 
    if (threadIdx.x == 0) 
    { 
     atomicMin(out, minVal); 
    } 
} 
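
For context, a minimal host-side launch might look like the sketch below; the buffer names (d_in, d_out) and launch configuration are illustrative assumptions, and out must be seeded with INT_MAX for atomicMin to produce the right result:

// Hypothetical host-side setup (names are illustrative; error checking omitted) 
int h_init = INT_MAX; 
cudaMemcpy(d_out, &h_init, sizeof(int), cudaMemcpyHostToDevice); // seed out with INT_MAX 

int threads = 256; 
int blocks = (N + threads - 1)/threads; 
if (blocks > 1024) blocks = 1024; // the grid-stride loop covers the remainder 
deviceReduceBlockAtomicKernel<<<blocks, threads>>>(d_in, d_out, N); 

int h_min; 
cudaMemcpy(&h_min, d_out, sizeof(int), cudaMemcpyDeviceToHost); 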

This works great, and I get the minimum value. However, I don't care about the minimum value itself, only about its index in the original input array.

So I tried to modify my code a bit:

__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx 
{ 
    for (int offset = warpSize/2; offset > 0; offset /= 2) 
    { 
     int tmpVal = __shfl_down(val, offset); 
     if (tmpVal < val) 
     { 
      *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here 
      val = tmpVal; 
     } 
    } 
    return val; 
} 

... 
blockReduceMin stayed the same, only adding idx to the function calls 
... 

__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) { 
    int minVal = INT_MAX; 
    int minIdx = 0; // Added this 
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; 
     i < N; 
     i += blockDim.x * gridDim.x) 
    { 
     if (in[i] < minVal) 
     { 
      minVal = in[i]; 
      minIdx = i; // Added this 
     } 
    } 
    minVal = blockReduceMin(minVal, &minIdx); 
    if (threadIdx.x == 0) 
    { 
     int old = atomicMin(out, minVal); 
     if (old != minVal) // value was updated 
     { 
      atomicExch(out + 1, minIdx); 
     } 
    } 
} 

But it doesn't work. I feel like I'm missing something important and that this isn't the way to go about it, but my searching has turned up nothing.


[this](http://stackoverflow.com/questions/38176136/finding-minimum-value-in-array-and-its-index-using-cuda-shfl-down-function) may be of interest –

Answer


There are a couple of problems here. You need to modify both the warp and block minimum functions so that they propagate the minimum value and its index together every time a new local minimum is found. Perhaps something like this:

__inline__ __device__ void warpReduceMin(int& val, int& idx) 
{ 
    for (int offset = warpSize/2; offset > 0; offset /= 2) { 
     int tmpVal = __shfl_down(val, offset); 
     int tmpIdx = __shfl_down(idx, offset); 
     if (tmpVal < val) { 
      val = tmpVal; 
      idx = tmpIdx; 
     } 
    } 
} 

__inline__ __device__ void blockReduceMin(int& val, int& idx) 
{ 

    static __shared__ int values[32], indices[32]; // Shared mem for 32 partial mins 
    int lane = threadIdx.x % warpSize; 
    int wid = threadIdx.x/warpSize; 

    warpReduceMin(val, idx);  // Each warp performs partial reduction 

    if (lane == 0) { 
     values[wid] = val; // Write reduced value to shared memory 
     indices[wid] = idx; // Write reduced index to shared memory 
    } 

    __syncthreads();    // Wait for all partial reductions 

    //read from shared memory only if that warp existed 
    if (threadIdx.x < blockDim.x/warpSize) { 
     val = values[lane]; 
     idx = indices[lane]; 
    } else { 
     val = INT_MAX; 
     idx = 0; 
    } 

    if (wid == 0) { 
     warpReduceMin(val, idx); //Final reduce within first warp 
    } 
} 

[Note: written in a browser, never compiled or tested, use at your own risk]

That should leave each block holding its correct local minimum and its index. Then you have a second problem. This:

int old = atomicMin(out, minVal); 
if (old != minVal) // value was updated 
{ 
    atomicExch(out + 1, minIdx); 
} 

is broken. There is no guarantee that the minimum value and its index will be set correctly by this code, because there is no synchronization between the two atomic operations, and there is a potential race in which one block correctly overwrites another block's minimum value, but then has its index overwritten by the block it just beat. The only solution here is some sort of mutex, or running a second reduction kernel on the per-block results.
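
To illustrate the second-kernel route, here is a minimal, untested sketch in the same spirit (the kernel and buffer names are hypothetical, it reuses the blockReduceMin above, and it assumes the block size is a multiple of warpSize):

__global__ void blockMinIdxKernel(const int* in, int* blockVals, int* blockIdxs, int N) 
{ 
    int minVal = INT_MAX; 
    int minIdx = 0; 
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; 
     i < N; 
     i += blockDim.x * gridDim.x) 
    { 
     if (in[i] < minVal) 
     { 
      minVal = in[i]; 
      minIdx = i; 
     } 
    } 
    blockReduceMin(minVal, minIdx); 
    if (threadIdx.x == 0) 
    { 
     blockVals[blockIdx.x] = minVal; // one (value, index) pair per block, no race 
     blockIdxs[blockIdx.x] = minIdx; 
    } 
} 

__global__ void finalMinIdxKernel(const int* blockVals, const int* blockIdxs, int* out, int numBlocks) 
{ 
    int minVal = INT_MAX; 
    int minIdx = 0; 
    for (int i = threadIdx.x; i < numBlocks; i += blockDim.x) 
    { 
     if (blockVals[i] < minVal) 
     { 
      minVal = blockVals[i]; 
      minIdx = blockIdxs[i]; 
     } 
    } 
    blockReduceMin(minVal, minIdx); 
    if (threadIdx.x == 0) 
    { 
     out[0] = minVal; // written by a single thread, so the pair stays consistent 
     out[1] = minIdx; 
    } 
} 

The second kernel is launched with a single block (for example finalMinIdxKernel<<<1, 256>>>(blockVals, blockIdxs, out, numBlocks)) after the first has finished, so no atomics are needed at all.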