
I have been working on a radix select using CUDA, which finds the k smallest of a given number of elements. The main idea behind this radix select is to scan 32-bit integers from the MSB down to the LSB. It partitions all elements with a 0 at the current bit to the left and all elements with a 1 to the right, and then recursively solves the side that contains the k smallest elements. My partitioning step works fine, but I am having trouble with the recursive function call: I cannot get the recursion to stop. Please help! My kernel code (kernel.h) looks like this:

#include "header.h" 
    #define WARP_SIZE 32 
    #define BLOCK_SIZE 32 

__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
    int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
    int WarpID = threadID >> 5;
    int LocWarpID = threadID - 32 * WarpID;
    int NumWarps = N / WARP_SIZE;
    int pivot;

    __shared__ int DataPartition[BLOCK_SIZE];
    __shared__ int DataBinary[WARP_SIZE];

    for(int i = 0; i < NumWarps; i++)
    {
        if(LocWarpID >= firstidx && LocWarpID <= lastidx)
        {
            int r = d_DataIn[i * WARP_SIZE + LocWarpID];
            int p = (r >> (31 - bit)) & 1;
            unsigned int B = __ballot(p);
            unsigned int B_flip = ~B;
            if(p == 1)
            {
                int b = B << (32 - LocWarpID);
                int RightLoc = __popc(b);
                DataPartition[lastidx - RightLoc] = r;
            }
            else
            {
                int b_flip = B_flip << (32 - LocWarpID);
                int LeftLoc = __popc(b_flip);
                DataPartition[LeftLoc] = r;
            }

            if(LocWarpID <= lastidx - __popc(B))
            {
                d_DataIn[LocWarpID] = DataPartition[LocWarpID];
            }
            else
            {
                d_DataIn[LocWarpID] = DataPartition[LocWarpID];
            }
            pivot = lastidx - __popc(B);
            return pivot + 1;
        }
    }
}


__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit) 
{ 

if(firstidx == lastidx) 
    return *d_DataIn; 
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit); 
int length = q - firstidx; 
if(k == length) 
    return *d_DataIn; 
else if(k < length) 
    return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1); 
else 
    return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1); 
} 

__global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit) 
{ 
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit); 
} 

The host code, main.cu, looks like this:

#include "header.h" 
#include <iostream> 
#include <fstream> 
#include "kernel.h" 

#define BLOCK_SIZE 32 
using namespace std; 

int main() 
{ 
int N = 32; 
thrust::host_vector<float>h_HostFloat(N); 
thrust::counting_iterator <unsigned int> Numbers(0); 
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(),   RandomFloatNumbers(1.f, 100.f)); 
thrust::host_vector<int>h_HostInt(N); 
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt()); 
thrust::device_vector<float>d_DeviceFloat = h_HostFloat; 
thrust::device_vector<int>d_DeviceInt(N); 
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt()); 
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data()); 
int *h_DataOut; 
float *h_DataOut1; 
int fsize = N * sizeof(float); 
int size = N * sizeof(int); 
h_DataOut = new int[size]; 
h_DataOut1 = new float[fsize]; 

int firstidx = 0; 
int lastidx = BLOCK_SIZE-1; 
int k = 20; 
int bit = 1; 
int NUM_BLOCKS = N/BLOCK_SIZE; 
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit); 
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost); 
WriteData(h_DataOut1, h_DataOut, 10, N); 


return 0; 
    } 

And header.h, with the list of includes I use:

#include "cuda.h" 
    #include "cuda_runtime_api.h" 
    #include "device_launch_parameters.h" 
    #include <thrust/host_vector.h> 
    #include <thrust/device_vector.h> 
    #include <thrust/transform.h> 
    #include <thrust/generate.h> 
    #include "functor.h" 
    #include <thrust/iterator/counting_iterator.h> 
    #include <thrust/copy.h> 
    #include <thrust/device_ptr.h> 

And another header file, "functor.h", which converts floats to int and generates the random float numbers:

#include <thrust/random.h>
#include <sstream>
#include <fstream>
#include <iomanip>

struct RandomFloatNumbers
{
    float a, b;
    __host__ __device__
    RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {}

    __host__ __device__
    float operator() (const unsigned int n) const {
        thrust::default_random_engine rng;
        thrust::uniform_real_distribution<float> dist(a, b);
        rng.discard(n);
        return dist(rng);
    }
};

struct FloatToInt
{
    __host__ __device__
    int operator() (const float &x) const {
        union {
            float f_value;
            int i_value;
        } value;

        value.f_value = x;
        return value.i_value;
    }
};

float IntToFloat(int &x)
{
    union {
        float f_value;
        int i_value;
    } value;

    value.i_value = x;
    return value.f_value;
}

bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
{
    std::ofstream data;
    std::stringstream file;
    file << "out\\Partition_";
    file << std::setfill('0') << std::setw(2) << bit;
    file << ".txt";
    data.open((file.str()).c_str());
    if(data.is_open() == false)
    {
        std::cout << "File is not open" << std::endl;
        return false;
    }

    for(int i = 0; i < N; i++)
    {
        h_DataOut1[i] = IntToFloat(h_DataOut[i]);
        //cout << h_HostFloat[i] << " \t" << h_DataOut1[i] << endl;
        //std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
        //data << bitshift[31-bit] << "\t" <<h_DataOut1[i] <<std::endl;
        data << h_DataOut1[i] << std::endl;
    }
    data << std::endl;
    data.close();
    std::cout << "Partition=" << bit << "\n";
    return true;
}

Can you provide the code for a complete application, including the host code, `main`, etc., so that I can copy, paste, compile, and run it without having to add or change anything? –


Hi! I have now included the complete code above. Please help me figure out where I am going wrong. – STam


Your `functor.h` doesn't compile correctly. It is missing a return statement and a closing brace. I'm not sure what else is missing. –

Answer


As you requested, I'm posting the code I used to investigate this problem and to help me learn about your code.

#include <stdio.h> 
#include <stdlib.h> 

// Counts, for the current candidate set (flagged in partition[]), how many elements have a 1
// at bit position (31-bit), using __ballot/__popc per warp followed by a shared-memory
// reduction over the per-warp counts. Also records per-thread 1/0 flags in ones[]/zeroes[].
__device__ int gpu_partition(unsigned int *data, unsigned int *partition, unsigned int *ones, unsigned int* zeroes, int bit, int idx, unsigned int* warp_ones){ 
    int one = 0; 
    int valid = 0; 
    int my_one, my_zero; 
    if (partition[idx]){ 
    valid = 1; 
    if(data[idx] & (1ULL<<(31-bit))) one=1;} 
    __syncthreads(); 
    if (valid){ 
    if (one){ 
     my_one=1; 
     my_zero=0;} 
    else{ 
     my_one=0; 
     my_zero=1;} 
    } 
    else{ 
    my_one=0; 
    my_zero=0;} 
    ones[idx]=my_one; 
    zeroes[idx]=my_zero; 
    unsigned int warp_one = __popc(__ballot(my_one)); 
    if (!(threadIdx.x & 31)) 
    warp_ones[threadIdx.x>>5] = warp_one; 
    __syncthreads(); 
    // reduce 
    for (int i = 16; i > 0; i>>=1){ 
    if (threadIdx.x < i) 
     warp_ones[threadIdx.x] += warp_ones[threadIdx.x + i]; 
    __syncthreads();} 
    return warp_ones[0]; 
} 

// Selects the m-th smallest (0-based) of the first n elements of data. The candidate set is
// tracked with 1/0 flag arrays in shared memory rather than by moving data; each pass over
// one bit narrows the rank interval [l,u) that contains the answer.
__global__ void gpu_radixkernel(unsigned int *data, unsigned int m, unsigned int n, unsigned int *result){ 
    __shared__ unsigned int loc_data[1024]; 
    __shared__ unsigned int loc_ones[1024]; 
    __shared__ unsigned int loc_zeroes[1024]; 
    __shared__ unsigned int loc_warp_ones[32]; 
    int l=0; 
    int bit = 0; 
    unsigned int u = n; 
    if (n<2){ 
    if ((n == 1) && !(threadIdx.x)) *result = data[0]; 
    return;} 
    loc_data[threadIdx.x] = data[threadIdx.x]; 
    loc_ones[threadIdx.x] = (threadIdx.x<n)?1:0; 
    __syncthreads(); 
    unsigned int *next = loc_ones; 
    do { 
    int s = gpu_partition(loc_data, next, loc_ones, loc_zeroes, bit++, threadIdx.x, loc_warp_ones); 
    if ((u-s) > m){ 
     u = (u-s); 
     next = loc_zeroes;} 
    else{ 
     l = (u-s); 
     next = loc_ones;}} 
    while ((u != l) && (bit<32)); 
    if (next[threadIdx.x]) *result = loc_data[threadIdx.x]; 
} 
// Host reference partition: moves elements with a 1 at bit position (31-bit) to the upper
// end of data[l..u] and returns the index of the last element with a 0 at that bit.
int partition(unsigned int *data, int l, int u, int bit){ 
    unsigned int *temp = (unsigned int *)malloc(((u-l)+1)*sizeof(unsigned int)); 
    int pos = 0; 
    for (int i = l; i<=u; i++) 
    if(data[i] & (1ULL<<(31-bit))) temp[pos++] = data[i]; 
    int result = u-pos; 
    for (int i = l; i<=u; i++) 
    if(!(data[i] & (1ULL<<(31-bit)))) temp[pos++] = data[i]; 
    pos = 0; 
    for (int i = u; i>=l; i--) 
    data[i] = temp[pos++]; 
    free(temp); 
    return result; 
} 

// Host reference radix select (recursive): narrows [l,u] to the range holding the m-th
// smallest element, one bit per level of recursion.
unsigned int radixselect(unsigned int *data, int l, int u, int m, int bit){ 

    if (l == u) return(data[l]); 
    if (bit > 32) {printf("radixselect fail!\n"); return 0;} 
    int s = partition(data, l, u, bit); 
    if (s>=m) return radixselect(data, l, s, m, bit+1); 
    return radixselect(data, s+1, u, m, bit+1); 
} 

int main(){ 

    unsigned int data[8] = {32767, 22, 88, 44, 99, 101, 0, 7}; 
    unsigned int data1[8]; 

    for (int i = 0; i<8; i++){ 
    for (int j=0; j<8; j++) data1[j] = data[j]; 
    printf("value[%d] = %d\n", i, radixselect(data1, 0, 7, i, 0));} 

    unsigned int *d_data; 
    cudaMalloc((void **)&d_data, 1024*sizeof(unsigned int)); 
    unsigned int h_result, *d_result; 
    cudaMalloc((void **)&d_result, sizeof(unsigned int)); 
    cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice); 
    for (int i = 0; i < 8; i++){ 
    gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result); 
    cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost); 
    printf("gpu result index %d = %d\n", i, h_result); 
    } 

    unsigned int data2[1024]; 
    unsigned int data3[1024]; 

    for (int i = 0; i < 1024; i++) data2[i] = rand(); 
    cudaMemcpy(d_data, data2, 1024*sizeof(unsigned int), cudaMemcpyHostToDevice); 
    for (int i = 0; i < 1024; i++){ 
    for (int j = 0; j<1024; j++) data3[j] = data2[j]; 
    unsigned int cpuresult = radixselect(data3, 0, 1023, i, 0); 
    gpu_radixkernel<<<1,1024>>>(d_data, i, 1024, d_result); 
    cudaMemcpy(&h_result, d_result, sizeof(unsigned int), cudaMemcpyDeviceToHost); 
    if (h_result != cpuresult) {printf("mismatch at index %d, cpu: %d, gpu: %d\n", i, cpuresult, h_result); return 1;} 
    } 
    printf("Finished\n"); 

    return 0; 
} 

Here are some notes, in no particular order:

  • I got rid of all the thrust code; it wasn't doing anything useful as far as the radix select algorithm is concerned. I also found your bitwise casting of float to int curious. I haven't thought through the implications of selecting bit-by-bit over the exponent bit sequence followed by the mantissa bit sequence. It might work (although I think it definitely won't if you include the sign bit), but I don't consider it central to understanding the algorithm. (A sketch of the usual float-to-orderable-integer transform follows this list.)
  • I've included a host version of the algorithm, which I wrote simply to check the device results.
  • I'm fairly sure this algorithm can fail in some cases involving duplicate elements. For example, if you pass it all zeroes, I think it will fail. I don't think it would be difficult to handle those cases, though. (A small illustration of the all-zero case follows this list.)
  • My host version is recursive, but my device version is not. I don't see recursion being particularly useful here, since the non-recursive form of the algorithm is just as easy to write, especially as there are at most 32 bits to work through. Still, if you wanted to create a recursive device version, it would not be difficult, for example by folding the u, s, and l handling code into the partition function.
  • I've dispensed with typical cuda error checking, but I recommend it. (A minimal error-check macro sketch follows this list.)
  • I don't claim this is exemplary cuda programming. If you dig into fast radix sort algorithms (e.g. here), you'll find they are quite complex. A fast GPU radix select would not look like my code. I wrote my code to resemble a serial, recursive, partitioning radix sort, which is not the best way to do it on a massively parallel architecture.
  • Since radix select is not a sort, I tried to write device code that does not move the input data around, because I consider that expensive and unnecessary. I read the data from global memory once at the start of the kernel and work entirely out of shared memory after that, and even in shared memory I do not rearrange the data (as I do in the host version), to avoid the cost of the data movement. Instead, I keep flag arrays for the 1 and 0 partitions, which feed the next partitioning step. Moving the data would involve a fair amount of uncoalesced and/or bank-conflicted traffic, whereas the flag arrays allow all accesses to be free of bank conflicts.
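
Regarding the float-to-int casting note above: the sketch below is not from the original post; it is a minimal illustration of the usual transform for making IEEE-754 floats directly comparable as unsigned integers (flip all bits of a negative value, flip only the sign bit of a non-negative value). The helper names float_to_ordered_uint / ordered_uint_to_float are made up for this illustration. Assuming such a transform is applied to the keys, an unsigned radix select over them matches the float ordering, sign bit included.

    #include <cstring>

    // Hypothetical helper (not part of the original code): maps a float to an unsigned int
    // whose unsigned ordering matches the float ordering, including negative values.
    __host__ __device__ unsigned int float_to_ordered_uint(float f)
    {
        unsigned int u;
        memcpy(&u, &f, sizeof(u));                          // reinterpret the bit pattern
        return (u & 0x80000000u) ? ~u : (u | 0x80000000u);  // negatives: flip all bits; others: set sign bit
    }

    // Inverse mapping, to recover the original float after the selection.
    __host__ __device__ float ordered_uint_to_float(unsigned int u)
    {
        u = (u & 0x80000000u) ? (u & 0x7FFFFFFFu) : ~u;
        float f;
        memcpy(&f, &u, sizeof(f));
        return f;
    }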
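
And a small fragment (to drop into the existing main, illustration only) showing the duplicate-element corner case mentioned above, using the host radixselect from this answer: with all-zero input, partition returns s == u at every level, the range never shrinks, and the recursion only ends when the bit > 32 guard fires.

    // Corner-case check built on the host radixselect above:
    unsigned int zeros[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    unsigned int v = radixselect(zeros, 0, 7, 3, 0);   // prints "radixselect fail!" and returns 0
    printf("all-zero case: %u\n", v);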
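
Finally, a minimal sketch of the kind of typical cuda error checking referred to above; the cudaCheck macro name and exact form are my own, not part of the original answer.

    #include <cstdio>
    #include <cstdlib>

    // Wrap every CUDA runtime call; check kernel launches afterwards with
    // cudaGetLastError() and cudaDeviceSynchronize().
    #define cudaCheck(call)                                                   \
        do {                                                                  \
            cudaError_t err__ = (call);                                       \
            if (err__ != cudaSuccess) {                                       \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",                  \
                        cudaGetErrorString(err__), __FILE__, __LINE__);       \
                exit(EXIT_FAILURE);                                           \
            }                                                                 \
        } while (0)

    // Example usage around the launches in main():
    //   cudaCheck(cudaMemcpy(d_data, data, 8*sizeof(unsigned int), cudaMemcpyHostToDevice));
    //   gpu_radixkernel<<<1,1024>>>(d_data, i, 8, d_result);
    //   cudaCheck(cudaGetLastError());        // catches launch/configuration errors
    //   cudaCheck(cudaDeviceSynchronize());   // catches errors during kernel execution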
  • 由於基數選擇不是一種排序,我試圖寫一個設備代碼,不會輸入數據的數據移動,因爲我認爲這是昂貴的和不必要的。我在內核開始時從全局內存中讀取數據,然後我全部使用共享內存,甚至在共享內存中,我也不重新安排數據(就像我在主機版本中那樣) ),以避免數據移動的成本。相反,我保持1和0分區的標誌數組,以供給下一個分區步驟。數據移動會涉及相當數量的未合併和/或銀行衝突的流量,而標誌陣列允許所有訪問都是非銀行衝突的。