0
我一直在努力開發一個使用CUDA的基數選擇,它利用k最小的元素對給定數量的元素進行排序。這個基數選擇背後的主要思想是從MSB到LSB開始掃描32位整數。它將左側的所有0位和右側的所有1位分區。遞歸地求解包含k個最小元素的一側。我的分區過程工作得很好,但我遇到遞歸函數調用的問題。我無法停止遞歸。請幫助我! 我的內核函數是這樣的:這是kernel.h當radix select using cuda
#include "header.h"
#define WARP_SIZE 32
#define BLOCK_SIZE 32
__device__ int Partition(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
int threadID = threadIdx.x + BLOCK_SIZE * blockIdx.x;
int WarpID = threadID >> 5;
int LocWarpID = threadID - 32 * WarpID;
int NumWarps = N/WARP_SIZE;
int pivot;
__shared__ int DataPartition[BLOCK_SIZE];
__shared__ int DataBinary[WARP_SIZE];
for(int i = 0; i < NumWarps; i++)
{
if(LocWarpID >= firstidx && LocWarpID <=lastidx)
{
int r = d_DataIn[i * WARP_SIZE + LocWarpID];
int p = (r>>(31-bit))&1;
unsigned int B = __ballot(p);
unsigned int B_flip = ~B;
if(p==1)
{
int b = B << (32-LocWarpID);
int RightLoc = __popc(b);
DataPartition[lastidx - RightLoc] = r;
}
else
{
int b_flip = B_flip << (32 - LocWarpID);
int LeftLoc = __popc(b_flip);
DataPartition[LeftLoc] = r;
}
if(LocWarpID <= lastidx - __popc(B))
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
else
{
d_DataIn[LocWarpID] = DataPartition[LocWarpID];
}
pivot = lastidx - __popc(B);
return pivot+1;
}
}
}
__device__ int RadixSelect(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
if(firstidx == lastidx)
return *d_DataIn;
int q = Partition(d_DataIn, firstidx, lastidx, k, N, bit);
int length = q - firstidx;
if(k == length)
return *d_DataIn;
else if(k < length)
return RadixSelect(d_DataIn, firstidx, q-1, k, N, bit+1);
else
return RadixSelect(d_DataIn, q, lastidx, k-length, N, bit+1);
}
__global__ void radix(int *d_DataIn, int firstidx, int lastidx, int k, int N, int bit)
{
RadixSelect(d_DataIn, firstidx, lastidx, k, N, bit);
}
主機代碼main.cu,它看起來像:頭
#include "header.h"
#include <iostream>
#include <fstream>
#include "kernel.h"
#define BLOCK_SIZE 32
using namespace std;
int main()
{
int N = 32;
thrust::host_vector<float>h_HostFloat(N);
thrust::counting_iterator <unsigned int> Numbers(0);
thrust::transform(Numbers, Numbers + N, h_HostFloat.begin(), RandomFloatNumbers(1.f, 100.f));
thrust::host_vector<int>h_HostInt(N);
thrust::transform(h_HostFloat.begin(), h_HostFloat.end(), h_HostInt.begin(), FloatToInt());
thrust::device_vector<float>d_DeviceFloat = h_HostFloat;
thrust::device_vector<int>d_DeviceInt(N);
thrust::transform(d_DeviceFloat.begin(), d_DeviceFloat.end(), d_DeviceInt.begin(), FloatToInt());
int *d_DataIn = thrust::raw_pointer_cast(d_DeviceInt.data());
int *h_DataOut;
float *h_DataOut1;
int fsize = N * sizeof(float);
int size = N * sizeof(int);
h_DataOut = new int[size];
h_DataOut1 = new float[fsize];
int firstidx = 0;
int lastidx = BLOCK_SIZE-1;
int k = 20;
int bit = 1;
int NUM_BLOCKS = N/BLOCK_SIZE;
radix <<< NUM_BLOCKS, BLOCK_SIZE >>> (d_DataIn, firstidx, lastidx, k, N, bit);
cudaMemcpy(h_DataOut, d_DataIn, size, cudaMemcpyDeviceToHost);
WriteData(h_DataOut1, h_DataOut, 10, N);
return 0;
}
名單,我用:
#include "cuda.h"
#include "cuda_runtime_api.h"
#include "device_launch_parameters.h"
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/generate.h>
#include "functor.h"
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
將浮點數轉換爲int類型並生成隨機浮點數的另一個頭文件「functor.h」。
#include <thrust/random.h>
#include <sstream>
#include <fstream>
#include <iomanip>
struct RandomFloatNumbers
{
float a, b;
__host__ __device__
RandomFloatNumbers(float _a, float _b) : a(_a), b(_b) {};
__host__ __device__
float operator() (const unsigned int n) const{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> dist(a,b);
rng.discard(n);
return dist(rng);
}
};
struct FloatToInt
{
__host__ __device__
int operator() (const float &x)
const {
union {
float f_value;
int i_value;
} value;
value.f_value = x;
return value.i_value;
}
};
float IntToFloat(int &x)
{
union{
float f_value;
int i_value;
}value;
value.i_value = x;
return value.f_value;
}
bool WriteData(float *h_DataOut1, int *h_DataOut, int bit, int N)
{
std::ofstream data;
std::stringstream file;
file << "out\\Partition_";
file << std::setfill('0') <<std::setw(2) << bit;
file << ".txt";
data.open((file.str()).c_str());
if(data.is_open() == false)
{
std::cout << "File is not open" << std::endl;
return false;
}
for(int i = 0; i < N; i++)
{
h_DataOut1[i] = IntToFloat(h_DataOut[i]);
//cout << h_HostFloat[i] << " \t" << h_DataOut1[i] << endl;
//std::bitset<32>bitshift(h_DataOut[i]&1<<31-bit);
//data << bitshift[31-bit] << "\t" <<h_DataOut1[i] <<std::endl;
data << h_DataOut1[i] << std::endl;
}
data << std::endl;
data.close();
std::cout << "Partition=" <<bit <<"\n";
return true;
}
您能否提供完整應用程序的代碼?包含主機代碼,'main'等,這樣我就可以複製,粘貼和編譯並運行代碼,而無需添加任何內容或更改任何內容。 –
嗨!我已將上面的全部代碼都包含在內了。請幫我弄清楚我哪裏出錯了。 – STam
你的'functor.h'不能正確編譯。它缺少返回語句和結束大括號。我不確定還有什麼遺漏。 –