你在正確的軌道與thrust :: copy_if。只需分配兩個與第一個相同大小的緩衝區。然後copy_if> 7.0f到第一個,copy_if < = 7.0f到第二個。只要知道有空間,分配與原始緩衝區大小相同的緩衝區就可以了,而一百萬個浮點數只佔用4MB。
編輯:
我做的copy_if
和stable_partition
方法的性能比較。在我的卡上,對於0.1f
,0.5f
和0.9f
的「拆分」值,GTX660,stable_partition
花費了大約150%長達copy_if
。我添加了測試以確保兩個方法都是穩定的(保持值的順序)。
#include <cuda.h>
#include <curand.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <iostream>
#include <cassert>
#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
return EXIT_FAILURE;}} while(0)
#define SPLIT 0.1f
struct is_low
{
__host__ __device__ bool operator()(const float x)
{
return x <= SPLIT;
}
};
struct is_high
{
__host__ __device__ bool operator()(const float x)
{
return x > SPLIT;
}
};
class EventTimer {
public:
EventTimer() : mStarted(false), mStopped(false) {
cudaEventCreate(&mStart);
cudaEventCreate(&mStop);
}
~EventTimer() {
cudaEventDestroy(mStart);
cudaEventDestroy(mStop);
}
void start(cudaStream_t s = 0) {
cudaEventRecord(mStart, s);
mStarted = true;
mStopped = false;
}
void stop(cudaStream_t s = 0) {
assert(mStarted);
cudaEventRecord(mStop, s);
mStarted = false;
mStopped = true;
}
float elapsed() {
assert(mStopped);
if (!mStopped) return 0;
cudaEventSynchronize(mStop);
float elapsed = 0;
cudaEventElapsedTime(&elapsed, mStart, mStop);
return elapsed;
}
private:
bool mStarted, mStopped;
cudaEvent_t mStart, mStop;
};
int main(int argc, char *argv[])
{
const size_t n = 1024 * 1024 * 50;
// Create prng
curandGenerator_t gen;
CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
// Set seed
CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
// Generate n floats on device
thrust::device_vector<float> vec_rnd_d(n);
float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data());
CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n));
thrust::device_vector<float> vec_low_d(n);
thrust::device_vector<float> vec_high_d(n);
for (int i = 0; i < 5; ++i) {
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_end;
iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low());
thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high());
timer.stop();
std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_low_h = vec_low_d;
thrust::host_vector<float> vec_high_h = vec_high_d;
thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin();
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
for (int i = 0; i < 5; ++i) {
thrust::device_vector<float> vec_rnd_copy = vec_rnd_d;
EventTimer timer;
timer.start();
thrust::device_vector<float>::iterator iter_split =
thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low());
timer.stop();
size_t n_low = iter_split - vec_rnd_copy.begin();
std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl;
// check result
thrust::host_vector<float> vec_rnd_h = vec_rnd_d;
thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy;
thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin();
thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low;
for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin();
rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) {
if (*rnd_iter_h <= SPLIT) {
assert(*low_iter_h == *rnd_iter_h);
++low_iter_h;
}
else {
assert(*high_iter_h == *rnd_iter_h);
++high_iter_h;
}
}
}
CHECK_CURAND_CALL(curandDestroyGenerator(gen));
return EXIT_SUCCESS;
}
輸出:
C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe
copy_if: 40.2919ms
copy_if: 38.0157ms
copy_if: 38.5036ms
copy_if: 37.6751ms
copy_if: 38.1054ms
stable_partition: 59.5473ms
stable_partition: 61.4016ms
stable_partition: 59.1854ms
stable_partition: 61.3195ms
stable_partition: 59.1205ms
哦,哇!似乎兩個'copy_if'操作比一個'stable_partition'快!你能猜到它是爲什麼嗎?感謝代碼,如果它也適用於複雜的數據類型,我會稍後再看。 – fheshwfq
我認爲這是[空間/時間折衷](http://en.wikipedia.org/wiki/Space%E2%80%93time_tradeoff)。 'copy_if'需要兩個與源緩衝區大小相同的額外緩衝區。 'stable_partition'有一個限制,它必須在單個緩衝區內工作,或者它必須在後臺創建緩衝區並將結果複製回源緩衝區。 –