推力：有選擇地將元素移動到另一個向量

我試圖找出使用推力做下列事情的最佳方法：向量A有一百萬浮點數，它們有一些特定的順序。我想移動到矢量B的每一個元素X在一種用於其中X> 7.0 使得元素的順序是保持在這兩個矢量A和B.重要的是，只有元件的一小部分需要是移動。效率對我的代碼比優雅更重要。推力：有選擇地將元素移動到另一個向量

我的想法是從A到B使用thrust::copy_if，然後在A上使用thrust::remove_if。但是我不知道要拷貝的元素的確切數量，並且因爲顯然B的存儲器必須事先分配，所以另一個計數操作是必要的。跳過計數操作的優雅方法是爲矢量B預先分配「足夠」的內存。

使用thrust::remove_copy_if有很多相同的問題：您需要事先爲B分配內存，並且它實際上並不實際從A中刪除任何東西，因此無論如何還需要另一個thrust::remove_if。

我的另一個想法是使用thrust::stable_sort與一些定製的比較函數，將我想要的所有元素推送到A的末尾，然後以某種方式找出它們有多少，並且它們對於B也是這樣。看起來相當不雅...

來源

2014-03-13 fheshwfq

你在正確的軌道與thrust :: copy_if。只需分配兩個與第一個相同大小的緩衝區。然後copy_if> 7.0f到第一個，copy_if < = 7.0f到第二個。只要知道有空間，分配與原始緩衝區大小相同的緩衝區就可以了，而一百萬個浮點數只佔用4MB。

編輯：

我做的copy_if和stable_partition方法的性能比較。在我的卡上，對於0.1f,0.5f和0.9f的「拆分」值，GTX660,stable_partition花費了大約150％長達copy_if。我添加了測試以確保兩個方法都是穩定的（保持值的順序）。

#include <cuda.h> 
#include <curand.h> 

#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 
#include <thrust/copy.h> 
#include <thrust/partition.h> 

#include <iostream> 
#include <cassert> 

#define CHECK_CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ 
    printf("Error at %s:%d\n",__FILE__,__LINE__);\ 
    return EXIT_FAILURE;}} while(0) 


#define CHECK_CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ 
    printf("Error at %s:%d\n",__FILE__,__LINE__);\ 
    return EXIT_FAILURE;}} while(0) 


#define SPLIT 0.1f 

struct is_low 
{ 
    __host__ __device__ bool operator()(const float x) 
    { 
    return x <= SPLIT; 
    } 
}; 


struct is_high 
{ 
    __host__ __device__ bool operator()(const float x) 
    { 
    return x > SPLIT; 
    } 
}; 


class EventTimer { 
public: 
    EventTimer() : mStarted(false), mStopped(false) { 
    cudaEventCreate(&mStart); 
    cudaEventCreate(&mStop); 
    } 
    ~EventTimer() { 
    cudaEventDestroy(mStart); 
    cudaEventDestroy(mStop); 
    } 
    void start(cudaStream_t s = 0) { 
    cudaEventRecord(mStart, s); 
    mStarted = true; 
    mStopped = false; 
    } 
    void stop(cudaStream_t s = 0) { 
    assert(mStarted); 
    cudaEventRecord(mStop, s); 
    mStarted = false; 
    mStopped = true; 
    } 
    float elapsed() { 
    assert(mStopped); 
    if (!mStopped) return 0; 
    cudaEventSynchronize(mStop); 
    float elapsed = 0; 
    cudaEventElapsedTime(&elapsed, mStart, mStop); 
    return elapsed; 
    } 

private: 
    bool mStarted, mStopped; 
    cudaEvent_t mStart, mStop; 
}; 


int main(int argc, char *argv[]) 
{ 
    const size_t n = 1024 * 1024 * 50; 

    // Create prng 
    curandGenerator_t gen; 
    CHECK_CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT)); 

    // Set seed 
    CHECK_CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL)); 

    // Generate n floats on device 
    thrust::device_vector<float> vec_rnd_d(n); 
    float* ptr_rnd_d = thrust::raw_pointer_cast(vec_rnd_d.data()); 
    CHECK_CURAND_CALL(curandGenerateUniform(gen, ptr_rnd_d, n)); 

    thrust::device_vector<float> vec_low_d(n); 
    thrust::device_vector<float> vec_high_d(n); 

    for (int i = 0; i < 5; ++i) { 
     EventTimer timer; 
     timer.start(); 
     thrust::device_vector<float>::iterator iter_end; 
     iter_end = thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_low_d.begin(), is_low()); 
     thrust::copy_if(vec_rnd_d.begin(), vec_rnd_d.end(), vec_high_d.begin(), is_high()); 
     timer.stop(); 
     std::cout << "copy_if: " << timer.elapsed() << "ms" << std::endl; 

     // check result 
     thrust::host_vector<float> vec_rnd_h = vec_rnd_d; 
     thrust::host_vector<float> vec_low_h = vec_low_d; 
     thrust::host_vector<float> vec_high_h = vec_high_d; 
     thrust::host_vector<float>::iterator low_iter_h = vec_low_h.begin(); 
     thrust::host_vector<float>::iterator high_iter_h = vec_high_h.begin(); 
     for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin(); 
      rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) { 
     if (*rnd_iter_h <= SPLIT) { 
      assert(*low_iter_h == *rnd_iter_h); 
      ++low_iter_h; 
     } 
     else { 
      assert(*high_iter_h == *rnd_iter_h); 
      ++high_iter_h; 
     } 
     } 
    } 

    for (int i = 0; i < 5; ++i) { 
     thrust::device_vector<float> vec_rnd_copy = vec_rnd_d; 
     EventTimer timer; 
     timer.start(); 
     thrust::device_vector<float>::iterator iter_split = 
     thrust::stable_partition(vec_rnd_copy.begin(), vec_rnd_copy.end(), is_low()); 
     timer.stop(); 
     size_t n_low = iter_split - vec_rnd_copy.begin(); 
     std::cout << "stable_partition: " << timer.elapsed() << "ms" << std::endl; 

     // check result 
     thrust::host_vector<float> vec_rnd_h = vec_rnd_d; 
     thrust::host_vector<float> vec_partitioned_h = vec_rnd_copy; 
     thrust::host_vector<float>::iterator low_iter_h = vec_partitioned_h.begin(); 
     thrust::host_vector<float>::iterator high_iter_h = vec_partitioned_h.begin() + n_low; 
     for (thrust::host_vector<float>::iterator rnd_iter_h = vec_rnd_h.begin(); 
      rnd_iter_h != vec_rnd_h.end(); ++rnd_iter_h) { 
     if (*rnd_iter_h <= SPLIT) { 
      assert(*low_iter_h == *rnd_iter_h); 
      ++low_iter_h; 
     } 
     else { 
      assert(*high_iter_h == *rnd_iter_h); 
      ++high_iter_h; 
     } 
     } 
    } 

    CHECK_CURAND_CALL(curandDestroyGenerator(gen)); 

    return EXIT_SUCCESS; 
}

輸出：

C:\rd\projects\cpp\test_cuda\Release>test_cuda.exe 
copy_if: 40.2919ms 
copy_if: 38.0157ms 
copy_if: 38.5036ms 
copy_if: 37.6751ms 
copy_if: 38.1054ms 
stable_partition: 59.5473ms 
stable_partition: 61.4016ms 
stable_partition: 59.1854ms 
stable_partition: 61.3195ms 
stable_partition: 59.1205ms

來源

2014-03-13 21:20:13

哦，哇！似乎兩個'copy_if'操作比一個'stable_partition'快！你能猜到它是爲什麼嗎？感謝代碼，如果它也適用於複雜的數據類型，我會稍後再看。 – fheshwfq

我認爲這是[空間/時間折衷]（http://en.wikipedia.org/wiki/Space%E2%80%93time_tradeoff）。 'copy_if'需要兩個與源緩衝區大小相同的額外緩衝區。 'stable_partition'有一個限制，它必須在單個緩衝區內工作，或者它必須在後臺創建緩衝區並將結果複製回源緩衝區。 –

要回答我的問題，我終於找到thrust::stable_partition，這是比所有更高效，更優雅「copy_if」 -alternatives。它只是將所有未滿足謂詞的元素移動到數組的末尾，並返回第二個序列的開始。指針算術給出了B的大小，但實際上它不再是必要的：

thrust::device_vector<float>::iterator iter = thrust::stable_partition(A.begin(), A.end(), pred) 
thrust::device_vector<float> B(iter, A.end()) 
A.erase(iter, A.end());

來源

2014-03-14 07:28:21 fheshwfq

酷！它絕對看起來更乾淨。我對性能和'copy_if'方法很好奇。 –

推力：有選擇地將元素移動到另一個向量

回答

相關問題