在CUDA中按鍵排序3個陣列（也許使用Thrust）

我有3相同大小的數組（大於300.000元素）。一組浮點數和兩個索引數組。所以，對於每個號碼我都有2 ID。所有的3數組已經在GPU全局內存中。我想相應地將所有數字與他們的ID進行排序。在CUDA中按鍵排序3個陣列（也許使用Thrust）

有什麼辦法可以使用Thrust庫來完成這個任務嗎？有沒有比Thrust庫更好的方法？

當然，我不喜歡將它們複製到主機內存和從主機內存複製幾次。順便說一句，他們是數組而不是矢量。

感謝您的幫助提前。

初步的解決方案，但這是極其緩慢。它需要近4秒和我的數組大小在300000

thrust::device_ptr<float> keys(afterSum); 
thrust::device_ptr<int> vals0(d_index); 
thrust::device_ptr<int> vals1(blockId); 

thrust::device_vector<int> sortedIndex(numElements); 
thrust::device_vector<int> sortedBlockId(numElements); 

thrust::counting_iterator<int> iter(0); 
thrust::device_vector<int> indices(numElements); 
thrust::copy(iter, iter + indices.size(), indices.begin()); 

thrust::sort_by_key(keys, keys + numElements , indices.begin());  

thrust::gather(indices.begin(), indices.end(), vals0, sortedIndex.begin()); 
thrust::gather(indices.begin(), indices.end(), vals1, sortedBlockId.begin()); 

thrust::host_vector<int> h_sortedIndex=sortedIndex; 
thrust::host_vector<int> h_sortedBlockId=sortedBlockId;

來源

2011-07-07 Kiarash

當然你也可以使用推力秩序。首先，您需要將原始CUDA設備指針與thrust::device_ptr一起打包。假設你的浮點值數組pkeys，且該ID是在陣列pvals0和pvals1，並且包含numElements爲數組的長度，這樣的事情應該工作：

#include <thrust/device_ptr.h> 
#include <thrust/sort.h> 
#include <thrust/gather.h> 
#include <thrust/iterator/counting_iterator.h> 

cudaEvent_t start, stop; 
cudaEventCreate(&start); 
cudaEventCreate(&stop); 

cudaEventRecord(start); 

thrust::device_ptr<float> keys(pkeys); 
thrust::device_ptr<int> vals0(pvals0); 
thrust::device_ptr<int> vals1(pvals1); 

// allocate space for the output 
thrust::device_vector<int> sortedVals0(numElements); 
thrust::device_vector<int> sortedVals1(numElements); 

// initialize indices vector to [0,1,2,..] 
thrust::counting_iterator<int> iter(0); 
thrust::device_vector<int> indices(numElements); 
thrust::copy(iter, iter + indices.size(), indices.begin()); 

// first sort the keys and indices by the keys 
thrust::sort_by_key(keys.begin(), keys.end(), indices.begin()); 

// Now reorder the ID arrays using the sorted indices 
thrust::gather(indices.begin(), indices.end(), vals0.begin(), sortedVals0.begin()); 
thrust::gather(indices.begin(), indices.end(), vals1.begin(), sortedVals1.begin()); 

cudaEventRecord(stop); 
cudaEventSynchronize(stop); 
float milliseconds = 0; 
cudaEventElapsedTime(&milliseconds, start, stop); 
printf("Took %f milliseconds for %d elements\n", milliseconds, numElements);

來源

2011-07-08 00:43:24 harrism

感謝哈里斯。我使用了幾乎精確的代碼。除了我更改了pkeys，pvals和numElements。我收到很多錯誤，我把它們放在問題部分。我試圖弄清楚。 – Kiarash

我發現如何解決這個問題，但現在它非常緩慢。我能做些什麼？ – Kiarash

我把工作代碼放在問題部分！ – Kiarash

我會用zip_iterator執行一個同時在兩個指示向量上進行sort_by_key。

這應該是這樣的：

typedef typename thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> IteratorTuple; 
    typedef typename thrust::zip_iterator<IteratorTuple> ZipIterator; 

    // here I suppose your 3 arrays are pointed to by device_ptr as suggested by @harrism 
    thrust::device_vector<float> key(pKey, pKey + numElements); 
    thrust::device_vector<int> val0(pVal0, pVal0 + numElements); 
    thrust::device_vector<int> val1(pVal1, pVal1 + numElements); 

    ZipIterator iterBegin(thrust::make_tuple(val0.begin(), val1.begin())); 
    thrust::sort_by_key(key.begin(), key.end(), iterBegin);

來源

2015-08-10 12:23:32 Tikoloche

我比較以上提出的，即這兩種方法，即使用thrust::zip_iterator和使用thrust::gather。按照海報的要求，在按鍵或三個數組排序兩個數組的情況下，我測試了它們。在所有這兩種情況下，使用thrust::gather的方法顯示出更快。

2 ARRAYS

#include <time.h>  // --- time 
#include <stdlib.h>  // --- srand, rand 

#include <thrust\host_vector.h> 
#include <thrust\device_vector.h> 
#include <thrust\sort.h> 
#include <thrust\iterator\zip_iterator.h> 

#include "TimingGPU.cuh" 

//#define VERBOSE 
//#define COMPACT 

int main() { 

    const int N = 1048576; 
    //const int N = 10; 

    TimingGPU timerGPU; 

    // --- Initialize random seed 
    srand(time(NULL)); 

    thrust::host_vector<int> h_code(N); 
    thrust::host_vector<double> h_x(N); 
    thrust::host_vector<double> h_y(N); 

    for (int k = 0; k < N; k++) {  
     // --- Generate random numbers between 0 and 9 
     h_code[k] = rand() % 10 + 1; 
     h_x[k] = ((double)rand()/(RAND_MAX)); 
     h_y[k] = ((double)rand()/(RAND_MAX)); 
    } 

    thrust::device_vector<int> d_code(h_code); 

    thrust::device_vector<double> d_x(h_x); 
    thrust::device_vector<double> d_y(h_y); 

#ifdef VERBOSE 
    printf("Before\n"); 
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]); 
#endif 

    timerGPU.StartCounter(); 
#ifdef COMPACT 
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin()))); 
#else 

    // --- Initialize indices vector to [0,1,2,..] 
    thrust::counting_iterator<int> iter(0); 
    thrust::device_vector<int> indices(N); 
    thrust::copy(iter, iter + indices.size(), indices.begin()); 

    // --- First, sort the keys and indices by the keys 
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin()); 

    // Now reorder the ID arrays using the sorted indices 
    thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin()); 
    thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin()); 
#endif 

    printf("Timing GPU = %f\n", timerGPU.GetCounter()); 

#ifdef VERBOSE 
    h_code = d_code; 
    h_x = d_x; 
    h_y = d_y; 

    printf("After\n"); 
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]); 
#endif 
}

3 ARRAYS

#include <time.h>  // --- time 
#include <stdlib.h>  // --- srand, rand 

#include <thrust\host_vector.h> 
#include <thrust\device_vector.h> 
#include <thrust\sort.h> 
#include <thrust\iterator\zip_iterator.h> 

#include "TimingGPU.cuh" 

//#define VERBOSE 
//#define COMPACT 

int main() { 

    const int N = 1048576; 
    //const int N = 10; 

    TimingGPU timerGPU; 

    // --- Initialize random seed 
    srand(time(NULL)); 

    thrust::host_vector<int> h_code(N); 
    thrust::host_vector<double> h_x(N); 
    thrust::host_vector<double> h_y(N); 
    thrust::host_vector<double> h_z(N); 

    for (int k = 0; k < N; k++) { 
     // --- Generate random numbers between 0 and 9 
     h_code[k] = rand() % 10 + 1; 
     h_x[k] = ((double)rand()/(RAND_MAX)); 
     h_y[k] = ((double)rand()/(RAND_MAX)); 
     h_z[k] = ((double)rand()/(RAND_MAX)); 
    } 

    thrust::device_vector<int> d_code(h_code); 

    thrust::device_vector<double> d_x(h_x); 
    thrust::device_vector<double> d_y(h_y); 
    thrust::device_vector<double> d_z(h_z); 

#ifdef VERBOSE 
    printf("Before\n"); 
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]); 
#endif 

    timerGPU.StartCounter(); 
#ifdef COMPACT 
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin(), d_z.begin()))); 
#else 

    // --- Initialize indices vector to [0,1,2,..] 
    thrust::counting_iterator<int> iter(0); 
    thrust::device_vector<int> indices(N); 
    thrust::copy(iter, iter + indices.size(), indices.begin()); 

    // --- First, sort the keys and indices by the keys 
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin()); 

    // Now reorder the ID arrays using the sorted indices 
    thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin()); 
    thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin()); 
    thrust::gather(indices.begin(), indices.end(), d_z.begin(), d_z.begin()); 
#endif 

    printf("Timing GPU = %f\n", timerGPU.GetCounter()); 

#ifdef VERBOSE 
    h_code = d_code; 
    h_x = d_x; 
    h_y = d_y; 

    printf("After\n"); 
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]); 
#endif 
}

定時在2陣列的情況下的情況下爲N = 1048576

爲例

zip_iterator = 7.34ms 
gather  = 4.27ms

在3陣列的情況下時序的NVIDIA GTX 960卡上執行N = 1048576

zip_iterator = 9.64ms 
gather  = 4.22ms

試驗。

來源

2017-02-27 11:30:49 JackOLantern

在CUDA中按鍵排序3個陣列（也許使用Thrust）

回答

相關問題