我比較了上面提出的兩種方法,即使用 thrust::zip_iterator 的方法和使用 thrust::gather 的方法。
按照提問者的要求,我分別在「按鍵排序兩個數組」和「按鍵排序三個數組」的情況下對它們進行了測試。
在這兩種情況下,使用 thrust::gather 的方法都更快。
2 個數組的情況(2 ARRAYS)
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include "TimingGPU.cuh"
//#define VERBOSE
//#define COMPACT
int main() {
const int N = 1048576;
//const int N = 10;
TimingGPU timerGPU;
// --- Initialize random seed
srand(time(NULL));
thrust::host_vector<int> h_code(N);
thrust::host_vector<double> h_x(N);
thrust::host_vector<double> h_y(N);
for (int k = 0; k < N; k++) {
// --- Generate random numbers between 0 and 9
h_code[k] = rand() % 10 + 1;
h_x[k] = ((double)rand()/(RAND_MAX));
h_y[k] = ((double)rand()/(RAND_MAX));
}
thrust::device_vector<int> d_code(h_code);
thrust::device_vector<double> d_x(h_x);
thrust::device_vector<double> d_y(h_y);
#ifdef VERBOSE
printf("Before\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
timerGPU.StartCounter();
#ifdef COMPACT
thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin())));
#else
// --- Initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(N);
thrust::copy(iter, iter + indices.size(), indices.begin());
// --- First, sort the keys and indices by the keys
thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
#ifdef VERBOSE
h_code = d_code;
h_x = d_x;
h_y = d_y;
printf("After\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}
3 個數組的情況(3 ARRAYS)
#include <time.h> // --- time
#include <stdlib.h> // --- srand, rand
#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>
#include "TimingGPU.cuh"
//#define VERBOSE
//#define COMPACT
int main() {
const int N = 1048576;
//const int N = 10;
TimingGPU timerGPU;
// --- Initialize random seed
srand(time(NULL));
thrust::host_vector<int> h_code(N);
thrust::host_vector<double> h_x(N);
thrust::host_vector<double> h_y(N);
thrust::host_vector<double> h_z(N);
for (int k = 0; k < N; k++) {
// --- Generate random numbers between 0 and 9
h_code[k] = rand() % 10 + 1;
h_x[k] = ((double)rand()/(RAND_MAX));
h_y[k] = ((double)rand()/(RAND_MAX));
h_z[k] = ((double)rand()/(RAND_MAX));
}
thrust::device_vector<int> d_code(h_code);
thrust::device_vector<double> d_x(h_x);
thrust::device_vector<double> d_y(h_y);
thrust::device_vector<double> d_z(h_z);
#ifdef VERBOSE
printf("Before\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
timerGPU.StartCounter();
#ifdef COMPACT
thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin(), d_z.begin())));
#else
// --- Initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(N);
thrust::copy(iter, iter + indices.size(), indices.begin());
// --- First, sort the keys and indices by the keys
thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());
// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
thrust::gather(indices.begin(), indices.end(), d_z.begin(), d_z.begin());
#endif
printf("Timing GPU = %f\n", timerGPU.GetCounter());
#ifdef VERBOSE
h_code = d_code;
h_x = d_x;
h_y = d_y;
printf("After\n");
for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}
N = 1048576 時,2 個數組情況下的計時結果:
zip_iterator = 7.34ms
gather = 4.27ms
N = 1048576 時,3 個數組情況下的計時結果:
zip_iterator = 9.64ms
gather = 4.22ms
以上測試均在 NVIDIA GTX 960 顯卡上執行。
感謝哈里斯。我使用了幾乎完全相同的代碼,只是更改了 pkeys、pvals 和 numElements。我遇到了很多錯誤,已把它們放在問題部分,目前正在試著弄清楚原因。 – Kiarash
我發現如何解決這個問題,但現在它非常緩慢。我能做些什麼? – Kiarash
我把工作代碼放在問題部分! – Kiarash