2011-07-27 71 views
2

我有一個存放 640 * 480 個數字的向量，希望找出該向量每一行的最小值和最大值。（標題：CUDA Thrust 性能）

// Visit the buffer one row at a time: R iterations, each covering C
// consecutive elements of m_valBuffer.
for(int i = 0; i < R; i++) 
    { 
     // Row i is the half-open range [Begin, End), starting at offset i*C.
     Begin = m_valBuffer.begin() + (i*C); 
     End = Begin+C; 

     // One minmax_element call per row; rMinmax is overwritten on every
     // iteration, so only the last row's result survives the loop.
     // NOTE(review): presumably thrust::minmax_element on a device_vector,
     // i.e. one separate reduction launch per row -- confirm against the
     // declarations, which are not shown here.
     rMinmax= minmax_element(Begin, End); 
    } 

但是這樣做非常慢，有沒有什麼辦法可以加快速度？

  • GPU上運行時的當前負載只有34%,所以必須有一種方法來改善它?
+0

所以，你想要做 640 次歸約（reduction），每次只有 480 個元素，而且數據都位於同一塊 pitched 線性內存中？ – talonmies

+0

我想對 640 個元素做 480 次 minmax_element。上面的循環在每次迭代時都會把 Begin 和 End 向後移動 640 個元素。 – Sharpie

+1

整個操作只需調用一次稍加修改的 NVIDIA SDK 共享內存歸約（reduction）內核即可完成。你**一定**要用 Thrust 來做嗎？ – talonmies

回答

7

This example 示範了如何使用 reduce_by_key 算法計算每一行的總和。您可以輕鬆地修改該示例，改為計算每行的最小值或最大值。若要同時計算每行的最小值和最大值，您需要使用 this strategy。具體來說，您需要在輸入數據上使用 transform_iterator，在套用 minmax_binary_op 歸約算子之前，先把每個值 x 轉換為元組 (x, x)。

這裏有一個完整的例子:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <iostream>
#include <iomanip>

// Plain aggregate carrying the running extrema of a reduction:
// the smallest and the largest value encountered so far.
// It deliberately has no constructors or default member initializers,
// so it stays an aggregate and can be brace-initialized.
template <class T>
struct minmax_pair
{
    T min_val;  // smallest value seen so far
    T max_val;  // largest value seen so far
};

// Unary functor that lifts a single value into a minmax_pair.
// A lone value x is both the minimum and the maximum of the one-element
// set {x}, so both fields of the returned pair are set to x. This is the
// per-element "seed" fed into a minmax_binary_op reduction.
template <class T>
struct minmax_unary_op
    : public thrust::unary_function< T, minmax_pair<T> >
{
    // x: the value to wrap.
    // Returns a pair whose min_val and max_val both equal x.
    __host__ __device__
    minmax_pair<T> operator()(const T& x) const
    {
        minmax_pair<T> seed = { x, x };
        return seed;
    }
};

// Binary reduction functor over minmax_pair values.
// Combining two partial results keeps the smaller of the two minimums
// and the larger of the two maximums, so the returned pair describes the
// extrema of the union of both inputs.
template <class T>
struct minmax_binary_op
    : public thrust::binary_function< minmax_pair<T>,
                                      minmax_pair<T>,
                                      minmax_pair<T> >
{
    // x, y: partial (min, max) results to merge.
    // Returns the merged (min, max) pair.
    __host__ __device__
    minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
    {
        minmax_pair<T> merged = {
            thrust::min(x.min_val, y.min_val),
            thrust::max(x.max_val, y.max_val)
        };
        return merged;
    }
};

// Functor that converts a linear element index into the index of the row
// containing it: with C columns per row, element i lies in row i / C.
// For integral T the division truncates, so every index in
// [r*C, (r+1)*C) maps to the same row r.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns; used as the divisor, so it must be non-zero

    // C: number of columns per row.
    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    // i: linear index of an element.
    // Returns the index of the row that element belongs to.
    // const-qualified because the call only reads C; this matches the other
    // functors in this file and lets the functor be invoked through a const
    // object or reference.
    __host__ __device__
    T operator()(T i) const
    {
        return i / C;
    }
};

int main(void) 
{ 
    int R = 5;  // number of rows 
    int C = 8;  // number of columns 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(0, 99); 

    // initialize data 
    thrust::device_vector<int> array(R * C); 
    for (size_t i = 0; i < array.size(); i++) 
     array[i] = dist(rng); 

    // allocate storage for per-row results and indices 
    thrust::device_vector< minmax_pair<int> > row_results(R); 
    thrust::device_vector< int    > row_indices(R); 

    // compute row sums by summing values with equal row indices 
    thrust::reduce_by_key 
     (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)), 
     thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C), 
     thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()), 
     row_indices.begin(), 
     row_results.begin(), 
     thrust::equal_to<int>(), 
     minmax_binary_op<int>()); 

    // print data 
    for(int i = 0; i < R; i++) 
    { 
     minmax_pair<int> result = row_results[i]; 
     std::cout << "["; 
     for(int j = 0; j < C; j++) 
      std::cout << std::setw(3) << array[i * C + j] << " "; 
     std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n"; 
    } 

    return 0; 
} 

輸出示例:

[ 0 8 60 89 96 18 51 39 ] = (0,96) 
[ 26 74 8 56 58 80 59 51 ] = (8,80) 
[ 87 99 72 96 29 42 89 65 ] = (29,99) 
[ 90 96 16 85 90 29 93 41 ] = (16,96) 
[ 30 51 39 78 68 54 59 9 ] = (9,78) 
+0

我不太清楚用這種方法要如何既獨立處理每一行、又同時處理所有行。你能提供更多細節嗎？ – harrism

+0

我添加了一些代碼來顯示我的想法。我所做的只是採用'sum_rows'示例,並將values參數(即'array.begin()')更改爲'transform_iterator',將'x'擴展爲'minmax_pair'(實質上是一個元組)。 – wnbell