CUDA Thrust 性能

我有一個包含 640 * 480 個數字的向量，我希望找到該向量每一行的最小值和最大值。CUDA Thrust 性能

// For each of the R iterations, select the C consecutive elements of
// m_valBuffer that start at offset i*C and find their extrema.
// NOTE(review): R and C are presumably the row and column counts, and
// minmax_element is presumably thrust::minmax_element -- the declarations are
// not part of this snippet, so confirm against the surrounding code.
for(int i = 0; i < R; i++) 
    { 
     // [Begin, End) spans the C elements of the i-th chunk.
     Begin = m_valBuffer.begin() + (i*C); 
     End = Begin+C; 

     // One separate minmax_element call is issued per chunk. rMinmax is
     // overwritten on every iteration, so only the last chunk's result
     // is still held once the loop finishes.
     rMinmax= minmax_element(Begin, End); 
    }

但是，這是非常緩慢的，有沒有什麼辦法可以加快這一點？

GPU上運行時的當前負載只有34％，所以必須有一種方法來改善它？

來源

2011-07-27 Sharpie

所以，你想在同一塊 pitch 線性內存上做 640 次歸約（reduction），每次只有 480 個元素？ – talonmies

我想對640個元素做480次minmax_element。上面的循環每次迭代移動開始和結束640個元素。 – Sharpie

整個操作可以透過單次調用 NVIDIA SDK 中基本的共享內存歸約（reduction）內核的一個稍加修改的版本來完成。你絕對**必須**用 Thrust 來做這件事嗎？ – talonmies

This example 示範了如何使用 reduce_by_key 算法計算每一行的總和。您可以輕鬆地調整該示例以計算每行的最小值或最大值。要同時計算每行的最小值和最大值，您需要使用 this strategy。具體來說，您需要在輸入數據上使用 transform_iterator，在應用 minmax_binary_op 歸約算子之前先將每個值 x 轉換爲元組 (x,x)。

這裏有一個完整的例子：

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/generate.h> 
#include <thrust/transform_reduce.h> 
#include <thrust/functional.h> 
#include <thrust/extrema.h> 
#include <thrust/random.h> 
#include <iostream> 
#include <iomanip> 

// Plain aggregate carrying the running extrema of a sequence:
// the smallest and the largest value observed so far.
template <class T>
struct minmax_pair
{
    T min_val;  // smallest value seen so far
    T max_val;  // largest value seen so far
};

// minmax_unary_op lifts a single value x into a minmax_pair in
// which x is both the minimum and the maximum, i.e. the pair that
// describes a one-element sequence.
template <typename T>
struct minmax_unary_op
    : public thrust::unary_function< T, minmax_pair<T> >
{
    __host__ __device__
    minmax_pair<T> operator()(const T& x) const
    {
        // Aggregate-initialize both members (min_val, then max_val) from x.
        minmax_pair<T> seed = { x, x };
        return seed;
    }
};

// minmax_binary_op merges two minmax_pair values: the result's
// minimum is the smaller of the two input minimums and its maximum
// is the larger of the two input maximums.
template <typename T>
struct minmax_binary_op
    : public thrust::binary_function< minmax_pair<T>,
                                      minmax_pair<T>,
                                      minmax_pair<T> >
{
    __host__ __device__
    minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
    {
        // Members are initialized in declaration order: min_val, then max_val.
        minmax_pair<T> merged = { thrust::min(x.min_val, y.min_val),
                                  thrust::max(x.max_val, y.max_val) };
        return merged;
    }
};

// Functor that converts a linear (flattened) element index into the
// index of the row that contains it, given C elements per row.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns

    // Stores the number of columns used as the divisor in operator().
    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    // Returns i / C, the row index of linear index i.
    // For integral T, C must be non-zero (the division is not guarded).
    // const-qualified because the call does not modify the functor; this
    // also lets it be invoked through const objects and references.
    __host__ __device__
    T operator()(T i) const
    {
        return i / C;
    }
};

int main(void) 
{ 
    int R = 5;  // number of rows 
    int C = 8;  // number of columns 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(0, 99); 

    // initialize data 
    thrust::device_vector<int> array(R * C); 
    for (size_t i = 0; i < array.size(); i++) 
     array[i] = dist(rng); 

    // allocate storage for per-row results and indices 
    thrust::device_vector< minmax_pair<int> > row_results(R); 
    thrust::device_vector< int    > row_indices(R); 

    // compute row sums by summing values with equal row indices 
    thrust::reduce_by_key 
     (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)), 
     thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C), 
     thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()), 
     row_indices.begin(), 
     row_results.begin(), 
     thrust::equal_to<int>(), 
     minmax_binary_op<int>()); 

    // print data 
    for(int i = 0; i < R; i++) 
    { 
     minmax_pair<int> result = row_results[i]; 
     std::cout << "["; 
     for(int j = 0; j < C; j++) 
      std::cout << std::setw(3) << array[i * C + j] << " "; 
     std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n"; 
    } 

    return 0; 
}

輸出示例：

[ 0 8 60 89 96 18 51 39 ] = (0,96) 
[ 26 74 8 56 58 80 59 51 ] = (8,80) 
[ 87 99 72 96 29 42 89 65 ] = (29,99) 
[ 90 96 16 85 90 29 93 41 ] = (16,96) 
[ 30 51 39 78 68 54 59 9 ] = (9,78)

來源

2011-07-27 17:38:51 wnbell

我不太清楚用這種方法如何既獨立又同時地處理每一行。你能提供更多細節嗎？ – harrism

我添加了一些代碼來顯示我的想法。我所做的只是採用'sum_rows'示例，並將values參數（即'array.begin（）'）更改爲'transform_iterator'，將'x'擴展爲'minmax_pair'（實質上是一個元組）。 – wnbell

回答

相關問題