用CUDA減少矩陣列

我有一個矩陣，我想用CUDA並以最快的方式計算列方式的平均值（歸結爲總和），即返回一個包含平均值的行向量該矩陣中的每一列。用於計算單個列向量之和的和減少的實現看起來像這樣：用CUDA減少矩陣列

template<typename T> 
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) { 
    extern __shared__ T sdata[]; 

    size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 

    // load input into __shared__ memory 
    T x = 0.0; 
    if (tid < n) { 
     x = input[tid]; 
    } 
    sdata[threadIdx.x] = x; 
    __syncthreads(); 

    // contiguous range pattern 
    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) { 
     if(threadIdx.x < offset) { 
      // add a partial sum upstream to our own 
      sdata[threadIdx.x] += sdata[threadIdx.x + offset]; 
     } 
     // wait until all threads in the block have 
     // updated their partial sums 
     __syncthreads(); 
    } 

    // thread 0 writes the final result 
    if(threadIdx.x == 0) { 
     per_block_results[blockIdx.x] = sdata[0]; 
    } 
}

並且這被援引作爲：

int n = ... // vector size 
const int BLOCK_SIZE = 1024; 
int number_of_blocks = (n + BLOCK_SIZE - 1)/BLOCK_SIZE; 
double* per_block_results = NULL; 
cudaMalloc((void**) &per_block_results, sizeof(double)*(number_of_blocks + 1)); 
// launch one kernel to compute, per-block, a partial sum 
kernelSum<double> <<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(a, per_block_results, n); 
// launch a single block to compute the sum of the partial sums 
kernelSum<double> <<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks);

我可以概括這個內核任意數量的列的矩陣，但我受共享內存的限制。我的GPU具有計算能力3.5，因此它具有共享內存的48KB和最大塊大小1024即每塊的線程數。由於我對雙精度感興趣，因此我擁有共享內存的最大雙倍數48*1024/8= 6144。由於減少是每塊完成的，所以我可以有最多6144 (doubles in shared memory)/1024 (block size) = 6列，我可以同時計算總和減少量。減小塊大小將允許同時計算更多的列，例如， 6144 (doubles in shared memory)/512 (block size) = 12。

這個更復雜的策略是否會擊敗矩陣的每一列上的簡單CPU循環並調用總和減少。有沒有更好的方法來做到這一點？

來源

2014-01-29 Giovanni Azua

一個簡單的替代辦法是設置您的問題，因爲一個矩陣向量乘你矩陣和所有'1'的向量使用'cublas gemv（）'。 – JackOLantern

事實上，一個解決方案實際上是給予矩陣A來做的：A^T * 1，這會給出列的和。請詳細說明答案，我會接受。但在不利的情況下做這麼多的FLOP就像是浪費。它寫道：我會濫用GEMV內核，因爲實際上不需要乘法。 –

@talonmies回答了您的具體問題。確實，您正在加載虛擬'1'矢量並將矩陣'A'的元素與它們相乘，從而有點濫用'cublas gemv（）'。但是cuBLAS例程經過高度優化，評估你自己的實現是否比我們正在談論的「天真」cuBLAS更快或更快。也許，您可能有興趣對talonmie的解決方案進行比較，並在此處發佈結果... – JackOLantern

什麼阻止你做這樣的事情：

template<typename T> 
__global__ void kernelSum(const T* __restrict__ input, 
          T* __restrict__ per_block_results, 
          const size_t lda, const size_t n) 
{ 
    extern __shared__ T sdata[]; 

    // Accumulate per thread partial sum 
    T x = 0.0; 
    T * p = &input[blockIdx.x * lda]; 
    for(int i=threadIdx.x; i < n; i += blockDim.x) { 
     x += p[i]; 
    } 

    // load partial sum into __shared__ memory 
    sdata[threadIdx.x] = x; 
    __syncthreads(); 

    // contiguous range pattern 
    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) { 
     if(threadIdx.x < offset) { 
      // add a partial sum upstream to our own 
      sdata[threadIdx.x] += sdata[threadIdx.x + offset]; 
     } 
     // wait until all threads in the block have 
     // updated their partial sums 
     __syncthreads(); 
    } 

    // thread 0 writes the final result 
    if(threadIdx.x == 0) { 
     per_block_results[blockIdx.x] = sdata[0]; 
    } 
}

[標準免責聲明：寫在瀏覽器中，從來沒有編譯或測試，在風險自負]

即。對於共享內存減少，您只需要爲塊中的每個線程使用sdata中的一個條目。每個線程總計儘可能多的值以覆蓋整列輸入。然後沒有共享內存限制，您可以使用相同的塊大小對任何大小的列進行求和。

編輯：顯然，使用每線程部分和的想法是新的給你，所以這裏是一個完整的例子來研究：

#include <iostream> 

template<typename T> 
__global__ 
void kernelSum(const T* __restrict__ input, 
     const size_t lda, // pitch of input in words of sizeof(T) 
     T* __restrict__ per_block_results, 
       const size_t n) 
{ 
    extern __shared__ T sdata[]; 

    T x = 0.0; 
    const T * p = &input[blockIdx.x * lda]; 
    // Accumulate per thread partial sum 
    for(int i=threadIdx.x; i < n; i += blockDim.x) { 
     x += p[i]; 
    } 

    // load thread partial sum into shared memory 
    sdata[threadIdx.x] = x; 
    __syncthreads(); 

    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) { 
     if(threadIdx.x < offset) { 
      sdata[threadIdx.x] += sdata[threadIdx.x + offset]; 
     } 
     __syncthreads(); 
    } 

    // thread 0 writes the final result 
    if(threadIdx.x == 0) { 
     per_block_results[blockIdx.x] = sdata[0]; 
    } 
} 


int main(void) 
{ 
    const int m = 10000, n = 16; 
    float * a = new float[m*n]; 

    for(int i=0; i<(m*n); i++) { a[i] = (float)(i%10); } 

    float *a_; 
    size_t size_a = m * n * sizeof(float); 
    cudaMalloc((void **)&a_, size_a); 
    cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice); 

    float *b_; 
    size_t size_b = n * sizeof(float); 
    cudaMalloc((void **)&b_, size_b); 

    // select number of warps per block according to size of the 
    // colum and launch one block per column. Probably makes sense 
    // to have at least 4:1 column size to block size 
    dim3 blocksize(256); 
    dim3 gridsize(n); 
    size_t shmsize = sizeof(float) * (size_t)blocksize.x; 
    kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, b_, m, m); 

    float * b = new float[n]; 
    cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost); 

    for(int i=0; i<n; i++) { 
     std::cout << i << " " << b[i] << std::endl; 
    } 

    cudaDeviceReset(); 

    return 0; 
}

您應該相對於你的矩陣塊大小實驗以獲得最佳性能，但通常情況下，內核所做的每個線程的工作量越多，總體性能就越好（因爲共享內存減少相當昂貴）。您可以在this answer中看到一種針對類似內存帶寬約束問題的阻塞和網格大小啓發式方法。

來源

2014-01-29 14:12:15 talonmies

對不起，你可以提供一個示例內核調用，我不能看到它是如何工作的。 –

@GiovanniAzua：請看編輯中的代碼。根據你對這個memset內核的另一個回答，你似乎可能需要花一些時間來研究整個「每個線程的多個數據項」設計模式。這是一個非常強大的技術，可以改善像這樣的簡單的內存綁定操作的性能。 – talonmies

謝謝！很有幫助。 –

以替代已經talonmies提供的答案，我在這裏報告4列減少其他方法的基礎上，採用基於使用cublas<t>gemv()與1的一列CUDA推力和1他們3，作爲建議在我上面的評論中。

的CUDA推力方法是Reduce matrix rows with CUDA與由

thrust::make_permutation_iterator(d_matrix.begin(),     
     thrust::make_transform_iterator(thrust::make_counting_iterator(0), 
       (_1 % Nrows) * Ncols + _1/Nrows))

獲得的隱式換位的類似下面是全碼：

#include <cublas_v2.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/generate.h> 
#include <thrust/reduce.h> 
#include <thrust/functional.h> 
#include <thrust/random.h> 
#include <thrust/sequence.h> 

#include <stdio.h> 
#include <iostream> 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

using namespace thrust::placeholders; 

// --- Required for approach #2 
__device__ float *vals; 

/**************************************************************/ 
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */ 
/**************************************************************/ 
template <typename T> 
struct linear_index_to_row_index : public thrust::unary_function<T,T> { 

    T Ncols; // --- Number of columns 

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {} 

    __host__ __device__ T operator()(T i) { return i/Ncols; } 
}; 

/******************************************/ 
/* ROW_REDUCTION - NEEDED FOR APPROACH #2 */ 
/******************************************/ 
struct col_reduction { 

    const int Nrows; // --- Number of rows 
    const int Ncols; // --- Number of cols 

    col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {} 

    __device__ float operator()(float& x, int& y) { 
     float temp = 0.f; 
     for (int i = 0; i<Nrows; i++) { 
      temp += vals[y + (i*Ncols)]; 
     } 
     return temp; 
    } 
}; 

/**************************/ 
/* NEEDED FOR APPROACH #3 */ 
/**************************/ 
template<typename T> 
struct MulC: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ MulC(T c) : C(c) { } 
    __host__ __device__ T operator()(T x) { return x * C; } 
}; 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    const int Nrows = 5;  // --- Number of rows 
    const int Ncols = 8;  // --- Number of columns 

    // --- Random uniform integer distribution between 10 and 99 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(10, 99); 

    // --- Matrix allocation and initialization 
    thrust::device_vector<float> d_matrix(Nrows * Ncols); 
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng); 

    TimingGPU timerGPU; 

    /***************/ 
    /* APPROACH #1 */ 
    /***************/ 
    timerGPU.StartCounter(); 
    // --- Allocate space for row sums and indices 
    thrust::device_vector<float> d_col_sums(Ncols); 
    thrust::device_vector<int> d_col_indices(Ncols); 

    // --- Compute row sums by summing values with equal row indices 
    thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)), 
          thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols), 
          thrust::make_permutation_iterator(
           d_matrix.begin(), 
           thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)), 
          d_col_indices.begin(), 
          d_col_sums.begin(), 
          thrust::equal_to<int>(), 
          thrust::plus<float>()); 

    //thrust::reduce_by_key(
//    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)), 
//    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols), 
//    thrust::make_permutation_iterator(
    //    d_matrix.begin(), 
    //    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)), 
//    thrust::make_discard_iterator(), 
//    d_col_sums.begin()); 

    printf("Timing for approach #1 = %f\n", timerGPU.GetCounter()); 

    // --- Print result 
    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #2 */ 
    /***************/ 
    timerGPU.StartCounter(); 
    thrust::device_vector<float> d_col_sums_2(Ncols, 0); 
    float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]); 
    gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *))); 
    thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols)); 

    printf("Timing for approach #2 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_2[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #3 */ 
    /***************/ 

    timerGPU.StartCounter(); 
    thrust::device_vector<float> d_col_sums_3(Ncols, 0); 
    thrust::device_vector<float> d_temp(Nrows * Ncols); 
    thrust::inclusive_scan_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols), 
       thrust::make_permutation_iterator(
         d_matrix.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)), 
       d_temp.begin()); 
    thrust::copy(
       thrust::make_permutation_iterator(
         d_temp.begin() + Nrows - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))), 
       thrust::make_permutation_iterator(
         d_temp.begin() + Nrows - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols, 
       d_col_sums_3.begin()); 

    printf("Timing for approach #3 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_3[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #4 */ 
    /***************/ 
    cublasHandle_t handle; 

    timerGPU.StartCounter(); 
    cublasSafeCall(cublasCreate(&handle)); 

    thrust::device_vector<float> d_col_sums_4(Ncols); 
    thrust::device_vector<float> d_ones(Nrows, 1.f); 

    float alpha = 1.f; 
    float beta = 0.f; 
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1)); 

    printf("Timing for approach #4 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_4[j] << "\n"; 
    } 

    return 0; 
}

來源

2015-04-17 05:21:27 JackOLantern

用CUDA減少矩陣列

回答

相關問題