在CUDA中轉換矩陣的最有效方法是什麼？

我有一個M*N主機內存矩陣，並在複製到設備內存時，我需要它被轉置爲N*M矩陣。有沒有任何cuda（cuBLAS ...）API這樣做？我正在使用CUDA 4.謝謝！在CUDA中轉換矩陣的最有效方法是什麼？

2013-03-17 Hailiang Zhang

它是[文件]中的右（http://docs.nvidia.com/cuda/cublas/index.html#topic_9_1），如果你注意看.... – talonmies 2013-03-17 08:05:37

謝謝！但是，由於觀察到cublas迴歸，並且很長一段時間後提交nVidia錯誤報告後沒有迴應，我正在使用CUDA4而不是CUDA5。 – 2013-03-17 18:26:03

cublas<t>geam() 

This function performs the matrix-matrix addition/transposition 
the user can transpose matrix A by setting *alpha=1 and *beta=0.

（並指定transa操作者如對CUBLAS_OP_T轉置）

來源

2013-03-17 07:52:24

謝謝羅伯特！但是我意識到cubas geam僅適用於CUDA-5，這對我來說似乎不是一種選擇。不過謝謝！ – 2013-03-17 18:19:44

在很多情況下，不需要顯式轉換，因爲您可以簡單地使用大多數BLAS函數提供的轉置標誌。以下白皮書描述瞭如果必須將其作爲單獨的步驟完成轉換，請執行以下白皮書：http://docs.nvidia.com/cuda/samples/6_Advanced/transpose/doc/MatrixTranspose.pdf – njuffa 2013-03-17 20:25:09

另請參閱以前的問題： http://stackoverflow.com/questions/13782012/how-to-transpose-a-matrix-in-cuda-cublas?rq=1 – njuffa 2013-03-17 20:29:04

CULA具有輔助例程來計算轉置（culaDevice?geTranspose）。在矩形矩陣的情況下，您也可以使用就地換位（culaDevise?geTransposeInplace）。

注意：如果您符合特定條件，CULA有免費許可證。

來源

2013-05-02 06:57:48 Stefan

爲了回答您關於效率的問題，我比較了兩種方法來執行矩陣轉置，一種使用Thrust庫，另一種使用Robert Crovella建議的使用cublas<t>geam。比較的結果是一個開普勒K20C卡在以下方面：

| Matrix size | Thrust [ms] | cuBLAS [ms] | 
|    |    |    | 
| 32x32   | 0.015   | 0.016   | 
| 64x64   | 0.015   | 0.017   | 
| 128x128  | 0.019   | 0.017   | 
| 256x256  | 0.028   | 0.017   | 
| 512x512  | 0.088   | 0.042   | 
| 1024x1024  | 0.34   | 0.13   | 
| 2048x2048  | 1.24   | 0.48   | 
| 4096x4096  | 11.02   | 1.98   |

如可以看到的，使用cublas<t>geam推力勝過版本。以下是執行比較的代碼。

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/functional.h> 
#include <thrust/gather.h> 
#include <thrust/scan.h> 
#include <thrust/iterator/counting_iterator.h> 
#include <thrust/iterator/transform_iterator.h> 
#include <iostream> 
#include <iomanip> 
#include <cublas_v2.h> 
#include <conio.h> 
#include <assert.h> 

/**********************/ 
/* cuBLAS ERROR CHECK */ 
/**********************/ 
#ifndef cublasSafeCall 
#define cublasSafeCall(err)  __cublasSafeCall(err, __FILE__, __LINE__) 
#endif 

inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line) 
{ 
    if(CUBLAS_STATUS_SUCCESS != err) { 
     fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err); 
     getch(); cudaDeviceReset(); assert(0); 
    } 
} 

// convert a linear index to a linear index in the transpose 
struct transpose_index : public thrust::unary_function<size_t,size_t> 
{ 
    size_t m, n; 

    __host__ __device__ 
    transpose_index(size_t _m, size_t _n) : m(_m), n(_n) {} 

    __host__ __device__ 
    size_t operator()(size_t linear_index) 
    { 
     size_t i = linear_index/n; 
     size_t j = linear_index % n; 

     return m * j + i; 
    } 
}; 

// convert a linear index to a row index 
struct row_index : public thrust::unary_function<size_t,size_t> 
{ 
    size_t n; 

    __host__ __device__ 
    row_index(size_t _n) : n(_n) {} 

    __host__ __device__ 

    size_t operator()(size_t i) 
    { 
     return i/n; 
    } 
}; 

// transpose an M-by-N array 
template <typename T> 
void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device_vector<T>& dst) 
{ 
    thrust::counting_iterator<size_t> indices(0); 

    thrust::gather 
    (thrust::make_transform_iterator(indices, transpose_index(n, m)), 
    thrust::make_transform_iterator(indices, transpose_index(n, m)) + dst.size(), 
    src.begin(),dst.begin()); 
} 

// print an M-by-N array 
template <typename T> 
void print(size_t m, size_t n, thrust::device_vector<T>& d_data) 
{ 
    thrust::host_vector<T> h_data = d_data; 

    for(size_t i = 0; i < m; i++) 
    { 
     for(size_t j = 0; j < n; j++) 
      std::cout << std::setw(8) << h_data[i * n + j] << " "; 
      std::cout << "\n"; 
    } 
} 

int main(void) 
{ 
    size_t m = 5; // number of rows 
    size_t n = 4; // number of columns 

    // 2d array stored in row-major order [(0,0), (0,1), (0,2) ... ] 
    thrust::device_vector<double> data(m * n, 1.); 
    data[1] = 2.; 
    data[3] = 3.; 

    std::cout << "Initial array" << std::endl; 
    print(m, n, data); 

    std::cout << "Transpose array - Thrust" << std::endl; 
    thrust::device_vector<double> transposed_thrust(m * n); 
    transpose(m, n, data, transposed_thrust); 
    print(n, m, transposed_thrust); 

    std::cout << "Transpose array - cuBLAS" << std::endl; 
    thrust::device_vector<double> transposed_cuBLAS(m * n); 
    double* dv_ptr_in = thrust::raw_pointer_cast(data.data()); 
    double* dv_ptr_out = thrust::raw_pointer_cast(transposed_cuBLAS.data()); 
    double alpha = 1.; 
    double beta = 0.; 
    cublasHandle_t handle; 
    cublasSafeCall(cublasCreate(&handle)); 
    cublasSafeCall(cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, dv_ptr_in, n, &beta, dv_ptr_in, n, dv_ptr_out, m)); 
    print(n, m, transposed_cuBLAS); 

    getch(); 

    return 0; 
}

來源

2014-02-15 20:48:38 JackOLantern

+1非常具有啓發性。 – 2014-02-16 03:17:36

你可以嘗試解釋巨大差異的起源嗎？他們是由一個顯着不同的算法造成的？可以在推力版本中優化內存訪問嗎？ – 2015-07-01 09:14:26

在CUDA中轉換矩陣的最有效方法是什麼？

回答

相關問題