
Sparse matrix-vector multiplication in CUDA

I am trying to implement matrix-vector multiplication on the GPU (with CUDA). In my C++ (CPU) code I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication with CUDA. I also use shared memory to improve performance.

  1. How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?

Here is my C++ function that loads the matrix:

int readMatrix(char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR) 
{ 
    unsigned int w, h, x, y, num_entries; 

    float val; 

    std::ifstream file(filename); 

    if (file) 
    { 
     file >> h >> w >> num_entries; 
     std::cout << w << " " << h << " " << num_entries << "\n"; 

     assert(w == h || w == 1 || h == 1); 

     if(dim != NULL) *dim = std::max(w, h); 

     matrix = new float[ w * h ]();   // value-initialize so entries absent from the file stay 0 

     unsigned int i; 
     for(i = 0; i < num_entries; i++){ 

      if(file.eof()) break; 

      file >> y >> x >> val; 

      if(majority == ROW_MAJOR){ 

       matrix[ w * y + x ] = val; 

      } else if(majority == COLUMN_MAJOR){ 

       matrix[ h * x + y ] = val; 
      } 
     } 
     file.close(); 

     if(i == num_entries) 
      std::cout << "\nFile read successfully\n"; 
     else 
      std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n"; 

     // print first few elements 
     if(w == h){ 
      for(unsigned int i = 0; i < w; i++){ 

       printf("\n"); 
       for(unsigned int j = 0; j < h; j++){ 

        printf("%.2f ", matrix[ j + w * i ]); 
       } 
      } 
     } 
     else{ 

      printf("\n"); 
      for(unsigned int j = 0; j < h; j++){ 

       printf("%.2f ", matrix[ j ]); 
      } 
     } 

    } else { 

     std::cout << "Unable to open file\n"; 
     return false; 
    } 

    return true; 
} 

Here is my CUDA kernel function that handles the matrix-vector multiplication:

__global__ void 
_cl_matrix_vector_(float *A, float *b, float *x, int dim) 
{ 
    extern __shared__ float vec[]; 
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    float temp = 0.0; 
    int vOffs = 0; 

    //load vector into shared memory 
    for (int i = 0; i < (dim/blockDim.x) + 1 ; ++i, vOffs+= blockDim.x) { 
     if (vOffs + threadIdx.x < dim)   // guard the last, partially filled tile 
      vec[vOffs + threadIdx.x] = b[vOffs + threadIdx.x]; 
    } 

    //make sure all threads are synchronized 
    __syncthreads(); 

    if (idx < dim) { 
     temp = 0.0; 
     //dot product (multiplication) 
     for (int i = 0; i < dim; i++){ 
      temp += A[idx * dim + i] * vec[i]; 
     } 
     x[idx] = temp; 
    } 

} 
  • What changes do I have to make to my CUDA code to take into account that my matrix is a sparse matrix?
  • I found out from the forums that padding can also be used to optimise performance, but that would require changing the way I read / sort the matrix. Any ideas on how to implement that padding in the way I read the matrix and perform the calculation?

    The correct answer depends completely on the storage format of the sparse matrix. See http://www.nvidia.com/object/nvidia_research_pub_001.html, which discusses the merits of different sparse formats on GPUs. – talonmies 2011-05-12 09:38:59
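
    To make "storage format" concrete, here is a minimal CPU-side sketch (values chosen arbitrarily for illustration) of the CSR format that material discusses: the nonzeros live in three flat arrays, and a reference y = A*x loop only touches the stored entries.

    #include <cstdio>

    int main()
    {
        // 4x4 example matrix (only 6 nonzeros):
        // [ 1 0 0 2 ]
        // [ 0 3 0 0 ]
        // [ 0 0 4 0 ]
        // [ 5 0 0 6 ]
        const int n = 4, nnz = 6;
        float val[nnz]  = { 1, 2, 3, 4, 5, 6 };  // nonzero values, row by row
        int colIdx[nnz] = { 0, 3, 1, 2, 0, 3 };  // column of each stored value
        int rowPtr[n+1] = { 0, 2, 3, 4, 6 };     // row i owns val[rowPtr[i]..rowPtr[i+1])

        // Reference SpMV on the CPU: y = A * x
        float x[n] = { 1, 1, 1, 1 }, y[n];
        for (int i = 0; i < n; ++i) {
            float sum = 0.0f;
            for (int k = rowPtr[i]; k < rowPtr[i+1]; ++k)
                sum += val[k] * x[colIdx[k]];
            y[i] = sum;
        }
        for (int i = 0; i < n; ++i) printf("y[%d] = %f\n", i, y[i]);
        return 0;
    }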

    Answers


    You might want to have a look at the very nice CUSP library. It implements sparse matrices in a variety of formats (coo, csr, ellpack, diagonal, and a hybrid between ellpack and coo). Each has its own advantages, as described in the documentation. Most of them are "standard" sparse matrix formats about which you can find more information online. Perhaps not a complete answer to your question, but it should provide a starting point.
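
    As a rough sketch only (this assumes CUSP is installed and that the matrix is available as a Matrix Market file; the file name "A.mtx" is a placeholder), reading a sparse matrix and multiplying it by a dense vector with CUSP could look roughly like this:

    #include <cusp/csr_matrix.h>
    #include <cusp/array1d.h>
    #include <cusp/multiply.h>
    #include <cusp/io/matrix_market.h>
    #include <cusp/print.h>

    int main()
    {
        // Read a sparse matrix from a Matrix Market file into CSR storage on the device
        // ("A.mtx" is a made-up file name).
        cusp::csr_matrix<int, float, cusp::device_memory> A;
        cusp::io::read_matrix_market_file(A, "A.mtx");

        // Dense input vector of ones and an output vector of matching size.
        cusp::array1d<float, cusp::device_memory> x(A.num_cols, 1.0f);
        cusp::array1d<float, cusp::device_memory> y(A.num_rows, 0.0f);

        // y = A * x
        cusp::multiply(A, x, y);

        cusp::print(y);
        return 0;
    }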


    CUSP handles the whole thing; it even defines the conjugate gradient method. But I need an implementation in CUDA, and I need to do it by hand to understand how it works (not just call a ready-made function that takes care of it). Thanks for your answer anyway, Bart. – 2011-05-11 21:23:41


    @all_by_grace I am not saying you should use CUSP. I mean that it provides a nice general implementation you can use for inspiration. It uses CUDA, of course. Anyway, good luck with your work. ;) – Bart 2011-05-11 21:26:18
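
    For the "do it by hand" route, a common starting point is the scalar CSR kernel: one thread per row, so each thread walks only the nonzeros stored for its row instead of a full dense row. The sketch below is untested and the CSR array names (val, colIdx, rowPtr) are invented for illustration; it shows how the dense kernel in the question would change once A is stored in CSR.

    // Scalar CSR SpMV sketch: one thread per row, x = A * b.
    // val/colIdx/rowPtr are the CSR arrays of A; dim is the number of rows.
    __global__ void
    _cl_csr_matrix_vector_(const float *val, const int *colIdx, const int *rowPtr,
                           const float *b, float *x, int dim)
    {
        unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
        if (row < dim) {
            float temp = 0.0f;
            // Only the nonzeros of this row are stored, so the dot product
            // runs over rowPtr[row]..rowPtr[row+1] instead of 0..dim.
            for (int k = rowPtr[row]; k < rowPtr[row + 1]; ++k)
                temp += val[k] * b[colIdx[k]];
            x[row] = temp;
        }
    }

    Padded formats such as ELLPACK (what the "padding" in the question refers to) instead store a fixed number of entries per row, which gives more regular memory accesses at the cost of storing explicit zeros; the NVIDIA paper linked in the comments above compares these trade-offs.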


    This is a very old post, and I want to highlight that cuSPARSE (for some time now) provides routines for the multiplication between sparse matrices or between a sparse matrix and a dense vector.

    For the csr format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below is a fully worked example showing its use.

    #include <stdio.h> 
    #include <stdlib.h> 
    #include <iostream> 
    #include <assert.h> 
    
    #include "Utilities.cuh" 
    
    #include <cuda_runtime.h> 
    #include <cusparse_v2.h> 
    
    /********/ 
    /* MAIN */ 
    /********/ 
    int main() 
    { 
        // --- Initialize cuSPARSE 
        cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle)); 
    
        /**************************/ 
        /* SETTING UP THE PROBLEM */ 
        /**************************/ 
        const int N  = 4;    // --- Number of rows and columns 
    
        // --- Host side dense matrices 
        double *h_A_dense = (double*)malloc(N * N * sizeof(double)); 
        double *h_x_dense = (double*)malloc(N *  sizeof(double)); 
        double *h_y_dense = (double*)malloc(N *  sizeof(double)); 
    
        // --- Column-major ordering 
        h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006;  h_A_dense[8] = 0.3566;  h_A_dense[12] = 0.0; 
        h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640;  h_A_dense[9] = 0.0723;  h_A_dense[13] = 0.0; 
        h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723;  h_A_dense[10] = 0.7543;  h_A_dense[14] = 0.0; 
        h_A_dense[3] = 0.;  h_A_dense[7] = 0.0;   h_A_dense[11] = 0.0;  h_A_dense[15] = 0.1; 
    
        // --- Initializing the data and result vectors 
        for (int k = 0; k < N; k++) { 
         h_x_dense[k] = 1.; 
         h_y_dense[k] = 0.; 
        } 
    
        // --- Create device arrays and copy host arrays to them 
        double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double))); 
        double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N  * sizeof(double))); 
        double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N  * sizeof(double))); 
        gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice)); 
        gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N  * sizeof(double), cudaMemcpyHostToDevice)); 
        gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N  * sizeof(double), cudaMemcpyHostToDevice)); 
    
        // --- Descriptor for sparse matrix A 
        cusparseMatDescr_t descrA;  cusparseSafeCall(cusparseCreateMatDescr(&descrA)); 
        cusparseSafeCall(cusparseSetMatType  (descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); 
        cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE)); 
    
        int nnzA = 0;       // --- Number of nonzero elements in dense matrix A 
    
        const int lda = N;      // --- Leading dimension of dense matrix 
    
        // --- Device side number of nonzero elements per row of matrix A 
        int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA))); 
        cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA)); 
    
        // --- Host side number of nonzero elements per row of matrix A 
        int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA)); 
        gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost)); 
    
        printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA); 
        for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]); 
        printf("\n"); 
    
        // --- Device side sparse matrix 
        double *d_A;   gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A))); 
    
        int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices))); 
        int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices))); 
    
        cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices)); 
    
        // --- Host side sparse matrices 
        double *h_A = (double *)malloc(nnzA * sizeof(*h_A));   
        int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices)); 
        int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices)); 
        gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost)); 
        gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost)); 
        gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost)); 
    
        printf("\nOriginal matrix A in CSR format\n\n"); 
        for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]); printf("\n"); 
    
        printf("\n"); 
        for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n"); 
    
        printf("\n"); 
        for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]); 
    
        printf("\n"); 
        for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]); printf("\n"); 
    
        const double alpha = 1.; 
        const double beta = 0.; 
        cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense, 
                &beta, d_y_dense)); 
    
        gpuErrchk(cudaMemcpy(h_y_dense,   d_y_dense,   N * sizeof(double), cudaMemcpyDeviceToHost)); 
    
        printf("\nResult vector\n\n"); 
        for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]); printf("\n"); 
    
    }