使用cuSPARSE進行密集稀疏和稀疏到密集的轉換

以下程序使用cuSPARSE測試密集到稀疏轉換。它在前幾行輸出中產生垃圾。但是，如果我將標記爲(2)的行移動到標記爲(1)的行後面，那麼該程序可以正常工作。有人能告訴我可能是什麼原因嗎？使用cuSPARSE進行密集稀疏和稀疏到密集的轉換

編輯：爲了使演示更清楚，我重寫了程序與thrust，同樣的問題仍然存在。

編輯：作爲建議的羅伯特，我改回的版本不thrust和新增的API級別的錯誤校驗碼。

#include <iostream> 
#include <cusparse_v2.h> 

using std::cerr; 
using std::cout; 
using std::endl; 

#define WRAP(x) do {x} while (0) 
#define CHKcusparse(x) WRAP(          \ 
    cusparseStatus_t err = (x);          \ 
    if (err != CUSPARSE_STATUS_SUCCESS) {        \ 
    cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \ 
     << __LINE__ << " of " << __FILE__ << ": " << #x << endl; \ 
    exit(1);              \ 
    }                 \ 
) 
#define CHKcuda(x) WRAP(           \ 
    cudaError_t err = (x);            \ 
    if (err != cudaSuccess) {           \ 
    cerr << "Cuda Error #" << int(err) << ", \""      \ 
     << cudaGetErrorString(err) << "\" at Line " << __LINE__  \ 
     << " of " << __FILE__ << ": " << #x << endl;    \ 
    exit(1);               \ 
    }                 \ 
) 
#define ALLOC(X, T, N) do {       \ 
    h##X = (T*) malloc(sizeof(T) * (N));     \ 
    CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \ 
} while(0) 

int main() { 
    srand(100); 

    cusparseHandle_t g_cusparse_handle; 
    CHKcusparse(cusparseCreate(&g_cusparse_handle)); 

    const int n = 100, in_degree = 10; 
    int nnz = n * in_degree, nn = n * n; 

    int *dnnz, *dridx, *dcols; 
    int *hnnz, *hridx, *hcols; 
    float *dvals, *dmat; 
    float *hvals, *hmat; 

    // (1) The number of non-zeros in each column. 
    ALLOC(nnz, int, n); 

    // The dense matrix. 
    ALLOC(mat, float, nn); 

    // The values in sparse matrix. 
    ALLOC(vals, float, nnz); 

    // (2) The row indices of the sparse matrix. 
    ALLOC(ridx, int, nnz); 

    // The column offsets of the sparse matrix. 
    ALLOC(cols, int, n+1); 

    // Fill and copy dense matrix and number of non-zeros. 
    for (int i = 0; i < nn; i++) {hmat[i] = rand();} 
    for (int i = 0; i < n; i++) {hnnz[i] = in_degree;} 
    CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice)); 
    CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice)); 
    CHKcuda(cudaDeviceSynchronize()); 

    // Perform dense to CSC format 
    cusparseMatDescr_t cspMatDesc; 
    CHKcusparse(cusparseCreateMatDescr(&cspMatDesc)); 
    CHKcusparse(cusparseSdense2csc(
     g_cusparse_handle, n, n, cspMatDesc, dmat, n, 
     dnnz, dvals, dridx, dcols 
)); 

    // Copy row indices back. 
    CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost)); 
    CHKcuda(cudaDeviceSynchronize()); 
    CHKcusparse(cusparseDestroyMatDescr(cspMatDesc)); 

    // Display row indices. 
    for (int i = 0; i < n; i++) { 
    for (int j = 0; j < in_degree; j++) { 
     std::cout << hridx[i * in_degree + j] << ", "; 
    } 
    std::cout << std::endl; 
    } 

    CHKcuda(cudaFree(dnnz)); 
    CHKcuda(cudaFree(dvals)); 
    CHKcuda(cudaFree(dridx)); 
    CHKcuda(cudaFree(dcols)); 
    CHKcuda(cudaFree(dmat)); 
    free(hnnz); 
    free(hmat); 
    free(hvals); 
    free(hridx); 
    free(hcols); 
    return 0; 
}

來源

2014-01-13 shaoyl85

沒有錯誤檢查？在向其他人求助之前，您應該利用針對CUDA和cusparse API的基本API級錯誤檢查。你有cusparse函數返回錯誤，無論第1行或第2行是否定位。你聲明每列的nnz是10，但實際上你正在用每列超過10個非零元素初始化你的密集矩陣，這導致了密集到稀疏轉換爆炸。 Cusparse提供了預先計算每列nnz的函數。但在你的情況下，你可以簡單地通過將'in_degree'設置爲100而不是10來消除錯誤。 –

感謝您的提醒。我從一個大的代碼庫中提取它來提問。我已經測試過這些調用，他們都返回成功。至於密集矩陣，我打算用n來生成稠密矩陣n，然後將它轉化爲大小爲n的稀疏矩陣，每列有10個非零元素。如果這是設置，我的方式調用轉換功能是否正確？還是有什麼我明白錯誤的？ – shaoyl85

@RobertCrovella對不起，我明白你的觀點，你的意思是如果我的密集矩陣在每一列中有10個以上的非零元素，我不能稱nnz每列等於10的轉換？轉換不會自動選擇10個最不爲零的元素嗎？ – shaoyl85

基本問題是您將內部不一致的數據傳遞到dense-to-sparse routine。您正在傳遞一個密集的矩陣，每列有100個非零元素，但是您告訴說每個列只有10個非零元素。

如果您使用cuda-memcheck運行您的代碼，您將看到存在錯誤的錯誤。

對於這段代碼，你可以通過改變你的in_degree變量解決問題100

對於一般的情況下，cusparse提供a convenient routine正確填充每列非零元素的數量。

來源

2014-01-13 18:18:12

所以我認爲我真正的問題是如何有效地從每列中選擇具有最大絕對值的k個元素，並將其餘值設置爲零。你有什麼主意嗎？我認爲它歸結爲多個k選擇並行的問題。 – shaoyl85

唯一想到的就是在每一列上做一個[按鍵排序]（http://thrust.github.io/doc/group__sorting.html#ga2bb765aeef19f6a04ca8b8ba11efff24）（key = element絕對值，value = element索引），然後按值（元素索引）將前k個原始元素填充回零列。我建議你把它作爲一個新的SO問題來提出更好的想法。 –

正如Robert Crovella所強調的那樣，使用cuSPARSE和cusparse<t>nnz()和cusparse<t>dense2csr()例程可以有效地執行從密集到稀疏的傳遞。反之亦然可以通過cusparse<t>csr2dense()例程完成。下面是一個完整的例子，展示瞭如何在CSR格式中使用cuSPARSE從密集到稀疏，反之亦然。

cuSparseUtilities.cuh

#ifndef CUSPARSEUTILITIES_CUH 
#define CUSPARSEUTILITIES_CUH 

#include "cusparse_v2.h" 

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t); 
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
    const cusparseHandle_t handle, const int Nrows, const int Ncols); 

#endif

cuSparseUtilities.cu

#include "cuSparseUtilities.cuh" 
#include "Utilities.cuh" 

/*****************************/ 
/* SETUP DESCRIPTOR FUNCTION */ 
/*****************************/ 
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) { 
    cusparseSafeCall(cusparseCreateMatDescr(&descrA)); 
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType)); 
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase)); 
} 

/********************************************************/ 
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */ 
/********************************************************/ 
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
        int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
        const cusparseHandle_t handle, const int Nrows, const int Ncols) { 

    const int lda = Nrows;      // --- Leading dimension of dense matrix 

    gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int))); 

    // --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense 
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz)); 

    // --- Device side sparse matrix 
    gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double))); 
    gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int))); 
    gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int))); 

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0])); 

}

kernel.cu

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 

#include <cusparse_v2.h> 

#include "cuSparseUtilities.cuh" 
#include "Utilities.cuh" 

/********/ 
/* MAIN */ 
/********/ 
int main() { 

    cusparseHandle_t handle; 

    // --- Initialize cuSPARSE 
    cusparseSafeCall(cusparseCreate(&handle)); 

    cusparseMatDescr_t descrA = 0; 

    /**************************/ 
    /* SETTING UP THE PROBLEM */ 
    /**************************/ 
    const int Nrows = 5;      // --- Number of rows 
    const int Ncols = 4;      // --- Number of columns 
    const int N = Nrows; 

    // --- Host side dense matrix 
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense)); 

    // --- Column-major storage 
    h_A_dense[ 0] = 0.4612f; h_A_dense[ 5] = -0.0006f; h_A_dense[10] = 1.3f;  h_A_dense[15] = 0.0f; 
    h_A_dense[ 1] = 0.0f;  h_A_dense[ 6] = 1.443f;  h_A_dense[11] = 0.0f;  h_A_dense[16] = 0.0f; 
    h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f; h_A_dense[12] = 0.0723f; h_A_dense[17] = 0.0f; 
    h_A_dense[ 3] = 0.3566f; h_A_dense[ 8] = 0.0723f; h_A_dense[13] = 0.7543f; h_A_dense[18] = 0.0f; 
    h_A_dense[ 4] = 0.f;  h_A_dense[ 9] = 0.0f;  h_A_dense[14] = 0.0f;  h_A_dense[19] = 0.1f; 

    // --- Create device array and copy host array to it 
    double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double))); 
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice)); 

    /*******************************/ 
    /* FROM DENSE TO SPARSE MATRIX */ 
    /*******************************/ 
    // --- Descriptor for sparse matrix A 
    setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE); 

    int nnz = 0;        // --- Number of nonzero elements in dense matrix 
    int *d_nnzPerVector;      // --- Device side number of nonzero elements per row 

    double *d_A;        // --- Sparse matrix values - array of size nnz 
    int *d_A_RowIndices;      // --- "Row indices" 
    int *d_A_ColIndices;      // --- "Column indices" 

    dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols); 

    /*******************************************************/ 
    /* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */ 
    /*******************************************************/ 
    // --- Host side number of nonzero elements per row 
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost)); 

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz); 
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]); 
    printf("\n"); 

    // --- Host side sparse matrix 
    double *h_A = (double *)malloc(nnz * sizeof(double)); 
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int)); 
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost)); 

    printf("\nOriginal matrix in CSR format\n\n"); 
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n"); 

    printf("\n"); 
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n"); 

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]); 

    /*******************************/ 
    /* FROM SPARSE TO DENSE MATRIX */ 
    /*******************************/ 
    double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double))); 
    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices, 
             d_A_denseReconstructed, Nrows)); 

    /*******************************************************/ 
    /* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */ 
    /*******************************************************/ 
    double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double)); 
    gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost)); 

    printf("\nReconstructed dense matrix \n"); 
    for (int m = 0; m < Nrows; m++) { 
     for (int n = 0; n < Ncols; n++) 
      printf("%f\t", h_A_denseReconstructed[n * Nrows + m]); 
     printf("\n"); 
    } 

    return 0; 
}

來源

2018-01-25 18:14:19 JackOLantern

使用cuSPARSE進行密集稀疏和稀疏到密集的轉換

回答

相關問題