
CUBLAS: Incorrect inversion for a matrix with a zero pivot

Since CUDA 5.5, the CUBLAS library includes routines for batched matrix factorization and inversion (cublas<t>getrfBatched and cublas<t>getriBatched, respectively).

Following the guidance in the documentation, I wrote a test code that uses these routines to invert an N x N matrix. The code gives correct output only when the matrix has all non-zero pivots. Setting any pivot to zero produces incorrect results. I verified the results with MATLAB.

I realize that I am supplying row-major matrices as input while CUBLAS expects column-major, but that should not matter, since it would only transpose the result. To be sure, I also tested with column-major input, but I get the same behavior.
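(Put differently: since the inverse of a transpose is the transpose of the inverse, i.e. (Aᵀ)⁻¹ = (A⁻¹)ᵀ, feeding CUBLAS a row-major matrix, which it reads as the transpose, should at worst give me the transposed inverse, not wrong values.)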

I am confused because cublas<t>getriBatched expects the pivot-exchange information array P as input, which is the output of cublas<t>getrfBatched. So if any zero pivots are eliminated by row interchanges, the inversion routine should handle them automatically.
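For what it's worth, this is my understanding of the pivot array (a sketch only, assuming the LAPACK-style 1-based convention that the getrfBatched documentation describes, where row i was interchanged with row P[i]; the helper name apply_pivots and the host copy P_host are just for illustration):

void apply_pivots(const int* P_host, int n, int* rowOrder)
{
    // start from the identity ordering
    for (int i = 0; i < n; i++) rowOrder[i] = i;
    // apply the recorded interchanges in order: row i was swapped with row P[i] (1-based)
    for (int i = 0; i < n; i++)
    {
        int j = P_host[i] - 1;      // convert to 0-based
        int tmp = rowOrder[i];
        rowOrder[i] = rowOrder[j];
        rowOrder[j] = tmp;
    }
}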

How can I invert matrices that contain a zero pivot using CUBLAS?

Below is a self-contained, compilable example with the different test cases (for 3 <= n <= 16):

#include <cstdio> 
#include <cstdlib> 
#include <cuda_runtime.h> 
#include <cublas_v2.h> 

#define cudacall(call)                           \ 
    do                               \ 
    {                               \ 
     cudaError_t err = (call);                        \ 
     if(cudaSuccess != err)                         \ 
     {                              \ 
      fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 
      cudaDeviceReset();                         \ 
      exit(EXIT_FAILURE);                         \ 
     }                              \ 
    }                               \ 
    while (0) 

#define cublascall(call)                      \ 
    do                           \ 
    {                           \ 
     cublasStatus_t status = (call);                   \ 
     if(CUBLAS_STATUS_SUCCESS != status)                  \ 
     {                          \ 
      fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);  \ 
      cudaDeviceReset();                     \ 
      exit(EXIT_FAILURE);                     \ 
     }                          \ 
                               \ 
    }                           \ 
    while(0) 


void invert_device(float* src_d, float* dst_d, int n) 
{ 
    cublasHandle_t handle; 
    cublascall(cublasCreate_v2(&handle)); 

    int batchSize = 1; 

    int *P, *INFO; 

    cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int))); 
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int))); 

    int lda = n; 

    float *A[] = { src_d }; 
    float** A_d; 
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A))); 
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice)); 

    cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize)); 

    int INFOh = 0; 
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost)); 

    if(INFOh == n) 
    { 
     fprintf(stderr, "Factorization Failed: Matrix is singular\n"); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
    } 

    float* C[] = { dst_d }; 
    float** C_d; 
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C))); 
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice)); 

    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize)); 

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost)); 

    if(INFOh != 0) 
    { 
     fprintf(stderr, "Inversion Failed: Matrix is singular\n"); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
    } 

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle); 
} 

void invert(float* src, float* dst, int n) 
{ 
    float* src_d, *dst_d; 

    cudacall(cudaMalloc<float>(&src_d,n * n * sizeof(float))); 
    cudacall(cudaMemcpy(src_d,src,n * n * sizeof(float),cudaMemcpyHostToDevice)); 
    cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float))); 

    invert_device(src_d,dst_d,n); 

    cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost)); 

    cudaFree(src_d), cudaFree(dst_d); 
} 

void test_invert() 
{ 
    const int n = 3; 

    //Random matrix with full pivots 
    float full_pivots[n*n] = { 0.5, 3, 4, 
           1, 3, 10, 
           4 , 9, 16 }; 

    //Almost same as above matrix with first pivot zero 
    float zero_pivot[n*n] = { 0, 3, 4, 
           1, 3, 10, 
           4 , 9, 16 }; 

    float zero_pivot_col_major[n*n] = { 0, 1, 4, 
             3, 3, 9, 
             4 , 10, 16 }; 

    float another_zero_pivot[n*n] = { 0, 3, 4, 
             1, 5, 6, 
             9, 8, 2 }; 

    float another_full_pivot[n * n] = { 22, 3, 4, 
             1, 5, 6, 
             9, 8, 2 }; 

    float singular[n*n] = {1,2,3, 
          4,5,6, 
          7,8,9}; 


    //Select matrix by setting "a" 
    float* a = zero_pivot; 

    fprintf(stdout, "Input:\n\n"); 
    for(int i=0; i<n; i++) 
    { 
     for(int j=0; j<n; j++) 
      fprintf(stdout,"%f\t",a[i*n+j]); 
     fprintf(stdout,"\n"); 
    } 

    fprintf(stdout,"\n\n"); 

    invert(a,a,n); 

    fprintf(stdout, "Inverse:\n\n"); 
    for(int i=0; i<n; i++) 
    { 
     for(int j=0; j<n; j++) 
      fprintf(stdout,"%f\t",a[i*n+j]); 
     fprintf(stdout,"\n"); 
    } 

} 

int main() 
{ 
    test_invert(); 

    int n; scanf("%d",&n); 
    return 0; 
} 

@RobertCrovella ... thanks for your comment. I see the same behavior with CUDA 5.5 as well as the CUDA 6.0 RC. – sgarizvi

Answer


It seems to be a defect in the current CUBLAS library implementation of cublas<t>getrfBatched for matrices of dimension (n) that have, as you say, a "zero pivot".

One possible workaround, when n < 17, is to "identity-extend" your A matrix to be inverted to a size of 17 x 17 (using MATLAB notation):

LU = getrf([A 0 ; 0 I]); 

You can then use cublas<t>getriBatched in the "ordinary" way (you could also leave everything at n = 17, call getri that way, and then extract the result as the first 3 x 3 rows and columns of invA):

invA = getri(LU(1:3,1:3)) 

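Such a 17 x 17 buffer can also be built programmatically rather than by hand; a minimal sketch (the helper name identity_extend is illustrative only, assuming a row-major n x n input with n <= 17):

void identity_extend(const float* A, int n, float* Aext)
{
    // copy A into the top-left n x n block of the 17 x 17 buffer,
    // put 1 on the remaining diagonal entries and 0 everywhere else
    for (int i = 0; i < 17; i++)
        for (int j = 0; j < 17; j++)
        {
            if (i < n && j < n)  Aext[i*17 + j] = A[i*n + j];
            else if (i == j)     Aext[i*17 + j] = 1.0f;
            else                 Aext[i*17 + j] = 0.0f;
        }
}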
Here is a complete example, borrowed from the code you supplied, showing inversion of your 3 x 3 zero_pivot matrix, using the zero_pivot_war matrix as the "identity-extended" workaround:

$ cat t340.cu 
#include <cstdio> 
#include <cstdlib> 
#include <cuda_runtime.h> 
#include <cublas_v2.h> 

#define cudacall(call)                           \ 
    do                               \ 
    {                               \ 
     cudaError_t err = (call);                        \ 
     if(cudaSuccess != err)                         \ 
     {                              \ 
      fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 
      cudaDeviceReset();                         \ 
      exit(EXIT_FAILURE);                         \ 
     }                              \ 
    }                               \ 
    while (0) 

#define cublascall(call)                      \ 
    do                           \ 
    {                           \ 
     cublasStatus_t status = (call);                   \ 
     if(CUBLAS_STATUS_SUCCESS != status)                  \ 
     {                          \ 
      fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);  \ 
      cudaDeviceReset();                     \ 
      exit(EXIT_FAILURE);                     \ 
     }                          \ 
                               \ 
    }                           \ 
    while(0) 


void invert_device(float* src_d, float* dst_d, int n) 
{ 
    cublasHandle_t handle; 
    cublascall(cublasCreate_v2(&handle)); 

    int batchSize = 1; 

    int *P, *INFO; 

    cudacall(cudaMalloc<int>(&P,17 * batchSize * sizeof(int))); 
    cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int))); 

    int lda = 17; 

    float *A[] = { src_d }; 
    float** A_d; 
    cudacall(cudaMalloc<float*>(&A_d,sizeof(A))); 
    cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice)); 

    cublascall(cublasSgetrfBatched(handle,17,A_d,lda,P,INFO,batchSize)); 

    int INFOh = 0; 
    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost)); 

    if(INFOh == 17) 
    { 
     fprintf(stderr, "Factorization Failed: Matrix is singular\n"); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
    } 

    float* C[] = { dst_d }; 
    float** C_d; 
    cudacall(cudaMalloc<float*>(&C_d,sizeof(C))); 
    cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice)); 

    cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,n,INFO,batchSize)); 

    cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost)); 

    if(INFOh != 0) 
    { 
     fprintf(stderr, "Inversion Failed: Matrix is singular\n"); 
     cudaDeviceReset(); 
     exit(EXIT_FAILURE); 
    } 

    cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle); 
} 

void invert(float* src, float* dst, int n) 
{ 
    float* src_d, *dst_d; 

    cudacall(cudaMalloc<float>(&src_d,17 * 17 * sizeof(float))); 
    cudacall(cudaMemcpy(src_d,src,17 * 17 * sizeof(float),cudaMemcpyHostToDevice)); 
    cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float))); 

    invert_device(src_d,dst_d,n); 

    cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost)); 

    cudaFree(src_d), cudaFree(dst_d); 
} 

void test_invert() 
{ 
    const int n = 3; 

    //Random matrix with full pivots 
/* float full_pivots[n*n] = { 0.5, 3, 4, 
           1, 3, 10, 
           4 , 9, 16 }; 

    //Almost same as above matrix with first pivot zero 
    float zero_pivot[n*n] = { 0, 3, 4, 
           1, 3, 10, 
           4 , 9, 16 }; 

    float zero_pivot_col_major[n*n] = { 0, 1, 4, 
             3, 3, 9, 
             4 , 10, 16 }; 
*/ 
    float zero_pivot_war[17*17] = { 
             0,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
             1,3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
             4,9,16,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
             0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 
             0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, 
             0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, 
             0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, 
             0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0, 
             0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 
             0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 
             0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 
             0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0, 
             0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0, 
             0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, 
             0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 
             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, 
             0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 }; 
/* 
    float another_zero_pivot[n*n] = { 0, 3, 4, 
             1, 5, 6, 
             9, 8, 2 }; 

    float another_full_pivot[n * n] = { 22, 3, 4, 
             1, 5, 6, 
             9, 8, 2 }; 

    float singular[n*n] = {1,2,3, 
          4,5,6, 
          7,8,9}; 
*/ 
    float result[n*n]; 

    //Select matrix by setting "a" 
    float* a = zero_pivot_war; 

    fprintf(stdout, "Input:\n\n"); 
    for(int i=0; i<n; i++) 
    { 
     for(int j=0; j<n; j++) 
      fprintf(stdout,"%f\t",a[i*17+j]); 
     fprintf(stdout,"\n"); 
    } 

    fprintf(stdout,"\n\n"); 

    invert(a,result,n); 

    fprintf(stdout, "Inverse:\n\n"); 
    for(int i=0; i<n; i++) 
    { 
     for(int j=0; j<n; j++) 
      fprintf(stdout,"%f\t",result[i*n+j]); 
     fprintf(stdout,"\n"); 
    } 

} 

int main() 
{ 
    test_invert(); 

// int n; scanf("%d",&n); 
    return 0; 
} 
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas 
$ cuda-memcheck ./t340 
========= CUDA-MEMCHECK 
Input: 

0.000000  3.000000  4.000000 
1.000000  3.000000  10.000000 
4.000000  9.000000  16.000000 


Inverse: 

-0.700000  -0.200000  0.300000 
0.400000  -0.266667  0.066667 
-0.050000  0.200000  -0.050000 
========= ERROR SUMMARY: 0 errors 
$ 

Based on a simple test elsewhere, the above result appears correct to me.
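If you want to check such a result mechanically rather than by eye, a small host-side sketch (illustrative only; assumes row-major storage) is to multiply the original matrix by the computed inverse and confirm the product is close to the identity:

#include <cmath>   // fabsf

// returns 1 if A * invA is within tol of the identity matrix, 0 otherwise
int is_identity_product(const float* A, const float* invA, int n, float tol)
{
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
        {
            float s = 0.0f;
            for (int k = 0; k < n; k++) s += A[i*n + k] * invA[k*n + j];
            float expected = (i == j) ? 1.0f : 0.0f;
            if (fabsf(s - expected) > tol) return 0;
        }
    return 1;
}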

I have no further technical details about the nature of the possible bug in CUBLAS. From what I can tell, it is present in both CUDA 5.5 and the CUDA 6.0 RC. Detailed bug discussions for NVIDIA-supplied assets (such as the CUBLAS library) should be taken up on the NVIDIA developer forums or directly through the bug filing portal at developer.nvidia.com (you must be a registered developer to file a bug).


Awesome! Thanks for the workaround. I am getting the correct output now. I hope the bug gets fixed soon! :) – sgarizvi


This appears to be fixed in the CUDA 6 production release (6.0.37 on Linux), which is [now available](http://www.nvidia.com/getcuda). –


Thank you. I tested it on Windows. The bug is fixed and it gives perfect results. :) – sgarizvi