cublas cublasDgetrfBatched() batched LU factorization does not work for matrices larger than 32x32

I have written a CUDA function for Matlab that performs the LU factorization of a batch of matrices using cublasDgetrfBatched(). The toolkit documentation of the function is here.

It works fine for matrices up to size 32x32, but for larger matrices it fails with status code CUBLAS_STATUS_INVALID_VALUE. Here is my source code (gpuBatchedLU.cu):

#include "mex.h" 
#include "gpu/mxGPUArray.h" 

/* Includes, cuda */ 
#include <cuda_runtime.h> 
#include <cublas_v2.h> 

#include <string> 
#include <sstream> 

static std::string cublasGetErrorString(cublasStatus_t error) {
    switch (error) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";

    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";

    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";

    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";

    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";

    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";

    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";

    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    }

    return "<unknown>";
}

inline bool cublasAssert(cublasStatus_t code, const char* file, int line) {
    if (code != CUBLAS_STATUS_SUCCESS) {
        std::stringstream ss;
        ss << "cublasAssert: " << cublasGetErrorString(code) << " in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }

    return code == CUBLAS_STATUS_SUCCESS;
}

inline bool cudaAssert(cudaError_t code, const char* file, int line) {
    if (code != cudaSuccess) {
        std::stringstream ss;
        ss << "cudaAssert: " << cudaGetErrorString(code) << " in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }

    return code == cudaSuccess;
}

inline bool mexGPUAssert(int code, const char* file, int line) {
    if (code != MX_GPU_SUCCESS) {
        std::stringstream ss;
        ss << "mexGPUAssert: could not initialize the Mathworks GPU API in "
           << std::string(file) << ", line " << line << ".";
        mexErrMsgTxt(ss.str().c_str());
    }

    return code == MX_GPU_SUCCESS;
}

#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
#define cudaErrchk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
#define mxGPUErrchk(ans) { mexGPUAssert((ans), __FILE__, __LINE__); }

void mexFunction(int nlhs, mxArray* plhs[], /* Output variables */
                 int nrhs, const mxArray* prhs[]) /* Input variables */ {
    if (nrhs != 1) { /* end if not exactly one input argument */
        mexErrMsgTxt("This function requires one input argument.");
        return;
    }

    if (nlhs > 3) { /* take at most three outputs */
        mexErrMsgTxt("This function takes a maximum of three output variables.");
        return;
    }

    mxGPUErrchk(mxInitGPU());

    const mxGPUArray* in1_gpu = mxGPUCreateFromMxArray(prhs[0]);
    size_t ndims = mxGPUGetNumberOfDimensions(in1_gpu);
    const size_t* dim = (const size_t*) mxGPUGetDimensions(in1_gpu);

    if (ndims != 3) { /* end if input argument is not 3-dimensional */
        mexErrMsgTxt("The input argument must be a 3-dimensional array.");
        return;
    }

    cublasHandle_t handle;

    cublasErrchk(cublasCreate(&handle));

    int no_matrices = dim[2];
    int nrow = dim[0];
    int ncol = dim[1];
    int matrix_size = nrow * ncol;
    size_t i;

    std::stringstream ss;
    ss << "dim[2] = " << dim[2] << "\nno_matrices = " << no_matrices << "\nnrow = " << nrow
       << "\nmatrix_size = " << nrow << " x " << ncol << " = " << matrix_size << std::endl;
    mexPrintf(ss.str().c_str());

    /* Writable copy of the input batch on the GPU; the factorization overwrites it in place. */
    mxGPUArray* gpu_array_inout = mxGPUCopyFromMxArray(prhs[0]);
    double* inout_storage = (double*) mxGPUGetData(gpu_array_inout);

    size_t info_dimensions[1] = { no_matrices };
    mxGPUArray* gpu_array_info = mxGPUCreateGPUArray(1, (mwSize*) info_dimensions, mxINT32_CLASS, mxREAL,
        MX_GPU_INITIALIZE_VALUES);
    int* out_info = (int*) mxGPUGetData(gpu_array_info);

    mexPrintf("after defining gpu_array_info\n");

    size_t pivot_dimensions[2] = { nrow, no_matrices };
    mxGPUArray* gpu_array_pivot = mxGPUCreateGPUArray(2, (mwSize*) pivot_dimensions, mxINT32_CLASS, mxREAL,
        MX_GPU_DO_NOT_INITIALIZE);
    int* out_pivot = (int*) mxGPUGetData(gpu_array_pivot);

    mexPrintf("after defining gpu_array_pivot\n");

    /* Build the array of device pointers (one per matrix) that the batched routine expects. */
    double** inout_pointers_CPU = (double**) malloc(no_matrices * sizeof(double*));
    for (i = 0; i < no_matrices; i++) {
        inout_pointers_CPU[i] = (double*) ((char*) inout_storage + i * ((size_t) matrix_size) * sizeof(double));
    }
    double** inout_pointers_GPU;
    cudaErrchk(cudaMalloc((void**) &inout_pointers_GPU, no_matrices * sizeof(double*)));
    cudaErrchk(
        cudaMemcpy(inout_pointers_GPU, inout_pointers_CPU, no_matrices * sizeof(double*), cudaMemcpyHostToDevice));
    free(inout_pointers_CPU);

    ss.str(""); /* reset the buffer; ss.clear() only resets the stream's error flags */
    ss << "check again before calling cublasDgetrfBatched:\nnrow = " << nrow
       << "\nno_matrices = " << no_matrices << std::endl;
    mexPrintf(ss.str().c_str());

    cublasErrchk(cublasDgetrfBatched(handle, nrow, inout_pointers_GPU, nrow, out_pivot, out_info, no_matrices));

    cublasErrchk(cublasDestroy(handle));

    cudaErrchk(cudaFree(inout_pointers_GPU));

    if (mxIsGPUArray(prhs[0])) {
        plhs[0] = mxGPUCreateMxArrayOnGPU(gpu_array_inout);
        if (nlhs > 1) {
            plhs[1] = mxGPUCreateMxArrayOnGPU(gpu_array_pivot);
            if (nlhs > 2) {
                plhs[2] = mxGPUCreateMxArrayOnGPU(gpu_array_info);
            }
        }
    } else {
        plhs[0] = mxGPUCreateMxArrayOnCPU(gpu_array_inout);
        if (nlhs > 1) {
            plhs[1] = mxGPUCreateMxArrayOnCPU(gpu_array_pivot);
            if (nlhs > 2) {
                plhs[2] = mxGPUCreateMxArrayOnCPU(gpu_array_info);
            }
        }
    }

    mxGPUDestroyGPUArray(gpu_array_inout);
    mxGPUDestroyGPUArray(gpu_array_pivot);
    mxGPUDestroyGPUArray(gpu_array_info);
    mxFree((void*) dim);

    return;
}

I compile it as follows:

mex -L/usr/local/cuda/lib64 -lcudart -lcublas gpuBatchedLU.cu 

and call it from MATLAB with:

[a1,b1,c1]=gpuBatchedLU(randn(32,32,5)); %no problem 
[a2,b2,c2]=gpuBatchedLU(randn(33,33,5)); %produces CUBLAS_STATUS_INVALID_VALUE 

I am using Matlab R2013b with the Parallel Computing Toolbox, CUDA 5.5, and an NVS 5200M graphics chip.

Can anyone reproduce this problem? I would appreciate any suggestions on how to solve it.
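For reference, a minimal standalone repro sketch (not part of the MEX file above; file name and setup are illustrative) that calls cublasDgetrfBatched directly and links against the toolkit's cuBLAS, rather than whatever library MATLAB loads, can help tell whether the MEX/MATLAB environment is involved:

/* standalone_getrf_repro.cu -- sketch only, not from the original post */
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cuda_runtime.h>
#include <cublas_v2.h>

int main() {
    const int n = 33;    /* matrix size that fails through MATLAB */
    const int batch = 5; /* number of matrices in the batch */

    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        printf("cublasCreate failed\n");
        return 1;
    }

    /* Fill the batch with arbitrary host data and copy it to the device. */
    std::vector<double> h_A((size_t) n * n * batch);
    for (size_t i = 0; i < h_A.size(); ++i) {
        h_A[i] = rand() / (double) RAND_MAX;
    }

    double* d_A = 0;
    int* d_pivot = 0;
    int* d_info = 0;
    cudaMalloc((void**) &d_A, h_A.size() * sizeof(double));
    cudaMalloc((void**) &d_pivot, (size_t) n * batch * sizeof(int));
    cudaMalloc((void**) &d_info, batch * sizeof(int));
    cudaMemcpy(d_A, &h_A[0], h_A.size() * sizeof(double), cudaMemcpyHostToDevice);

    /* Array of device pointers, one per matrix, as cublasDgetrfBatched expects. */
    std::vector<double*> h_ptrs(batch);
    for (int k = 0; k < batch; ++k) {
        h_ptrs[k] = d_A + (size_t) k * n * n;
    }
    double** d_ptrs = 0;
    cudaMalloc((void**) &d_ptrs, batch * sizeof(double*));
    cudaMemcpy(d_ptrs, &h_ptrs[0], batch * sizeof(double*), cudaMemcpyHostToDevice);

    cublasStatus_t stat = cublasDgetrfBatched(handle, n, d_ptrs, n, d_pivot, d_info, batch);
    printf("cublasDgetrfBatched returned status %d\n", (int) stat);

    cudaFree(d_ptrs);
    cudaFree(d_info);
    cudaFree(d_pivot);
    cudaFree(d_A);
    cublasDestroy(handle);

    return stat == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}

It could be compiled outside MATLAB with something like nvcc standalone_getrf_repro.cu -lcublas.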


The cuBLAS library user guide says that CUBLAS_STATUS_INVALID_VALUE means "an unsupported value or parameter was passed to the function (a negative vector size, for example). To correct: ensure that all the parameters being passed have valid values." Have you checked this? – JackOLantern


Why do you cast inout_storage to (char*)? Does your code return correct results for sizes smaller than 32? For which compute capability are you compiling? – JackOLantern

Answer

The problem seems to be that Matlab R2013b uses libcublas.so in version 5.0; the file is symlinked in /MATLAB/R2013b/bin/glnxa64/. Once I changed the link to point to the libcublas.so of the CUDA 5.5 installation, it works fine.
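A quick way to confirm which cuBLAS build actually gets loaded at runtime is to query it from the MEX file itself. The following sketch is not part of the original code; it assumes the lines are placed right after cublasCreate(&handle) in mexFunction and uses the cublasErrchk macro defined above:

    /* Sketch: report the cuBLAS version MATLAB loaded at runtime,
       to distinguish a 5.0 build from a 5.5 build. */
    int cublas_version = 0;
    cublasErrchk(cublasGetVersion(handle, &cublas_version));
    mexPrintf("runtime cuBLAS version: %d\n", cublas_version);

On Linux, running ldd on the compiled gpuBatchedLU.mexa64 can also hint at which libcublas.so is picked up, though what MATLAB loads at runtime ultimately depends on its own library path.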