Cublas Matrix LU decomposition
I am having some trouble calling dgetrf in CUDA. From what I have found, I can only call the batched version (http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-getrfbatched). When I call it, the returned error value is 7, and I have been unable to find the corresponding enum for that error code. My code is below; any help would be appreciated.
void cuda_matrix_inverse (int m, int n, double* a){
    cublasHandle_t handle;
    cublasStatus_t status;
    double **devPtrA = 0;
    double **devPtrA_dev = NULL;
    int *d_pivot_array;
    int *d_info_array;
    int rowsA = m;
    int colsA = n;
    int matrixSizeA;
    cudaError_t error;

    fprintf(stderr,"starting cuda inverse\n");

    error = cudaMalloc((void **)&d_pivot_array, sizeof(int));
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc((void **)&d_info_array, sizeof(int));
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    fprintf(stderr,"malloced pivot and info\n");

    status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",status);

    matrixSizeA = rowsA * colsA;
    devPtrA = (double **)malloc(1 * sizeof(*devPtrA));
    fprintf(stderr,"malloced devPtrA\n");

    error = cudaMalloc((void **)&devPtrA[0], matrixSizeA * sizeof(devPtrA[0][0]));
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    error = cudaMalloc((void **)&devPtrA_dev, 1 * sizeof(*devPtrA));
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    fprintf(stderr,"malloced device variables\n");

    error = cudaMemcpy(devPtrA_dev, devPtrA, 1 * sizeof(*devPtrA), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
    fprintf(stderr,"copied from devPtrA to d_devPtrA\n");

    status = cublasSetMatrix(rowsA, colsA, sizeof(a[0]), a, rowsA, devPtrA[0], rowsA);
    if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",status);

    status = cublasDgetrfBatched(handle, m, devPtrA_dev, m, d_pivot_array, d_info_array, 1); // cannot get this to work
    if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status);

    fprintf(stderr,"done with cuda inverse\n");
}
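For comparison, this is my reading of what the documentation says the batched call expects for a single n-by-n matrix (batchSize = 1). The names d_A, d_Aarray, d_pivots and d_infos here are only illustrative, not variables from my code above:

    // Minimal sketch, assuming one n x n matrix and batchSize = 1.
    double *d_A;          // device storage for the matrix itself
    double **d_Aarray;    // device array of 1 pointer, as the batched API expects
    int *d_pivots;        // per the docs: n * batchSize ints, on the device
    int *d_infos;         // per the docs: batchSize ints, on the device

    cudaMalloc((void **)&d_A,      n * n * sizeof(double));
    cudaMalloc((void **)&d_Aarray, 1 * sizeof(double *));
    cudaMalloc((void **)&d_pivots, n * 1 * sizeof(int));
    cudaMalloc((void **)&d_infos,  1 * sizeof(int));

    cudaMemcpy(d_Aarray, &d_A, sizeof(double *), cudaMemcpyHostToDevice);
    cublasSetMatrix(n, n, sizeof(double), a, n, d_A, n);

    cublasStatus_t st = cublasDgetrfBatched(handle, n, d_Aarray, n,
                                            d_pivots, d_infos, 1);

In particular, the documentation describes PivotArray and infoArray as device arrays of n*batchSize and batchSize ints respectively.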
What exactly are spitch and dpitch in this case? Also, if I want it to be self-contained, could I simply add a cudaMalloc for src_d and then cudaMemcpy from a to src_d, so that everything passed in is just my source matrix? – David
'spitch' and 'dpitch' are the pitches of the matrices, for the case where they were allocated with 'cudaMallocPitch'. Otherwise, each is simply equal to 'n * sizeof(dataType)'. – sgarizvi
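A minimal sketch of that case, assuming a flat cudaMalloc'd n-by-n matrix of doubles and reusing the names a and src_d from the comments above:

    // Without cudaMallocPitch, both pitches are just the row width in bytes.
    size_t pitch = n * sizeof(double);
    cudaMemcpy2D(src_d, pitch,          /* dst and dpitch */
                 a,     pitch,          /* src and spitch */
                 n * sizeof(double),    /* width in bytes */
                 n,                     /* height in rows */
                 cudaMemcpyHostToDevice);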
Yes, to make it self-contained, you can allocate the device matrix inside this function. – sgarizvi
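A small sketch of what that ownership pattern could look like, reusing the src_d name suggested in the comment above (illustrative only):

    // Self-contained: the caller only passes the host matrix `a`.
    double *src_d = NULL;
    cudaMalloc((void **)&src_d, m * n * sizeof(double));
    cudaMemcpy(src_d, a, m * n * sizeof(double), cudaMemcpyHostToDevice);

    /* ... build the device pointer array and call cublasDgetrfBatched ... */

    cudaMemcpy(a, src_d, m * n * sizeof(double), cudaMemcpyDeviceToHost); // copy the factored matrix back if needed
    cudaFree(src_d);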