2017-08-16 50 views
-1

我想用cublasDgemm()來計算矩陣的乘積的乘積。輸入矩陣和輸出我從我的代碼期待有以下幾種(A和C分別):cublasDGemm奇怪的結果

| 1 4 7 |  | 66 78 | 
A = | 2 5 8 | C = | 78 93 | 

不過我得到奇怪的結果,這是一個有點困難我理解維度CUBLAS/CUDA用途(專欄)。任何提示將不勝感激!

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 
#include "cublas_v2.h" 
#define M 3 
#define N 2 
#define IDX2C(i,j,ld) (((j)*(ld))+(i)) 

int main (void){ 
    cudaError_t cudaStat;  
    cublasStatus_t stat; 
    cublasHandle_t handle; 
    int i, j; 
    double *devPtrA, *devPtrC; 
    double *a = 0, *c = 0; 

    const double alpha = 1; 
    const double beta = 0; 

    // initialize host arrays 
    a = (double *)malloc (M * N * sizeof (*a)); 
    c = (double *)malloc (N * N * sizeof (*c)); 
    if (!a || !c) { 
     printf ("host memory allocation failed"); 
     return EXIT_FAILURE; 
    } 

    // fill input array 
    for (j = 0; j < N; j++) { 
     for (i = 0; i < M; i++) { 
      a[IDX2C(i,j,M)] = (double)(i * M + j + 1); 
      printf ("%7.0f", a[IDX2C(i,j,M)]); 
     } 
     printf ("\n"); 
    } 

    // set device to 0 (for double processing) 
    cudaStat = cudaSetDevice(0); 
    if (cudaStat != cudaSuccess) { 
     printf("could not set device 0"); 
     return EXIT_FAILURE; 
    } 

    // allocate device arrays 
    cudaStat = cudaMalloc ((void**)&devPtrA, M*N*sizeof(*a)); 
    if (cudaStat != cudaSuccess) { 
     printf ("device memory allocation of A failed"); 
     return EXIT_FAILURE; 
    } 
    cudaStat = cudaMalloc ((void**)&devPtrC, N*N*sizeof(*c)); 
    if (cudaStat != cudaSuccess) { 
     printf ("device memory allocation of C failed"); 
     return EXIT_FAILURE; 
    } 

    // create the cublas handle 
    stat = cublasCreate(&handle); 
    if (stat != CUBLAS_STATUS_SUCCESS) { 
     printf ("CUBLAS initialization failed\n"); 
     return EXIT_FAILURE; 
    } 

    // set the matrix a 
    stat = cublasSetMatrix (M, N, sizeof(*a), a, M, devPtrA, M); 
    if (stat != CUBLAS_STATUS_SUCCESS) { 
     printf ("data download failed"); 
     cudaFree (devPtrA); 
     cudaFree (devPtrC); 
     cublasDestroy(handle); 
     return EXIT_FAILURE; 
    } 

    stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, M, M, N, &alpha, devPtrA, M, devPtrA, M, &beta, devPtrC, M); 
    if (stat!= CUBLAS_STATUS_SUCCESS) { 
     switch (stat) { 
      case CUBLAS_STATUS_NOT_INITIALIZED: 
       printf("CUBLAS_STATUS_NOT_INITIALIZED\n"); 
       break; 
      case CUBLAS_STATUS_INVALID_VALUE: 
       printf("CUBLAS_STATUS_INVALID_VALUE\n"); 
       break; 
      case CUBLAS_STATUS_ARCH_MISMATCH: 
       printf("CUBLAS_STATUS_ARCH_MISMATCH\n"); 
       break; 
      case CUBLAS_STATUS_EXECUTION_FAILED: 
       printf("CUBLAS_STATUS_EXECUTION_FAILED\n"); 
       break; 
      default: 
       printf("??\n"); 
     } 

     printf("Error: %d\n", (int)stat); 
     cudaFree (devPtrA); 
     cudaFree (devPtrC); 
     cublasDestroy(handle); 
     return EXIT_FAILURE; 
    } 

    // get matrix c 
    stat = cublasGetMatrix (N, N, sizeof(*c), devPtrC, N, c, N); 
    if (stat != CUBLAS_STATUS_SUCCESS) { 
     printf ("data upload failed"); 
     cudaFree (devPtrC); 
     cublasDestroy(handle); 
     return EXIT_FAILURE; 
    } 

    // cleanup cuda/cublas 
    cudaFree (devPtrA); 
    cudaFree (devPtrC); 
    cublasDestroy(handle); 

    // print result 
    for (j = 0; j < N; j++) { 
     for (i = 0; i < N; i++) { 
      printf ("%7.0f", c[IDX2C(i,j,M)]); 
     } 
     printf ("\n"); 
    } 

    // clear host data 
    free(a); 
    free(c); 
    return EXIT_SUCCESS; 
} 

回答

2

第一個問題是你正在填充矩陣A的行 - 主要格式。要解決這個問題,只需交換i和j索引。在列優先格式,領先的尺寸應的行數,你的情況,N.

for (j = 0; j < N; j++) { 
    for (i = 0; i < M; i++) { 
     a[IDX2C(j,i,N)] = (double)(i * M + j + 1); 
     printf ("%7.0f", a[IDX2C(j,i,N)]); 
    } 
    printf ("\n"); 
} 

您還交換尺寸在cublasDgemm調用,它應該看起來像:

stat = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, N, N, M, &alpha, devPtrA, N, devPtrA, N, &beta, devPtrC, N); 

和結束時,你使用。M爲C矩陣,它應該是n的主要尺寸:

printf ("%7.0f", c[IDX2C(i,j,N)]); 
+0

非常感謝夥計/姑娘! Upvoted和接受! – TheDillo