2016-01-27 93 views
0

我有大小10x20的輸入矩陣的置換,我想如下來排列其列:矩陣列與CUBLAS

p=[1 4 2 3 5 11 7 13 6 12 8 14 17 9 15 18 10 16 19 20] ;%rearrange the columns of A 
    A=A(:,p); 

爲此,我構造對應於該置換矢量p我的置換矩陣並且可以通過執行以下乘法來獲得排列A:

A=A*I 

我在Matlab中測試了排列,一切正常。現在,我想用cublas在cuda中測試它。

輸入矩陣A輸入列主要。主專欄中的排列矩陣I也是如此。以下代碼僅用於測試排列:

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 


#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cublas_v2.h> 


#define cudacall(call)                           \ 
    do                               \ 
    {                               \ 
     cudaError_t err = (call);                        \ 
     if(cudaSuccess != err)                         \ 
     {                              \ 
      fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 
      cudaDeviceReset();                         \ 
      exit(EXIT_FAILURE);                         \ 
     }                              \ 
    }                               \ 
    while (0) 

#define cublascall(call)                      \ 
    do                           \ 
    {                           \ 
     cublasStatus_t status = (call);                   \ 
     if(CUBLAS_STATUS_SUCCESS != status)                  \ 
     {                          \ 
      fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status);  \ 
      cudaDeviceReset();                     \ 
      exit(EXIT_FAILURE);                     \ 
     }                          \ 
                               \ 
    }                           \ 
    while(0) 
    __global__ void sgemm_kernel(float *A_d, float *I_d) 
    { 

     int m=10,n=20,k=20; 
     int lda=k, ldb=k; 
     cublasHandle_t hdl; 
     cublasStatus_t status = cublasCreate_v2(&hdl); 
     const float alpha=1.0F, beta=0.0f; 
     status=cublasSgemm(hdl,CUBLAS_OP_N,CUBLAS_OP_N,k,n,k,&alpha,A_d,lda,I_d,ldb,&beta,A_d,lda); 

    } 

    int main(int argc, char* argv[]) 
    {float A[10*20]={-0.0614, -0.0199, 0.0024, -0.0414, 0.1736, -0.0595, -0.2794, 0.1946, -0.0647, -0.0025, 
    -0.0036, 0.0628, -0.0827, 0.3679, -0.1913, 0.0500, -0.0245, 0.3855, -0.1298, -0.0334, 
    -0.0241, -0.0564, 0.0098, -0.2862, -0.0474, 0.0333, -0.3049, 0.2851, -0.1242, 0.0162, 
    0.0241, 0.0270, -0.0670, 0.3129, -0.2428, 0.0947, -0.1878, 0.0889, -0.0208, 0.0075, 
    -0.1559, 0.1437, -0.1916, 0.2297, -0.0833, -0.1805, 0.2522, -0.1738, 0.1027, -0.1273, 
    0.0716, 0.1882, -0.0963, 0.1081, 0.0958, -0.0713, 0.1931, 0.0874, -0.4186, 0.0345, 
    -0.1912, 0.0501, -0.1396, -0.0989, -0.0338, 0.1773, 0.1088, 0.0389, -0.0117, 0.0014, 
    0.1648, -0.1705, -0.0575, -0.0133, -0.0570, 0.2124, -0.0193, 0.1535, 0.0857, -0.1308, 
    0.1971, 0.0882, -0.2577, 0.1662, -0.2498, -0.0365, -0.1805, 0.0921, 0.0912, 0.0178, 
    -0.0379, 0.0080, 0.0572, -0.0067, 0.0591, -0.0136, 0.0471, -0.0163, 0.0082, -0.0338, 
    -0.2436, 0.1116, 0.0732, -0.0319, 0.0550, 0.2821, 0.0240, 0.0109, -0.0034, 0.1212, 
    -0.0061, 0.2497, -0.0542, -0.0939, 0.0651, 0.0063, -0.1367, 0.0580, 0.7389, -0.1143, 
    -0.3786, 0.1288, 0.0001, 0.2604, -0.1094, -0.3624, -0.0184, 0.0538, 0.0329, 0.0040, 
    0.0603, 0.1422, 0.1037, -0.1846, 0.4046, -0.3738, -0.3487, 0.3846, -0.0849, 0.0135, 
    -0.1850, 0.3571, -0.0543, -0.0025, -0.2880, 0.0600, 0.2605, -0.0474, 0.0010, -0.0333, 
    -0.1974, 0.4788, -0.2441, 0.3847, -0.1235, -0.3503, -0.1785, -0.1095, 0.3158, 0.0062, 
    -0.0509, -0.0502, 0.2154, 0.2237, -0.0671, 0.0377, 0.0519, 0.1530, -0.1675, 0.1856, 
    -0.0380, -0.0026, 0.4700, 0.0097, -0.2394, 0.0717, -0.2101, 0.2841, -0.1799, -0.0924, 
    -0.2678, 0.4485, 0.0044, 0.0030, -0.0439, 0.4337, 0.1819, -0.0180, -0.5443, 0.0864, 
    0.0390, -0.0235, -0.0706, 0.0138, 0.0633, -0.0147, 0.0444, -0.0334, 0.0557, 0.0507} 

    float I[20*20]={1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; 

    float *A_d, *I_d; 

    cudacall(cudaMalloc(&A_d,10*20*sizeof(float))); 
    cudacall(cudaMalloc(&I_d, 20*20*sizeof(float ))); 
    cudacall(cudaMemcpy(A_d, A, 10*20*sizeof(float), cudaMemcpyHostToDevice)); 
    cudacall(cudaMemcpy(I_d, I, 20*20*sizeof(float), cudaMemcpyHostToDevice)); 

    sgemm_kernel<<<1,1>>>(A_d, I_d); 
    cudacall(cudaDeviceSynchronize()); 

    cudacall(cudaMemcpy(A, A_d, 10*20*sizeof(float), cudaMemcpyDeviceToHost)); 
    cudacall(cudaFree(A_d)); 
    cudacall(cudaFree(I_d)); 

    return 0; 
    } 

我無法得到正確的結果。

+1

「我無法得到正確的結果。」沒有太多的問題描述,你沒有提供[mcve](http://stackoverflow.com/help/mcve)。投票結束。 – talonmies

+0

感謝您的評論,我編輯了我的問題並添加了一個簡單的示例。 – Sinem

回答

0

CUBLAS不支持就地操作(實際上沒有我知道支持它的並行BLAS)。您不能通過A_d並將其用於乘法中並作爲操作中的矩陣。你必須必須使用不同的內存分配來保存結果。

所以

C <- 1*(A * B) + 0*C 

是合法的,而

A <- 1*(A * B) + 0*A 

不是。

-1

cublasSgemm是一個宿主函數,所以它應該從一個沒有__global__限定符的函數中調用。

+1

我使用動態並行性從設備內調用cublas函數。我的cuda卡的計算能力爲5.0 – Sinem