2017-04-16 38 views
0

在我的代碼中,我使用了來自thrust庫的複數數組,並且我想使用cublasZgeam()來轉置該數組。(標題:使用來自Thrust的複數的cuBLAS)

使用cuComplex.h中的複數不是一個可取的選擇,因爲我在數組上做了很多算術運算,cuComplex沒有定義運算符,如* + =。

這是我定義要轉置的數組的方式:

thrust::complex<float> u[xmax][xmax]; 

我發現這個https://github.com/jtravs/cuda_complex,但使用它是這樣:

#include "cuComplex.hpp" 

並不能讓我使用上面提到的運算符;用nvcc編譯時出現:

error: no operator "+=" matches these operands 
     operand types are: cuComplex += cuComplex 

有沒有什麼解決方案?github上的代碼比較舊,可能存在這個問題,也許是我的用法有誤。

編輯:這裏是能工作的代碼。與talonmies的代碼的唯一區別是添加了一個簡單的內核,以及一個指向相同數據的thrust::complex類型指針。

#include <iostream> 
#include <thrust/fill.h> 
#include <thrust/complex.h> 
#include <cublas_v2.h> 

using namespace std; 

// Minimal sanity-check kernel: a single thread adds (3.3, 3.3) to the first
// element of u, demonstrating that thrust::complex operators (here +=)
// compile and run in device code.
__global__ void test(thrust::complex<double>* u) { 

    u[0] += thrust::complex<double>(3.3,3.3); 
} 

// Fills a 100x100 thrust::complex<double> array, transposes it on the GPU
// with cublasZgeam, runs a small kernel on the result, and prints a few
// elements before and after to verify the transpose.
int main() 
{ 
    // Compile-time constant so the 2D host array is standard C++ (a
    // non-const bound would be a VLA, a compiler extension).
    const int xmax = 100; 
    // thrust::complex<double> is layout-compatible with cuDoubleComplex
    // (two contiguous doubles: real, then imaginary).
    thrust::complex<double> u[xmax][xmax]; 
    // Byte count for device allocations/copies: size_t, not double, since
    // the CUDA APIs take size_t and a double invites silent conversion.
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax; 

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0)); 
    u[49][51] += thrust::complex<double>(665.0,665.0); 
    u[51][49] *= 2.0; 

    cout << "Before:" << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 
    cout << u[0][0] << endl; 

    // GEAM computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
    // op(A)=transpose gives a pure transpose.
    thrust::complex<double> alpha(1.0, 0.0); 
    thrust::complex<double> beta(0.0, 0.0); 
    cublasHandle_t handle; 
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasCreate failed" << endl; 
        return 1; 
    } 

    cuDoubleComplex* d_u = nullptr; 
    cuDoubleComplex* d_v = nullptr; 
    // Binary compatibility makes reinterpret_cast the supported way to
    // hand thrust::complex data to cuBLAS.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha); 
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta); 
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess || 
        cudaMalloc(&d_v, arrSize) != cudaSuccess) { 
        cerr << "cudaMalloc failed" << endl; 
        return 1; 
    } 
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) { 
        cerr << "host-to-device copy failed" << endl; 
        return 1; 
    } 
    thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>* >(d_v); 
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax, 
            _alpha, d_u, xmax, 
            _beta, d_u, xmax, 
            d_v, xmax) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasZgeam failed" << endl; 
        return 1; 
    } 
    // Issued on the default stream, so it runs after the GEAM above.
    test<<<1,1>>>(d_vTest); 
    if (cudaGetLastError() != cudaSuccess) { 
        cerr << "kernel launch failed" << endl; 
        return 1; 
    } 
    // Blocking copy on the default stream; also synchronizes with the
    // kernel before reading d_v.
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) { 
        cerr << "device-to-host copy failed" << endl; 
        return 1; 
    } 
    cout << "After:" << endl; 
    cout << u[0][0] << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 

    // Release device buffers and the cuBLAS handle (leaked in the original).
    cudaFree(d_u); 
    cudaFree(d_v); 
    cublasDestroy(handle); 

    return 0; 
} 
+0

你不能使用C++標準庫的複雜類型和函數嗎? – talonmies

+0

這是我試過的,它似乎沒有工作https://pastebin.com/hCjPvdBm –

+0

@talonmies我已閱讀此文檔:http://docs.nvidia.com/cuda/cublas/#cublas-lt- t-gt-geam。我必須承認我有機會誤解它,但我也檢查了一些工作示例 –

回答

2

與你的說法相反,C++標準庫的complex(或thrust::complex)當然可以與cuBLAS一起工作。cuComplex和cuDoubleComplex被設計爲與標準主機複數類型二進制兼容,因此在把數據傳遞給在設備上使用複數數據的cuBLAS函數時,不需要進行任何轉換。

對你在評論中發佈的代碼做一個簡單的修改,就能使它完全按照你的設想工作:

#include <algorithm> 
#include <iostream> 
#include <complex> 
#include <cublas_v2.h> 

using namespace std; 

// Answer example 1: same transpose as the question's code, but using
// std::complex<double> on the host. Fills a 100x100 array, transposes it
// on the GPU with cublasZgeam, and prints two elements before and after.
int main() 
{ 
    // Compile-time constant so the 2D host array is standard C++ (a
    // non-const bound would be a VLA, a compiler extension).
    const int xmax = 100; 
    // std::complex<double> is layout-compatible with cuDoubleComplex
    // (two contiguous doubles: real, then imaginary).
    complex<double> u[xmax][xmax]; 
    // Byte count for device allocations/copies: size_t, not double, since
    // the CUDA APIs take size_t.
    const size_t arrSize = sizeof(complex<double>) * xmax * xmax; 

    fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0)); 
    u[49][51] += complex<double>(665.0,665.0); 
    u[51][49] *= 2.0; 

    cout << "Before:" << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 

    // GEAM computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
    // op(A)=transpose gives a pure transpose.
    complex<double> alpha(1.0, 0.0); 
    complex<double> beta(0.0, 0.0); 
    cublasHandle_t handle; 
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasCreate failed" << endl; 
        return 1; 
    } 

    cuDoubleComplex* d_u = nullptr; 
    cuDoubleComplex* d_v = nullptr; 
    // Binary compatibility makes reinterpret_cast the supported way to
    // hand std::complex data to cuBLAS.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha); 
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta); 
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess || 
        cudaMalloc(&d_v, arrSize) != cudaSuccess) { 
        cerr << "cudaMalloc failed" << endl; 
        return 1; 
    } 
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) { 
        cerr << "host-to-device copy failed" << endl; 
        return 1; 
    } 
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax, 
            _alpha, d_u, xmax, 
            _beta, d_u, xmax, 
            d_v, xmax) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasZgeam failed" << endl; 
        return 1; 
    } 

    // Blocking copy on the default stream; synchronizes with the GEAM.
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) { 
        cerr << "device-to-host copy failed" << endl; 
        return 1; 
    } 

    cout << "After:" << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 

    // Release device buffers and the cuBLAS handle (leaked in the original).
    cudaFree(d_u); 
    cudaFree(d_v); 
    cublasDestroy(handle); 

    return 0; 
} 

像這樣構建並運行:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas 
~/SO$ ./complex_transpose 
Before: 
(666,666) 
(2,2) 
After: 
(2,2) 
(666,666) 

唯一需要的修改是將std::complex<double>類型顯式轉換爲cuDoubleComplex。這樣做之後,一切都按預期工作。

使用thrust時,代碼看起來幾乎一樣:

#include <iostream> 
#include <thrust/fill.h> 
#include <thrust/complex.h> 
#include <cublas_v2.h> 

using namespace std; 

// Answer example 2: identical to the std::complex version, but with
// thrust::complex<double> on the host. Fills a 100x100 array, transposes
// it on the GPU with cublasZgeam, and prints two elements before and after.
int main() 
{ 
    // Compile-time constant so the 2D host array is standard C++ (a
    // non-const bound would be a VLA, a compiler extension).
    const int xmax = 100; 
    // thrust::complex<double> is layout-compatible with cuDoubleComplex
    // (two contiguous doubles: real, then imaginary).
    thrust::complex<double> u[xmax][xmax]; 
    // Byte count for device allocations/copies: size_t, not double, since
    // the CUDA APIs take size_t.
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax; 

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0)); 
    u[49][51] += thrust::complex<double>(665.0,665.0); 
    u[51][49] *= 2.0; 

    cout << "Before:" << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 

    // GEAM computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
    // op(A)=transpose gives a pure transpose.
    thrust::complex<double> alpha(1.0, 0.0); 
    thrust::complex<double> beta(0.0, 0.0); 
    cublasHandle_t handle; 
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasCreate failed" << endl; 
        return 1; 
    } 

    cuDoubleComplex* d_u = nullptr; 
    cuDoubleComplex* d_v = nullptr; 
    // Binary compatibility makes reinterpret_cast the supported way to
    // hand thrust::complex data to cuBLAS.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha); 
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta); 
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess || 
        cudaMalloc(&d_v, arrSize) != cudaSuccess) { 
        cerr << "cudaMalloc failed" << endl; 
        return 1; 
    } 
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) { 
        cerr << "host-to-device copy failed" << endl; 
        return 1; 
    } 
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax, 
            _alpha, d_u, xmax, 
            _beta, d_u, xmax, 
            d_v, xmax) != CUBLAS_STATUS_SUCCESS) { 
        cerr << "cublasZgeam failed" << endl; 
        return 1; 
    } 

    // Blocking copy on the default stream; synchronizes with the GEAM.
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) { 
        cerr << "device-to-host copy failed" << endl; 
        return 1; 
    } 

    cout << "After:" << endl; 
    cout << u[49][51] << endl; 
    cout << u[51][49] << endl; 

    // Release device buffers and the cuBLAS handle (leaked in the original).
    cudaFree(d_u); 
    cudaFree(d_v); 
    cublasDestroy(handle); 

    return 0; 
} 

下面的版本也許更接近你的使用場景:使用thrust的設備容器,並在調用cuBLAS之前用一個內核執行一些初始化:

#include <iostream> 
#include <thrust/device_vector.h> 
#include <thrust/complex.h> 
#include <thrust/execution_policy.h> 
#include <thrust/copy.h> 
#include <cublas_v2.h> 

// Single-thread initialization kernel: perturbs two elements of the
// xmax*xmax array before the transpose. The flat offsets 51 + 49*xmax and
// 49 + 51*xmax equal 49*xmax + 51 and 51*xmax + 49 respectively, i.e. the
// same offsets as u[49][51] and u[51][49] in the row-major host arrays of
// the earlier examples.
__global__ void setup_kernel(thrust::complex<double>* u, int xmax) 
{ 
    u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0); 
    u[49 + 51*xmax] *= 2.0; 
} 

// Answer example 3: the same GPU transpose, but with thrust::device_vector
// owning the device memory and a kernel doing the initialization on the
// device before the cuBLAS call.
int main() 
{ 
    // Compile-time constant so the 2D host array below is standard C++.
    const int xmax = 100; 

    // GEAM computes C = alpha*op(A) + beta*op(B); alpha=1, beta=0 with
    // op(A)=transpose gives a pure transpose.
    thrust::complex<double> alpha(1.0, 0.0); 
    thrust::complex<double> beta(0.0, 0.0); 
    cublasHandle_t handle; 
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { 
        std::cerr << "cublasCreate failed" << std::endl; 
        return 1; 
    } 

    // Device containers own the allocations (no manual cudaMalloc/cudaFree).
    thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0)); 
    thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.)); 
    setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax); 
    if (cudaGetLastError() != cudaSuccess) { 
        std::cerr << "setup_kernel launch failed" << std::endl; 
        return 1; 
    } 

    // thrust::complex<double> is binary compatible with cuDoubleComplex, so
    // reinterpret_cast is the supported way to pass the data to cuBLAS.
    cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data())); 
    cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data())); 
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha); 
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta); 

    // Issued on the default stream, so it runs after setup_kernel.
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax, 
            _alpha, _d_u, xmax, 
            _beta, _d_u, xmax, 
            _d_v, xmax) != CUBLAS_STATUS_SUCCESS) { 
        std::cerr << "cublasZgeam failed" << std::endl; 
        return 1; 
    } 

    thrust::complex<double> u[xmax][xmax]; 

    // Copy device results back to the host for printing.
    thrust::copy(d_u.begin(), d_u.end(), &u[0][0]); 
    std::cout << "Before:" << std::endl; 
    std::cout << u[49][51] << std::endl; 
    std::cout << u[51][49] << std::endl; 

    thrust::copy(d_v.begin(), d_v.end(), &u[0][0]); 
    std::cout << "After:" << std::endl; 
    std::cout << u[49][51] << std::endl; 
    std::cout << u[51][49] << std::endl; 

    // Destroy the cuBLAS handle (leaked in the original).
    cublasDestroy(handle); 

    return 0; 

} 
+0

非常感謝。我沒有澄清它,但我需要在轉置後的內核中使用這些運算符,並且爲了清晰起見而省略。我希望理解你的概念能夠幫助我實現這一目標。 –

+1

@MaxK:如果你想在設備上進行操作,使用'thrust :: complex'。它在功能上與'std :: complex'相同,但具有'__device__'綁定。我的答案中發佈的代碼基本上沒有區別,無論您使用哪種類型。 – talonmies

相關問題