儘管你的抗議,相反,C++標準庫complex
(或thrust::complex
)肯定不會有CUBLAS工作。 cuComplex
和cuDoubleComplex
被設計爲與標準主機複雜類型二進制兼容,以便數據在傳遞到在設備上使用複雜數據的CUBLAS函數時不會被翻譯。
一個簡單的修改,你在發表的評論代碼工作完全按照自己的想象:
#include <algorithm>
#include <iostream>
#include <complex>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
complex<double> u[xmax][xmax];
double arrSize = sizeof(complex<double>) * xmax * xmax;
fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0,1.0));
u[49][51] += complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
complex<double> alpha(1.0, 0.0);
complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
建成和運行,像這樣:
~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas
~/SO$ ./complex_transpose
Before:
(666,666)
(2,2)
After:
(2,2)
(666,666)
唯一需要修改的顯式轉換std::complex<double>
類型到cuDoubleComplex
。這樣做和一切都按預期工作。
使用推力,代碼看起來幾乎相同:
#include <iostream>
#include <thrust/fill.h>
#include <thrust/complex.h>
#include <cublas_v2.h>
using namespace std;
int main()
{
int xmax = 100;
thrust::complex<double> u[xmax][xmax];
double arrSize = sizeof(thrust::complex<double>) * xmax * xmax;
thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0,1.0));
u[49][51] += thrust::complex<double>(665.0,665.0);
u[51][49] *= 2.0;
cout << "Before:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
cuDoubleComplex* d_u;
cuDoubleComplex* d_v;
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cudaMalloc(&d_u, arrSize);
cudaMalloc(&d_v, arrSize);
cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, d_u, xmax,
_beta, d_u, xmax,
d_v, xmax);
cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost);
cout << "After:" << endl;
cout << u[49][51] << endl;
cout << u[51][49] << endl;
return 0;
}
也許更接近於你的使用情況,使用推力裝置的容器具有內核CUBLAS呼叫之前執行一些初始化:
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/complex.h>
#include <thrust/execution_policy.h>
#include <thrust/copy.h>
#include <cublas_v2.h>
__global__ void setup_kernel(thrust::complex<double>* u, int xmax)
{
u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0);
u[49 + 51*xmax] *= 2.0;
}
int main()
{
int xmax = 100;
thrust::complex<double> alpha(1.0, 0.0);
thrust::complex<double> beta(0.0, 0.0);
cublasHandle_t handle;
cublasCreate(&handle);
thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
_alpha, _d_u, xmax,
_beta, _d_u, xmax,
_d_v, xmax);
thrust::complex<double> u[xmax][xmax];
thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
std::cout << "Before:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
std::cout << "After:" << std::endl;
std::cout << u[49][51] << std::endl;
std::cout << u[51][49] << std::endl;
return 0;
}
你不能使用C++標準庫的複雜類型和函數嗎? – talonmies
這是我試過的,它似乎沒有工作https://pastebin.com/hCjPvdBm –
@talonmies我已閱讀此文檔:http://docs.nvidia.com/cuda/cublas/#cublas-lt- t-gt-geam。我必須承認我有機會誤解它,但我也檢查了一些工作示例 –