這不是明擺着如何使用std ::載體,CUDA,所以我設計我自己的Vector類:與向量處理 - cudaMemcpyDeviceToHost
#ifndef VECTORHEADERDEF
#define VECTORHEADERDEF
#include <cmath>
#include <iostream>
#include <cassert>
template <typename T>
class Vector
{
private:
T* mData; // data stored in vector
int mSize; // size of vector
public:
Vector(const Vector& otherVector); // Constructor
Vector(int size); // Constructor
~Vector(); // Desructor
__host__ __device__ int GetSize() const; // get size of the vector
T& operator[](int i); // see element
// change element i
__host__ __device__ void set(size_t i, T value) {
mData[i] = value;
}
template <class S> // output vector
friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};
// Overridden copy constructor
// Allocates memory for new vector, and copies entries of other vector into it
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
mSize = otherVector.GetSize();
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
}
// Constructor for vector of a given size
// Allocates memory, and initialises entries to zero
template <typename T>
Vector<T>::Vector(int size)
{
assert(size > 0);
mSize = size;
mData = new T [mSize];
for (int i=0; i<mSize; i++)
{
mData[i] = 0.0;
}
}
// Overridden destructor to correctly free memory
template <typename T>
Vector<T>::~Vector()
{
delete[] mData;
}
// Method to get the size of a vector
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
return mSize;
}
// Overloading square brackets
// Note that this uses `zero-based' indexing, and a check on the validity of the index
template <typename T>
T& Vector<T>::operator[](int i)
{
assert(i > -1);
assert(i < mSize);
return mData[i];
}
// Overloading the assignment operator
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
assert(mSize == otherVector.mSize);
for (int i=0; i<mSize; i++)
{
mData[i] = otherVector.mData[i];
}
return *this;
}
// Overloading the insertion << operator
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
for (int i=0; i<v.mSize; i++) {
output << v[i] << " ";
}
return output;
}
我的主要功能 - 在這裏我只是傳遞一個向量到設備,修改它,並將它傳遞迴 - 如下(只用於測試目的而設計的內核):
#include <iostream>
#include "Vector.hpp"
__global__ void alpha(Vector<int>* d_num)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;
d_num->set(0,100);
d_num->set(2,11);
}
int main()
{
Vector<int> num(10);
for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9
std::cout << "Size of vector: " << num.GetSize() << "\n";
std::cout << num << "\n"; // print vector
Vector<int>* d_num;
// allocate global memory on the device
cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int));
// copy data from host memory to the device memory
cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice);
// launch the kernel
alpha<<<1,100>>>(d_num);
// copy the modified array back to the host, overwriting the contents of h_arr
cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);
std::cout << num << "\n";
// free GPU memory allocation and exit
cudaFree(d_num);
return 0;
}
我遇到的問題是與cudaMemcpyDeviceToHost。它不會真正將設備向量複製到num向量,如從輸出中可以看到的那樣。
我應該如何處理? (請明確說明,我對CUDA相當陌生)。
非常感謝,羅伯特。你的回答非常好。內核只是實驗性的。 推力矢量是否有類似的設計? 我現在可以聲明(並傳遞給設備)一個變量類型:Vector
推力矢量沒有類似的設計。 Thrust有一個主機矢量和一個獨立的設備矢量類。他們不會將主機和設備存儲結合到一個類中。然而,設計是(IMO)相當乾淨,顯然比你在這裏有更多的充實。要了解更多關於推力矢量的信息,請嘗試推力[快速入門指南](https://github.com/thrust/thrust/wiki/Quick-Start-Guide)。另外推力是開源。Niether這裏的方法和推力(設備)矢量將很容易地讓你處理矢量矢量,但更簡單的矢量對象是可能的。 –