處理向量 - cudaMemcpyDeviceToHost

在 CUDA 中應該如何使用 std::vector 並不明顯，所以我設計了自己的 Vector 類：

#ifndef VECTORHEADERDEF 
#define VECTORHEADERDEF 

#include <cmath> 
#include <iostream> 
#include <cassert> 

// Fixed-size array of T that owns a single heap buffer.
// The buffer is allocated with new[] in the constructors and released in the
// destructor; the size is set at construction and never changes afterwards.
template <typename T>
class Vector
{
private:
    T* mData;  // heap buffer holding the elements
    int mSize; // number of elements stored in mData
public:
     Vector(const Vector& otherVector); // copy constructor (element-wise copy)
     Vector(int size);                  // construct `size` zero-valued entries
     ~Vector();                         // destructor, releases mData

     __host__ __device__ int GetSize() const; // number of elements

     T& operator[](int i); // bounds-asserted element access (host only)

     // Element-wise assignment between vectors of equal size.
     // Declared here so that the out-of-class definition further down in this
     // header compiles.
     Vector& operator=(const Vector& otherVector);

     // Store `value` at index i. No bounds check is performed.
     __host__ __device__ void set(size_t i, T value) {
       mData[i] = value;
     }

     // Stream the elements, each followed by a single space.
     template <class S>
     friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};


// Copy constructor.
// Takes the size from `otherVector`, allocates a fresh buffer of that size
// and duplicates every entry into it.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
    mSize = otherVector.GetSize();
    mData = new T [mSize];

    // Walk both buffers in lock-step, front to back.
    const T* src = otherVector.mData;
    T* dst = mData;
    for (int remaining = mSize; remaining > 0; --remaining)
    {
        *dst++ = *src++;
    }
}

// Constructor for a vector of a given size.
// `size` must be positive (checked with assert). Allocates the buffer and
// initialises every entry to the zero value of T.
template <typename T>
Vector<T>::Vector(int size)
{
    assert(size > 0);
    mSize = size;
    // The trailing () value-initialises each element (zero for arithmetic
    // types). Assigning the double literal 0.0 instead would force a
    // double -> T conversion, which narrows for integer T and does not
    // compile for element types that cannot be assigned from a double.
    mData = new T [mSize]();
}

// Destructor: returns the element buffer obtained with new[] to the heap.
template <typename T>
Vector<T>::~Vector()
{
    delete[] this->mData;
}

// Reports how many elements the vector holds.
// Callable from both host and device code.
template <typename T>
__host__ __device__ int Vector<T>::GetSize() const
{
    const int count = mSize;
    return count;
}

// Subscript operator (zero-based).
// Asserts that the index lies inside [0, mSize) and hands back a mutable
// reference to the selected element.
template <typename T>
T& Vector<T>::operator[](int i)
{
     assert(i > -1);
     assert(i < mSize);
     T* const slot = mData + i;
     return *slot;
}

// Assignment operator.
// Both vectors must already have the same size (asserted); the entries of
// `otherVector` are then copied over this vector's entries one by one.
template <typename T>
Vector<T>& Vector<T>::operator=(const Vector& otherVector)
{
    assert(mSize == otherVector.mSize);
    int idx = 0;
    while (idx < mSize)
    {
        mData[idx] = otherVector.mData[idx];
        ++idx;
    }
    return *this;
}

// Stream insertion operator.
// Writes every element of `v` to `output`, each followed by one space, and
// returns the stream so that insertions can be chained.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
    const int count = v.mSize;
    for (int pos = 0; pos < count; ++pos) {
        output << v[pos];
        output << " ";
    }
    return output;
}

我的 main 函數——這裏我只是把一個向量傳遞到設備，修改它，再把它傳回主機——如下（內核僅爲測試目的而設計）：

#include <iostream> 

#include "Vector.hpp" 


// Test kernel: writes 100 into element 0 and 11 into element 2 through the
// Vector<int> that d_num points to.
// Every launched thread performs the same two writes; there is no
// per-thread indexing of the data.
// NOTE(review): set() dereferences the object's internal data pointer, so
// d_num has to address a real Vector<int> object whose buffer is reachable
// from the device - confirm against the caller's allocation.
__global__ void alpha(Vector<int>* d_num) 
{ 
     // Flat global thread index; computed but not used below.
     int myId = threadIdx.x + blockDim.x * blockIdx.x; 


     d_num->set(0,100); 
     d_num->set(2,11); 
} 


// Round-trip test: fill a 10-element host vector with 0..9, copy it to the
// device, run the alpha kernel on it, copy it back and print it again.
// None of the CUDA calls below has its return value checked.
int main() 
{ 
     Vector<int> num(10); 

     for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9 

     std::cout << "Size of vector: " << num.GetSize() << "\n"; 
     std::cout << num << "\n"; // print vector 

     // Typed as a pointer to Vector<int>, but the allocation below only
     // reserves room for num.GetSize() ints, i.e. raw element storage rather
     // than a Vector object.
     Vector<int>* d_num; 

     // allocate global memory on the device 
     cudaMalloc((void **) &d_num, num.GetSize()*sizeof(int)); 

     // copy data from host memory to the device memory 
     // (&num[0] is the address of the first element of the host buffer)
     cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice); 


     // launch the kernel: one block of 100 threads
     alpha<<<1,100>>>(d_num); 


     // copy the modified array back to the host, overwriting the contents of num
     // NOTE(review): the destination argument here is the Vector object `num`
     // itself, not a pointer to its elements; the host-to-device copy above
     // uses &num[0] for the host side.
     cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost); 

     std::cout << num << "\n"; 


     // free GPU memory allocation and exit 
     cudaFree(d_num); 

     return 0; 
}

我遇到的問題與 cudaMemcpyDeviceToHost 有關。它並沒有真正把設備上的向量複製到 num 向量，從輸出中可以看到這一點。

我應該如何處理？（請明確說明，我對CUDA相當陌生）。

來源

2017-04-19 Sandu Ursu

這會創建一個指向向量 num 第一個元素的有效指針：

cudaMemcpy(d_num, &num[0], num.GetSize()*sizeof(int), cudaMemcpyHostToDevice); 
         ^^^^^^^

這不會：

cudaMemcpy(num, &d_num[0], num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost); 
       ^^^

你的 Vector 對象的名稱並不是指向其第一個數據元素的指針。你應該按照與第一行類似的方式來寫這一行，像這樣：

cudaMemcpy(&num[0], d_num, num.GetSize()*sizeof(int), cudaMemcpyDeviceToHost);

然而，這本身並不是一個修復。請注意，d_num不是Vector，但已經是一個指針，所以我們可以直接在這些操作中使用它。儘管使用&(d_num[0])沒有錯，但沒有必要這樣做。

由於 d_num 並不是 Vector（按照你的分配方式，它只是一個指向一組 int 的裸指針），你在內核中對 Vector 方法的使用同樣是錯誤的。如果你想在內核中使用 Vector 的方法，就需要傳遞一個真正的 Vector 對象，而不僅僅是數據。由於傳遞對象需要在對象內部處理設備數據（主機上可訪問的數據在設備上不可訪問，反之亦然），這意味着要對你的 Vector 類進行相當大幅度的重寫。我做了一個有限的嘗試，展示一條可能的出路。基本方法（即其中一種可行的做法）如下：

該對象將包含指向數據的主機副本和設備數據副本的指針。
在對象實例化中，我們將分配兩者，並最初將我們的「引用」指針設置爲指向主機副本。
在設備上使用之前，我們必須將主機數據複製到設備數據中，並使用to_device()方法實現此目的。此方法還會切換我們的「參考」指針（mData），以指向Vector數據的設備端副本。
除了將主機數據複製到對象的「內部」設備數據之外，我們還必須使對象本身可在設備上使用。爲此，我們通過指向設備端副本的指針複製對象本身（d_num）。
然後，我們可以在設備上以通常的方式使用該對象，對於那些具有__device__裝飾的方法。
內核完成後，我們必須更新數據的主機副本並將我們的「參考」指針切換回主機數據。爲此提供了to_host()方法。
此後，可以在主機代碼中再次使用該對象，反映數據更改（如果發生在內核中）。

下面是一個樣例：

$ cat t101.cu 
#include <iostream> 

#include <cmath> 
#include <iostream> 
#include <cassert> 

// Fixed-size array of T that keeps a host buffer and a device buffer of the
// same length. mData is the buffer that set() and operator[] currently act
// on: the constructors point it at the host buffer, to_device() switches it
// to the device buffer and to_host() switches it back.
template <typename T>
class Vector
{
private:
    // mData: active buffer (aliases hData or dData, owns nothing itself)
    // hData: host buffer allocated with new[]
    // dData: device buffer allocated with cudaMalloc
    T* mData, *hData, *dData;

    int mSize; // number of elements in each buffer
public:
     Vector(const Vector& otherVector); // copy constructor
     Vector(int size);                  // construct `size` zero-valued entries
     ~Vector();                         // destructor, releases both buffers

     __host__ __device__ int GetSize() const; // number of elements
     __host__ __device__ T& operator[](int i); // bounds-asserted element access

     // Store `value` at index i of the active buffer. No bounds check.
     // (No semicolon after the body: an in-class function definition is not
     // followed by one, and the extra token triggers pedantic warnings.)
     __host__ __device__ void set(size_t i, T value) {
       mData[i] = value;
     }

     // Element-wise assignment between vectors of equal size.
     __host__ __device__ Vector<T>& operator=(const Vector<T>& otherVector);
     void to_device(); // copy host -> device, make the device buffer active
     void to_host();   // copy device -> host, make the host buffer active

     // Stream the elements, each followed by a single space (host only).
     template <class S>
     friend std::ostream& operator<<(std::ostream& output, Vector<S>& v);
};


// Copy constructor.
// Allocates a host buffer and a device buffer of the same size as
// `otherVector`, copies the host-side entries of `otherVector` into the new
// host buffer and makes the host buffer the active one.
// The device buffer is allocated but not filled; call to_device() for that.
// If the device allocation fails, a message is written to std::cerr and
// dData is left null.
template <typename T>
Vector<T>::Vector(const Vector& otherVector)
{
    mSize = otherVector.GetSize();
    hData = new T [mSize];

    // Give dData a defined value before the call: the destructor tests it,
    // so it must never be left indeterminate when cudaMalloc fails.
    dData = 0;
    const cudaError_t status = cudaMalloc(&dData, mSize*sizeof(T));
    if (status != cudaSuccess)
    {
        std::cerr << "Vector: cudaMalloc failed: "
                  << cudaGetErrorString(status) << "\n";
        dData = 0;
    }

    mData = hData;
    // Read from the source's host buffer, not from its mData: after
    // to_device() the source's mData is a device pointer, which must not be
    // dereferenced in host code.
    for (int i=0; i<mSize; i++)
    {
        hData[i] = otherVector.hData[i];
    }
}

// Constructor for a vector of a given size.
// `size` must be positive (checked with assert). Allocates a host buffer
// with every entry set to the zero value of T, allocates a device buffer of
// the same length and makes the host buffer the active one.
// If the device allocation fails, a message is written to std::cerr and
// dData is left null.
template <typename T>
Vector<T>::Vector(int size)
{
    assert(size > 0);
    mSize = size;
    // The trailing () value-initialises each element (zero for arithmetic
    // types) without going through the double literal 0.0, which would
    // narrow for integer T.
    hData = new T [mSize]();

    // Give dData a defined value before the call: the destructor tests it,
    // so it must never be left indeterminate when cudaMalloc fails.
    dData = 0;
    const cudaError_t status = cudaMalloc(&dData, mSize*sizeof(T));
    if (status != cudaSuccess)
    {
        std::cerr << "Vector: cudaMalloc failed: "
                  << cudaGetErrorString(status) << "\n";
        dData = 0;
    }

    mData = hData;
}

// Destructor: releases the device buffer when one is set, then the host
// buffer. mData only aliases one of the two, so it is not freed separately.
template <typename T>
Vector<T>::~Vector()
{
    if (dData)
    {
        cudaFree(dData);
    }
    delete[] hData;
}

// Reports how many elements the vector holds.
// Callable from both host and device code.
template <typename T>
__host__ __device__
int Vector<T>::GetSize() const
{
    const int count = mSize;
    return count;
}

// Subscript operator (zero-based), callable from host and device code.
// Asserts that the index lies inside [0, mSize) and hands back a mutable
// reference into the currently active buffer (mData).
template <typename T>
__host__ __device__
T& Vector<T>::operator[](int i)
{
     assert(i > -1);
     assert(i < mSize);
     T* const slot = mData + i;
     return *slot;
}

// Assignment operator, callable from host and device code.
// Both vectors must already have the same size (asserted); the entries of
// the other vector's active buffer are copied over this vector's active
// buffer one by one.
template <typename T>
__host__ __device__
Vector<T>& Vector<T>::operator=(const Vector<T>& otherVector)
{
    assert(mSize == otherVector.mSize);
    int idx = 0;
    while (idx < mSize)
    {
        mData[idx] = otherVector.mData[idx];
        ++idx;
    }
    return *this;
}

// Stream insertion operator (host code only - not callable on the device).
// Writes every element of `v` to `output`, each followed by one space, and
// returns the stream so that insertions can be chained.
template <typename T>
std::ostream& operator<<(std::ostream& output, Vector<T>& v) {
    const int count = v.mSize;
    for (int pos = 0; pos < count; ++pos) {
        output << v[pos];
        output << " ";
    }
    return output;
}

// Copies the host buffer into the device buffer and, on success, makes the
// device buffer the active one (mData = dData).
// If the copy fails, the error is reported on std::cerr and mData is left
// unchanged, so the object never presents a device buffer that was not
// filled.
template <typename T>
void Vector<T>::to_device(){
    const cudaError_t status =
        cudaMemcpy(dData, hData, mSize*sizeof(T), cudaMemcpyHostToDevice);
    if (status != cudaSuccess) {
        std::cerr << "Vector::to_device: cudaMemcpy failed: "
                  << cudaGetErrorString(status) << "\n";
        return;
    }
    mData = dData;
}

// Copies the device buffer back into the host buffer and makes the host
// buffer the active one (mData = hData).
// The copy result is checked and reported on std::cerr; an error returned
// here can also originate from a kernel launched earlier. mData is switched
// back to the host buffer even on failure, so that later host-side accesses
// never dereference a device pointer.
template <typename T>
void Vector<T>::to_host(){
    const cudaError_t status =
        cudaMemcpy(hData, dData, mSize*sizeof(T), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        std::cerr << "Vector::to_host: cudaMemcpy failed: "
                  << cudaGetErrorString(status) << "\n";
    }
    mData = hData;
}

// Test kernel: through the Vector<int> that d_num points to, stores 100 in
// element 0 and 11 in element 2 via set(), then 50 in element 1 via
// operator[]. Every launched thread performs the same three writes.
__global__ void alpha(Vector<int> *d_num)
{
     Vector<int>& target = *d_num;

     target.set(0,100);
     target.set(2,11);
     target[1] = 50;
}


// Round-trip demo: fill a 10-element vector with 0..9, mirror its data and
// the object itself onto the device, let the alpha kernel modify it, bring
// the data back and print the vector before and after.
// Returns 0 on success and 1 if any CUDA call or the kernel reports an error.
int main()
{
     Vector<int> num(10);

     for (int i=0; i < num.GetSize(); ++i) num.set(i,i); // initialize elements to 0:9

     std::cout << "Size of vector: " << num.GetSize() << "\n";
     std::cout << num << "\n"; // print vector

     // Device-side storage for a copy of the Vector object itself.
     Vector<int> *d_num = 0;
     cudaError_t status = cudaMalloc(&d_num, sizeof(Vector<int>));
     if (status != cudaSuccess) {
          std::cerr << "cudaMalloc failed: " << cudaGetErrorString(status) << "\n";
          return 1;
     }

     // Push the element data first: to_device() also switches num's active
     // pointer to the device buffer, so the object copied next carries a
     // pointer the kernel can dereference.
     num.to_device();
     status = cudaMemcpy(d_num, &(num), sizeof(Vector<int>), cudaMemcpyHostToDevice);

     if (status == cudaSuccess) {
          // launch the kernel
          alpha<<<1,1>>>(d_num);
          status = cudaGetLastError();             // launch/configuration errors
          if (status == cudaSuccess) {
               status = cudaDeviceSynchronize();   // errors raised while running
          }
     }

     // copy the modified array back to the host and make num usable in host
     // code again (done on every path so num's active pointer is a host one)
     num.to_host();

     // free the device copy of the object; num's own buffers are released by
     // its destructor
     cudaFree(d_num);

     if (status != cudaSuccess) {
          std::cerr << "CUDA error: " << cudaGetErrorString(status) << "\n";
          return 1;
     }

     std::cout << num << "\n";

     return 0;
}
$ nvcc -arch=sm_61 -o t101 t101.cu 
$ cuda-memcheck ./t101 
========= CUDA-MEMCHECK 
Size of vector: 10 
0 1 2 3 4 5 6 7 8 9 
100 50 11 3 4 5 6 7 8 9 
========= ERROR SUMMARY: 0 errors 
$

注：

根據我的測試，你貼出的代碼存在各種編譯錯誤，所以我不得不對你的 Vector 類做一些其他修改，僅僅是爲了讓它能夠編譯。
按值把對象傳遞給內核會調用複製構造函數，隨後調用析構函數，這會讓事情變得更加複雜，因此爲了避免這一點，我選擇通過指針傳遞對象（這也是你最初的做法）。
你的內核啓動了 100 個線程。由於它們都在做完全相同的寫入，並且沒有任何讀取操作，所以這並沒有什麼特別的錯誤，但我把它改成了只用一個線程。它仍然展示了相同的功能。

來源

2017-04-19 12:51:37

非常感謝，羅伯特。你的回答非常好。內核只是實驗性的。thrust 的向量是否採用了類似的設計？我現在可以聲明（並傳遞給設備）一個變量類型：Vector

與向量處理 - cudaMemcpyDeviceToHost

回答

相關問題