I tried to find a solution to my problem using Google, but failed. There are plenty of snippets out there that don't quite fit my case, even though I think it is a pretty standard situation: copying host memory into a CUDA __device__ variable.
I have to transfer several different data arrays to CUDA, all of them simple struct arrays of dynamic size. Since I don't want to pass everything into every CUDA kernel call, I thought a __device__
variable should be exactly what I need.
This is how I tried to copy my host data into the __device__
variable:
// MaterialDescription.h
struct MaterialDescription {
    unsigned char type;
    unsigned char diffuseR, diffuseG, diffuseB;
    __device__ __forceinline__ float4 diffuseColour() const { return make_float4((float) diffuseR/255.f, (float) diffuseG/255.f, (float) diffuseB/255.f, 0); }
};
// kernel.h
__device__ MaterialDescription* g_materials;
__global__ void deferredRenderKernel() {
    something = g_materials[indexDependingOnData].diffuseColour();
}
//Cuda.cu
const std::vector<MaterialDescription>& materials = getData();
// version 1
cudaMemcpyToSymbol(g_materials, &materials.front(), sizeof(MaterialDescription) * materialCount);
// version 2
MaterialDescription* ptr;
cudaMalloc((void**)&ptr, sizeof(MaterialDescription) * materialCount);
cudaMemcpy(ptr, &materials.front(), sizeof(MaterialDescription) * materialCount, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(g_materials, ptr, sizeof(MaterialDescription) * materialCount);
// version 3
cudaMalloc((void**)&g_materials, sizeof(MaterialDescription) * materialCount);
cudaMemcpyToSymbol(g_materials, &materials.front(), sizeof(MaterialDescription) * materialCount);
deferredRenderKernel<<<numBlocks, threadsPerBlock>>>();
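None of these three versions worked for me, and the failure is silent unless the return codes are inspected. A minimal checking sketch (the CUDA_CHECK macro name is my own) that would make such failures visible:
#include <cstdio>
#include <cstdlib>
// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = (call); \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error %s at %s:%d\n", \
                    cudaGetErrorString(err), __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Version 1 should now fail loudly rather than silently, because the
// g_materials symbol is only pointer-sized, not materialCount structs wide.
CUDA_CHECK(cudaMemcpyToSymbol(g_materials, &materials.front(),
                              sizeof(MaterialDescription) * materialCount));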
However, the only version that worked involved passing the pointer as a kernel parameter:
// kernel.h
__device__ MaterialDescription* g_materials;
__global__
void deferredRenderKernel(MaterialDescription* ptr) {
    g_materials = ptr;
    something = g_materials[indexDependingOnData].diffuseColour();
}
//Cuda.cu
// version 4, the only one working, but I pass the pointer again via kernel param.
// In the worst case I'll stick with this; at least I wouldn't have to pass the
// parameters into device functions.
MaterialDescription* ptr;
cudaMalloc((void**)&ptr, sizeof(MaterialDescription) * materialCount);
cudaMemcpy(ptr, &materials.front(), sizeof(MaterialDescription) * materialCount, cudaMemcpyHostToDevice);
deferredRenderKernel<<<numBlocks, threadsPerBlock>>>(ptr);
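If the only drawback of version 4 is the growing parameter list, one common workaround is to bundle all device pointers into a single struct passed by value (just a sketch; KernelParams is a made-up name):
struct KernelParams {
    MaterialDescription* materials;
    // further device pointers for the other data arrays go here
};
__global__ void deferredRenderKernel(KernelParams params) {
    something = params.materials[indexDependingOnData].diffuseColour();
}
// host side
KernelParams params;
params.materials = ptr; // the pointer obtained from cudaMalloc above
deferredRenderKernel<<<numBlocks, threadsPerBlock>>>(params);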
EDIT: This version (as suggested by Robert Crovella) also works, but the memory is not allocated dynamically.
// kernel.h
__device__ MaterialDescription g_materials[VIENNA_MAX_MATERIAL_COUNT];
__global__
void deferredRenderKernel() {
    something = g_materials[indexDependingOnData].diffuseColour();
}
// cuda.h
// version 1
cudaMemcpyToSymbol(g_materials, &materials.front(), sizeof(MaterialDescription) * materialCount);
The other variables and structures are the same as above.
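Since the array now has a fixed capacity, it seems prudent to guard the copy so that materialCount can never exceed it (a sketch; the assertion style is my own):
#include <cassert>
// The fixed-size symbol holds at most VIENNA_MAX_MATERIAL_COUNT entries,
// so reject anything larger before copying.
assert(materialCount <= VIENNA_MAX_MATERIAL_COUNT);
cudaMemcpyToSymbol(g_materials, &materials.front(),
                   sizeof(MaterialDescription) * materialCount);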
EDIT:
SOLUTION
It finally works exactly the way I wanted.
MaterialDescription.h
struct MaterialDescription {
    unsigned char type;
    unsigned char diffuseR, diffuseG, diffuseB;
    __device__ __forceinline__ float4 diffuseColour() const { return make_float4((float) diffuseR/255.f, (float) diffuseG/255.f, (float) diffuseB/255.f, 0); }
};
kernel.h
__device__ MaterialDescription* g_materials;
__global__ void deferredRenderKernel() {
    something = g_materials[indexDependingOnData].diffuseColour();
}
Cuda.cu
const std::vector<MaterialDescription>& materials = getData();
MaterialDescription* dynamicArea;
// allocate memory on the device for our data
cudaMalloc((void**)&dynamicArea, sizeof(MaterialDescription) * materialCount);
// copy our data into the allocated memory
cudaMemcpy(dynamicArea, &materials.front(), sizeof(MaterialDescription) * materialCount, cudaMemcpyHostToDevice);
// copy the pointer to our data into the global __device__ variable.
cudaMemcpyToSymbol(g_materials, &dynamicArea, sizeof(MaterialDescription*));
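To double-check that the symbol really holds the device pointer, and to release the buffer when it is no longer needed, something along these lines works (a sketch using cudaMemcpyFromSymbol):
#include <cassert>
// Read the pointer back out of the __device__ symbol and compare it.
MaterialDescription* check = nullptr;
cudaMemcpyFromSymbol(&check, g_materials, sizeof(MaterialDescription*));
assert(check == dynamicArea);
// ... launch kernels ...
// The symbol itself is static; only the buffer it points to is freed.
cudaFree(dynamicArea);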
*If* your struct only contains POD types, then your version #2 is almost correct. Just change the size in the last memcpy to the correct size (it is only the pointer you are copying) and it should work. – talonmies 2014-10-12 13:45:10
Not only the size, it also needs a reference :) I'll update with the second working version. – Adam 2014-10-12 13:48:19
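For completeness, this is the corrected version 2 that the comments describe: copy only the pointer itself into the symbol, and pass its address:
MaterialDescription* ptr;
cudaMalloc((void**)&ptr, sizeof(MaterialDescription) * materialCount);
cudaMemcpy(ptr, &materials.front(),
           sizeof(MaterialDescription) * materialCount, cudaMemcpyHostToDevice);
// Copy the pointer value (not the data it points to) into the __device__ symbol.
cudaMemcpyToSymbol(g_materials, &ptr, sizeof(MaterialDescription*));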