2017-04-03 35 views
0

將陣列從主機複製到設備(CUDA)

假設我有一個如下的結構:

// Record holding three ints; used as the element type of the array in this question.
// NOTE(review): as written, the member list repeats `int` inside a single
// declaration and has no terminating `;`, so this does not compile as C/C++.
typedef struct values{ 
int one, int two, int three 
} values; 

現在,假設我在主機上創建值的陣列,並用隨機數據

// Host pointer to the array of `values` (allocated in main below).
// NOTE(review): the `*` is placed after the identifier, which is not a valid
// pointer declarator.
values vals*; 
// Pointer variable living in device memory; kernels read the array through it.
__device__ values* d_vals; 
// Allocates A_LARGE_NUMBER elements on the host and fills them.
// PopulateWithDate is not shown here; presumably it writes into `vals`.
int main(){ 
    vals = (values*)malloc(sizeof(values) * A_LARGE_NUMBER); 
    PopulateWithDate(); //populates vals with random data 
} 
填充

現在,我希望能夠將這些值複製到設備上,這樣我就可以在內核中像這樣訪問它們:

// Kernel that reads the `one` field of the first element through the
// device-global pointer d_vals and prints it. It takes no arguments, so d_vals
// must already hold a valid device address when the kernel runs.
__global__ void myKernel(){ 
    printf("%d", d_vals[0].one);//I don't really want to print, but whenever I try to access I get an error 
} 

無論我怎麼嘗試,都會得到「an illegal memory access was encountered」(遇到非法的內存訪問)錯誤。

這是我當前的嘗試:

// The asker's attempt: allocate and fill the host array, try to make the
// device-global d_vals point at a device copy of it, then launch myKernel.
int main(){ 
    vals = (values*)malloc(sizeof(values) * A_LARGE_NUMBER); 
    PopulateWithDate(); //populates vals with random data 

    values* d_ptr; 
    // Stores the device address of the d_vals symbol itself in d_ptr.
    cudaGetSymbolAddress((void**)&d_ptr, d_vals); 
    // NOTE(review): this overwrites d_ptr with a fresh allocation, discarding the
    // symbol address obtained on the previous line; d_vals itself is never assigned.
    cudaMalloc((void**)&d_ptr, A_LARGE_NUMBER * sizeof(values)); 

    // NOTE(review): d_ptr is a host local variable, not a __device__ symbol, and
    // &vals is the address of the host pointer variable rather than the array it
    // points to, while the size passed is that of the whole array.
    cudaMemcpyToSymbol(d_ptr, &vals, sizeof(values) * A_LARGE_NUMBER); 
    cudaDeviceSynchronize(); 
    dim3 blocksPerGrid(2, 2); 
    dim3 threadsPerBlock(16, 16); 

    // NOTE(review): no CUDA return code is checked anywhere in this function, and
    // nothing waits for the kernel after this launch.
    myKernel<< <blocksPerGrid, threadsPerBlock >> >(); 
} 

回答

1

就你目前所展示的內容而言,使用 __device__ 指針變量只會帶來不必要的複雜性。只需使用 cudaMalloc 對設備存儲進行普通的動態分配,然後按照任何 CUDA 示例代碼(如 vectorAdd)中類似的方法來使用即可。下面是一個例子:

$ cat t1315.cu 
#include <stdio.h> 
#define A_LARGE_NUMBER 10 

// Plain record of three integers; the element type of the host and device arrays.
struct values {
    int one;
    int two;
    int three;
};

// Host-side array of `values`; allocated in main and filled by PopulateWithData.
values *vals; 

// Debug kernel: every launched thread prints the `one` field of the first
// element of the array that d_vals points to, followed by a newline.
// d_vals is dereferenced on the device, so it must be a device-accessible
// pointer to at least one element.
__global__ void myKernel(values *d_vals){
    const int firstOne = d_vals->one;
    printf("%d\n", firstOne);
}

// Fills all A_LARGE_NUMBER elements of the global host array `vals` with the
// fixed pattern {one = 1, two = 2, three = 3}.
// `vals` must already point to storage for A_LARGE_NUMBER elements.
void PopulateWithData(){
    values *const end = vals + A_LARGE_NUMBER;
    for (values *cur = vals; cur != end; ++cur){
        cur->one = 1;
        cur->two = 2;
        cur->three = 3;
    }
}


// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
//   status: value returned by the CUDA runtime call
//   what:   short label for the call, used in the error message
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds the host array, copies it to a cudaMalloc'ed device buffer, and passes
// that buffer to myKernel as an ordinary kernel argument.
// Returns 0 on success; exits with EXIT_FAILURE if any allocation, copy,
// launch or kernel execution fails.
int main(){
    const size_t bytes = A_LARGE_NUMBER * sizeof(values);

    vals = (values*)malloc(bytes);
    if (vals == NULL){
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }
    PopulateWithData(); // fills vals with the fixed pattern {1, 2, 3}

    values* d_ptr = NULL;
    checkCuda(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    // A single thread is enough: the kernel only prints element 0.
    dim3 blocksPerGrid(1, 1);
    dim3 threadsPerBlock(1, 1);

    myKernel<<<blocksPerGrid, threadsPerBlock>>>(d_ptr);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults inside the kernel surface at the following synchronize, which also
    // flushes the device-side printf output.
    checkCuda(cudaGetLastError(), "myKernel launch");
    checkCuda(cudaDeviceSynchronize(), "myKernel execution");

    checkCuda(cudaFree(d_ptr), "cudaFree");
    free(vals);
    vals = NULL;
    return 0;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu 
$ cuda-memcheck ./t1315 
========= CUDA-MEMCHECK 
1 
========= ERROR SUMMARY: 0 errors 
$ 

你所展示的代碼中還有其他一些基本的(非 CUDA 的)編碼錯誤,我不打算在這裡逐一說明。

如果你確實想保留 __device__ 指針變量,並用它指向設備上的數據(結構的陣列),那麼你仍然需要使用 cudaMalloc,而且整個過程還需要額外的步驟。你可以參考這個回答(here)中的示例。

按照該示例,下面是對上述代碼的一組修改,使其使用 __device__ 指針變量,而不是作爲內核參數傳遞的指針:

$ cat t1315.cu 
#include <stdio.h> 
#define A_LARGE_NUMBER 10 

// Plain record of three integers; the element type of the host and device arrays.
struct values {
    int one;
    int two;
    int three;
};

// Host-side array of `values`; allocated in main and filled by PopulateWithData.
values *vals; 
// Pointer variable stored in device memory. main writes the address of the
// cudaMalloc'ed array into it with cudaMemcpyToSymbol; myKernel reads through it.
__device__ values *d_vals; 

// Debug kernel: every launched thread prints the `one` field of the first
// element reached through the device-global pointer d_vals, followed by a
// newline. d_vals must already hold a valid device address at launch time.
__global__ void myKernel(){
    const values *first = d_vals;
    printf("%d\n", first->one);
}

// Fills all A_LARGE_NUMBER elements of the global host array `vals` with the
// fixed pattern {one = 1, two = 2, three = 3}.
// `vals` must already point to storage for A_LARGE_NUMBER elements.
void PopulateWithData(){
    values *const end = vals + A_LARGE_NUMBER;
    for (values *cur = vals; cur != end; ++cur){
        cur->one = 1;
        cur->two = 2;
        cur->three = 3;
    }
}


// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
//   status: value returned by the CUDA runtime call
//   what:   short label for the call, used in the error message
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Builds the host array, copies it to a cudaMalloc'ed device buffer, publishes
// that buffer's address in the __device__ pointer d_vals, and launches myKernel,
// which reads the data through d_vals instead of a kernel argument.
// Returns 0 on success; exits with EXIT_FAILURE if any allocation, copy,
// launch or kernel execution fails.
int main(){
    const size_t bytes = A_LARGE_NUMBER * sizeof(values);

    vals = (values*)malloc(bytes);
    if (vals == NULL){
        fprintf(stderr, "host malloc of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }
    PopulateWithData(); // fills vals with the fixed pattern {1, 2, 3}

    values* d_ptr = NULL;
    checkCuda(cudaMalloc((void**)&d_ptr, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(d_ptr, vals, bytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    // Copy the pointer VALUE (one values*), not the array, into the device
    // symbol: d_vals then points at the buffer allocated above.
    checkCuda(cudaMemcpyToSymbol(d_vals, &d_ptr, sizeof(d_ptr)), "cudaMemcpyToSymbol");

    // A single thread is enough: the kernel only prints element 0.
    dim3 blocksPerGrid(1, 1);
    dim3 threadsPerBlock(1, 1);

    myKernel<<<blocksPerGrid, threadsPerBlock>>>();
    // Launch-configuration errors are only visible through cudaGetLastError();
    // faults inside the kernel surface at the following synchronize, which also
    // flushes the device-side printf output.
    checkCuda(cudaGetLastError(), "myKernel launch");
    checkCuda(cudaDeviceSynchronize(), "myKernel execution");

    // After this free d_vals still holds the old address; no kernel is launched
    // afterwards, so the stale pointer is never dereferenced.
    checkCuda(cudaFree(d_ptr), "cudaFree");
    free(vals);
    vals = NULL;
    return 0;
}
$ nvcc -arch=sm_35 -o t1315 t1315.cu 
$ cuda-memcheck ./t1315 
========= CUDA-MEMCHECK 
1 
========= ERROR SUMMARY: 0 errors 
$ 
+0

嗨。感謝您的有益迴應。對不起,我的問題不太清楚。事情是我必須使用設備指針,因爲myKernel不會從main調用。相反,它會在我從外部代碼收到一個隨機事件後被調用。換句話說,我將無法將d_ptr作爲參數傳遞給內核,我必須在某處保留對其的引用 – William

+0

好的,這些更改相對較小,因此我添加了一個演示這些更改的示例。 –

相關問題