我在從設備到主機的內核上計算後返回二維結構。從設備到主機的cudaMemcpy錯誤
HANDLE_ERROR(cudaMemcpy(Pixel,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
像素聲明主機,Pixel_gpu是如下分配的設備:
**Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
pixel **Pixel = (pixel**)malloc((img_ht)*sizeof(pixel*));
for(int i=0;i<(img_ht);i++)
Pixel[i]=(pixel*)malloc((img_wd)*sizeof(pixel));
使用這個我最終得到非法的內存訪問錯誤。
嘗試一個類似的內存對齊結果,也沒有幫助。
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
內核中啓動:
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDeviceProperties(&prop, 0));
int thread_block=sqrt(prop.maxThreadsPerBlock);
dim3 DimGrid(ceil(img_wd/thread_block),ceil(img_ht/thread_block),1);
dim3 DimBlock(sqrt(prop.maxThreadsPerBlock),sqrt(prop.maxThreadsPerBlock),1);
//allocating gpu memory
pixel **Pixel_tmp_gpu, **Pixel_gpu;
HANDLE_ERROR(cudaMalloc(&Pixel_tmp_gpu,img_wd*img_ht*sizeof(pixel)));
HANDLE_ERROR(cudaMalloc(&Pixel_gpu,img_wd*img_ht*sizeof(pixel)));
float **kernel0_gpu, **kernel1_gpu;
HANDLE_ERROR(cudaMalloc(&kernel0_gpu,k*1*sizeof(float)));
HANDLE_ERROR(cudaMalloc(&kernel1_gpu,1*k*sizeof(float)));
cout<<"memory allocated"<<endl;
//copying needed data
HANDLE_ERROR(cudaMemcpy(Pixel_tmp_gpu,Pixel_tmp,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(Pixel_gpu,Pixel,img_wd*img_ht*sizeof(pixel),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel0_gpu,kernel0,k*1*sizeof(float),cudaMemcpyHostToDevice));
HANDLE_ERROR(cudaMemcpy(kernel1_gpu,kernel1,1*k*sizeof(float),cudaMemcpyHostToDevice));
cout<<"memory transfers done"<<endl;
vertical_conv<<<DimGrid,DimBlock>>>(Pixel_gpu, Pixel_tmp_gpu,img_wd, img_ht,kernel0_gpu,k);
time_t vertical_convolution=time(NULL);
cout<<" vertical_convolution time: "<<double(vertical_convolution - reading_file)<<"sec"<<endl;
horizontal_conv<<<DimGrid,DimBlock>>>(Pixel_tmp_gpu, Pixel_gpu, img_wd, img_ht, kernel1_gpu, k);
time_t horizontal_convolution=time(NULL);
cout<<" horizontal convolution time:" <<double(horizontal_convolution-vertical_convolution)<<" sec"<<endl;
pixel *Pixel_res = (pixel*)malloc(img_wd*img_ht*sizeof(pixel));
HANDLE_ERROR(cudaMemcpy(Pixel_res,Pixel_gpu,img_wd*img_ht*sizeof(pixel),cudaMemcpyDeviceToHost));
使用的函數:
struct pixel //to store RGB values
{
unsigned char r;
unsigned char g;
unsigned char b;
};
static void HandleError(cudaError_t err, const char *file, int line) {
if (err != cudaSuccess) {
cout<<cudaGetErrorString(err)<<" in "<< file <<" at line "<< line<<endl;
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
__device__ void padding(pixel** Pixel_val, int x_coord, int y_coord, int img_width, int img_height, pixel Px) //padding the image,depending on pixel coordinates, can be replaced by reflect for better result //currently zero padding
{
if(x_coord<img_width && y_coord<img_height && x_coord>=0 && y_coord>=0)
Px=Pixel_val[y_coord][x_coord];
}
垂直卷積:
__global__ void vertical_conv(pixel** Pixel_in, pixel** Pixel_out,int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_g, tmp_b;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
if(row<img_ht && col<img_wd){
tmp_r=0, tmp_g=0, tmp_b=0;
for(int l=0;l<k;l++)
{
padding(Pixel_in, col, row+l-(k-1)/2, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[l][0];
tmp_b+=pix_val.b * kernel[l][0];
tmp_g+=pix_val.g * kernel[l][0];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
水平卷積:
__global__ void horizontal_conv(pixel** Pixel_in, pixel** Pixel_out, int img_wd, int img_ht, float** kernel, int k)
{
float tmp_r, tmp_b, tmp_g;
pixel pix_val;
pix_val.r=0;pix_val.g=0;pix_val.b=0;
//horizontal convolution
int row=blockIdx.y*blockDim.y + threadIdx.y;
int col = blockIdx.x*blockDim.x + threadIdx.x;
tmp_r=0, tmp_g=0, tmp_b=0;
if(row<img_ht && col<img_wd)
{
for(int l=0; l<k;l++)
{
padding(Pixel_in, col+l-(k-1)/2, row, img_wd, img_ht, pix_val);
tmp_r+=pix_val.r * kernel[0][l];
tmp_g+=pix_val.g * kernel[0][l];
tmp_b+=pix_val.b * kernel[0][l];
}
Pixel_out[row][col].r=tmp_r;
Pixel_out[row][col].g=tmp_g;
Pixel_out[row][col].b=tmp_b;
}
}
有人可以幫助我知道這裏可能是錯的嗎?
你不能傳遞一個扁平的單指針('*')分配給內核,並期望把它用作雙指針('**')數組。在主機和設備之間傳遞雙指針數組需要特殊的編碼,這是你缺少的。這是一個經常被誤解的話題,所以有很多問題在討論。您可以搜索'cuda二維數組'或'cuda'標籤信息頁面,以鏈接到討論如何處理二維數組的標準問題。我相信這仍然不是[mcve]。它應該是別人可以編譯和運行的東西,不需要大量的組裝工作。 –
如果你只是將所有東西都轉換成單指針數組,並且模擬2D訪問(即[[row * width + col]'),那麼你將會大大簡化你的工作。 –
我會研究二維數組處理cuda,感謝您的建議。 –