管理2D CUDA陣列

我正在嘗試將2d數組傳遞給內核，以便每個線程都可以訪問index = threadIdx.x +（blockIdx.x * blockDim.x），但我無法計算出如何做到這一點以及如何將數據複製回來。管理2D CUDA陣列

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int)); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost); 
for (num_blocks) 
    for(block_size) 
    h_array[block][thread] should be 1 

__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
}

我在做什麼錯，在這裏？

來源

2013-08-29 user1743798

爲什麼要將數組轉換爲（char *）？這將導致一個錯誤的指針算術 – LarryPel

這就是它在這兩個問題中描述的： http://stackoverflow.com/questions/1047369/allocate-2d-array-on-device-memory-in-cuda http： //stackoverflow.com/questions/5029920/how-to-use-2d-arrays-in-cuda – user1743798

@LarryPel：不，它不會。間距以字節爲單位，並且需要指向字節大小的類型的指針才能正確執行指針計算。 – talonmies

您的cudaMemset2D正在佔用您之前通過cudaMallocPitch分配的更大的內存空間另外，您的cudaMemcpy2D正在複製該內存的一小部分。

你應該以下面的方式使用該函數：

cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)); <- This size is bigger than the previously declared 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* you forgot this here */, num_blocks, cudaMemcpyDeviceToHost);

來源

2013-08-29 17:10:49 hidrargyro

此外，如果您已經完成了[適當的cuda錯誤檢查]（http://stackoverflow.com/questions/14038589/what -is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api）你的'cudaMemset2D'（至少）會引發錯誤。 –

此外，大概你的'h_array'沒有投入。因此，您在cudaMemcpy2D中傳遞的'pitch'參數不正確。你應該傳遞'block_size * sizeof（int）'（或者類似的東西）作爲'h_array'的音高。 –

下面是經過與@hidrargyro提到的錯誤的基本測試了固定的完整代碼：

$ cat t236.cu 
#include <stdio.h> 

#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      exit(1); \ 
     } \ 
    } while (0) 


__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
} 

int main(){ 

int *d_array, *h_array; 
int block_size = 256; 
int num_blocks = 256; 
int grid_size = num_blocks; 
h_array=(int *)malloc(block_size*num_blocks*sizeof(int)); 
if (h_array==0) {printf("malloc fail\n"); return 1;} 
cudaMalloc((void **)&d_array, block_size*num_blocks*sizeof(int)); 
cudaCheckErrors("cudaMalloc fail"); 

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMallocPitch fail"); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMemset2D fail"); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaDeviceSynchronize(); 
cudaCheckErrors("kernel fail"); 

cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost); 
cudaCheckErrors("cudaMemcpy 2D fail"); 
for (int i = 0; i<num_blocks; i++) 
    for(int j = 0; j<block_size; j++) 
    if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;} 
printf("success\n"); 
return 0; 
} 

$ nvcc -arch=sm_20 -o t236 t236.cu 
$ ./t236 
success 
$

如果您打算接受答案，請接受@hidrargyro給出的答案

來源

2013-08-29 18:13:52

這就像CUDA_SAFE_CALL（），我總是懷疑while（0） – hidrargyro

管理2D CUDA陣列

回答

相關問題