2013-08-29 101 views
0

我正在嘗試將2d數組傳遞給內核,以便每個線程都可以訪問index = threadIdx.x +(blockIdx.x * blockDim.x),但我無法計算出如何做到這一點以及如何將數據複製回來。管理2D CUDA陣列

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int)); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost); 
for (num_blocks) 
    for(block_size) 
    h_array[block][thread] should be 1 

__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
} 

我在做什麼錯,在這裏?

+0

爲什麼要將數組轉換爲(char *)?這將導致一個錯誤的指針算術 – LarryPel

+0

這就是它在這兩個問題中描述的: http://stackoverflow.com/questions/1047369/allocate-2d-array-on-device-memory-in-cuda http: //stackoverflow.com/questions/5029920/how-to-use-2d-arrays-in-cuda – user1743798

+0

@LarryPel:不,它不會。間距以字節爲單位,並且需要指向字節大小的類型的指針才能正確執行指針計算。 – talonmies

回答

1

您的cudaMemset2D正在佔用您之前通過cudaMallocPitch分配的更大的內存空間另外,您的cudaMemcpy2D正在複製該內存的一小部分。

你應該以下面的方式使用該函數:

cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)); <- This size is bigger than the previously declared 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* you forgot this here */, num_blocks, cudaMemcpyDeviceToHost); 
+0

此外,如果您已經完成了[適當的cuda錯誤檢查](http://stackoverflow.com/questions/14038589/what -is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api)你的'cudaMemset2D'(至少)會引發錯誤。 –

+0

此外,大概你的'h_array'沒有投入。因此,您在cudaMemcpy2D中傳遞的'pitch'參數不正確。你應該傳遞'block_size * sizeof(int)'(或者類似的東西)作爲'h_array'的音高。 –

1

下面是經過與@hidrargyro提到的錯誤的基本測試了固定的完整代碼:

$ cat t236.cu 
#include <stdio.h> 

#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      exit(1); \ 
     } \ 
    } while (0) 


__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
} 

int main(){ 

int *d_array, *h_array; 
int block_size = 256; 
int num_blocks = 256; 
int grid_size = num_blocks; 
h_array=(int *)malloc(block_size*num_blocks*sizeof(int)); 
if (h_array==0) {printf("malloc fail\n"); return 1;} 
cudaMalloc((void **)&d_array, block_size*num_blocks*sizeof(int)); 
cudaCheckErrors("cudaMalloc fail"); 

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMallocPitch fail"); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMemset2D fail"); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaDeviceSynchronize(); 
cudaCheckErrors("kernel fail"); 

cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost); 
cudaCheckErrors("cudaMemcpy 2D fail"); 
for (int i = 0; i<num_blocks; i++) 
    for(int j = 0; j<block_size; j++) 
    if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;} 
printf("success\n"); 
return 0; 
} 

$ nvcc -arch=sm_20 -o t236 t236.cu 
$ ./t236 
success 
$ 

如果您打算接受答案,請接受@hidrargyro給出的答案

+0

這就像CUDA_SAFE_CALL(),我總是懷疑while(0) – hidrargyro