2015-04-24 115 views
0
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>



// Fills the first `size` entries of `ip` with their own position, so that
// ip[k] == k afterwards. Nothing is written when `size` is zero or negative.
void initialint(int *ip,int size)
{
    int value = 0;
    while (value < size) {
        ip[value] = value;
        ++value;
    }
}

// Prints a host-side integer matrix to stdout.
//   C  : matrix data stored row by row (element (col,row) lives at C[col + nx*row])
//   nx : number of columns (elements per printed line)
//   ny : number of rows (printed lines)
// A header line with the dimensions comes first, then one line per row with
// every element in a 3-character field, then a trailing blank line.
void printmatrix(int *C,const int nx,const int ny)
{
    printf("\n Matrix: (%d.%d) \n",nx,ny);

    for (int row = 0; row < ny; ++row) {
        const int rowStart = nx * row;   // offset of the first element of this row
        for (int col = 0; col < nx; ++col)
            printf("%3d", C[rowStart + col]);
        printf("\n");
    }

    printf("\n");
}

// Debug kernel: every thread prints its thread index, block index, 2D matrix
// coordinate, the flattened row-major offset of that coordinate, and the value
// stored there.
//   A  : device-accessible buffer holding at least nx*ny ints
//   nx : matrix width  (number of columns)
//   ny : matrix height (number of rows)
// Expects a 2D launch (x covers columns, y covers rows). Threads whose
// coordinate falls outside the nx-by-ny matrix return without reading A, so
// the grid may overshoot the matrix when the block size does not divide it.
__global__ void printthreadindex(int *A,const int nx,const int ny)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;   // column of this thread
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;   // row of this thread

    // Guard against launches that cover more threads than matrix elements.
    if (ix >= nx || iy >= ny)
        return;

    // Row-major offset: skip iy complete rows of nx elements, then ix columns.
    // Kept as a signed int so it matches the %d conversion used below.
    const int idx = ix + iy * nx;

    printf("thread_id (%d,%d) block_id (%d,%d) coordinate (%d,%d) global index %2d ival %2d \n",threadIdx.x,threadIdx.y,blockIdx.x,blockIdx.y,ix,iy,idx,A[idx]);
}

// Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Builds an 8x6 integer matrix on the host, prints it, copies it to the device
// and launches printthreadindex so that every element is reported by exactly
// one thread. Returns 0 on success and 1 if an allocation or CUDA call fails.
int main()
{
    const int nx = 8, ny = 6;
    const int nxy = nx * ny;
    // The buffers hold ints, so size them with sizeof(int).
    const size_t nBytes = (size_t)nxy * sizeof(int);

    int *h_A = (int *)malloc(nBytes);
    if (h_A == NULL) {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)nBytes);
        return 1;
    }

    initialint(h_A, nxy);
    printmatrix(h_A, nx, ny);

    int *d_MatA = NULL;
    bool ok = cudaOk(cudaMalloc((void **)&d_MatA, nBytes), "cudaMalloc");
    ok = ok && cudaOk(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    if (ok) {
        // 4x2 threads per block; the grid is rounded up so it covers the whole
        // matrix (2x3 blocks for the 8x6 case).
        dim3 block(4, 2);
        dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);
        printthreadindex <<<grid, block>>> (d_MatA, nx, ny);

        // Launch-configuration errors are only visible through cudaGetLastError.
        ok = cudaOk(cudaGetLastError(), "printthreadindex launch");
        // Wait for the kernel so its printf output is flushed and any
        // execution error is reported here rather than silently dropped.
        ok = cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") && ok;
    }

    cudaFree(d_MatA);   // a null pointer is accepted and ignored
    free(h_A);

    // Windows-only: keeps the console window open until a key is pressed.
    system("pause");
    return ok ? 0 : 1;
}

輸出:CUDA 2D陣列映射

Matrix: (8.6) 
    0 1 2 3 4 5 6 7 
    8 9 10 11 12 13 14 15 
16 17 18 19 20 21 22 23 
24 25 26 27 28 29 30 31 
32 33 34 35 36 37 38 39 
40 41 42 43 44 45 46 47 

thread_id (0,0) block_id (1,0) coordinate (4,0) global index 4 ival 4 
thread_id (1,0) block_id (1,0) coordinate (5,0) global index 5 ival 5 
thread_id (2,0) block_id (1,0) coordinate (6,0) global index 6 ival 6 
thread_id (3,0) block_id (1,0) coordinate (7,0) global index 7 ival 7 
thread_id (0,1) block_id (1,0) coordinate (4,1) global index 12 ival 12 
thread_id (1,1) block_id (1,0) coordinate (5,1) global index 13 ival 13 
thread_id (2,1) block_id (1,0) coordinate (6,1) global index 14 ival 14 
thread_id (3,1) block_id (1,0) coordinate (7,1) global index 15 ival 15 
thread_id (0,0) block_id (1,1) coordinate (4,2) global index 20 ival 20 
thread_id (1,0) block_id (1,1) coordinate (5,2) global index 21 ival 21 
thread_id (2,0) block_id (1,1) coordinate (6,2) global index 22 ival 22 
thread_id (3,0) block_id (1,1) coordinate (7,2) global index 23 ival 23 
thread_id (0,1) block_id (1,1) coordinate (4,3) global index 28 ival 28 
thread_id (1,1) block_id (1,1) coordinate (5,3) global index 29 ival 29 
thread_id (2,1) block_id (1,1) coordinate (6,3) global index 30 ival 30 
thread_id (3,1) block_id (1,1) coordinate (7,3) global index 31 ival 31 
thread_id (0,0) block_id (0,2) coordinate (0,4) global index 32 ival 32 
thread_id (1,0) block_id (0,2) coordinate (1,4) global index 33 ival 33 
thread_id (2,0) block_id (0,2) coordinate (2,4) global index 34 ival 34 
thread_id (3,0) block_id (0,2) coordinate (3,4) global index 35 ival 35 
thread_id (0,1) block_id (0,2) coordinate (0,5) global index 40 ival 40 
thread_id (1,1) block_id (0,2) coordinate (1,5) global index 41 ival 41 
thread_id (2,1) block_id (0,2) coordinate (2,5) global index 42 ival 42 
thread_id (3,1) block_id (0,2) coordinate (3,5) global index 43 ival 43 
thread_id (0,0) block_id (1,2) coordinate (4,4) global index 36 ival 36 
thread_id (1,0) block_id (1,2) coordinate (5,4) global index 37 ival 37 
thread_id (2,0) block_id (1,2) coordinate (6,4) global index 38 ival 38 
thread_id (3,0) block_id (1,2) coordinate (7,4) global index 39 ival 39 
thread_id (0,1) block_id (1,2) coordinate (4,5) global index 44 ival 44 
thread_id (1,1) block_id (1,2) coordinate (5,5) global index 45 ival 45 
thread_id (2,1) block_id (1,2) coordinate (6,5) global index 46 ival 46 
thread_id (3,1) block_id (1,2) coordinate (7,5) global index 47 ival 47 
thread_id (0,0) block_id (0,1) coordinate (0,2) global index 16 ival 16 
thread_id (1,0) block_id (0,1) coordinate (1,2) global index 17 ival 17 
thread_id (2,0) block_id (0,1) coordinate (2,2) global index 18 ival 18 
thread_id (3,0) block_id (0,1) coordinate (3,2) global index 19 ival 19 
thread_id (0,1) block_id (0,1) coordinate (0,3) global index 24 ival 24 
thread_id (1,1) block_id (0,1) coordinate (1,3) global index 25 ival 25 
thread_id (2,1) block_id (0,1) coordinate (2,3) global index 26 ival 26 
thread_id (3,1) block_id (0,1) coordinate (3,3) global index 27 ival 27 
thread_id (0,0) block_id (0,0) coordinate (0,0) global index 0 ival 0 
thread_id (1,0) block_id (0,0) coordinate (1,0) global index 1 ival 1 
thread_id (2,0) block_id (0,0) coordinate (2,0) global index 2 ival 2 
thread_id (3,0) block_id (0,0) coordinate (3,0) global index 3 ival 3 
thread_id (0,1) block_id (0,0) coordinate (0,1) global index 8 ival 8 
thread_id (1,1) block_id (0,0) coordinate (1,1) global index 9 ival 9 
thread_id (2,1) block_id (0,0) coordinate (2,1) global index 10 ival 10 
thread_id (3,1) block_id (0,0) coordinate (3,1) global index 11 ival 11 

您好,上面的代碼出自一本CUDA書籍,用來說明2D陣列如何映射到CUDA的網格(grid)和塊(block),並爲每個線程打印其矩陣座標以及在全局內存中的偏移量。

我對這些線程究竟是如何映射的有點困惑,特別是語句「idx = ix + iy * nx」。我嘗試交換 nx 和 ny 的值,並將此語句改爲「idx = iy + ix * ny」,但這似乎不起作用。

而且目前矩陣元素映射到線程塊的方式爲:

塊(0,0)- 0,1,2,3,8,9,10,11;塊(1,0)- 4,5,6,7,12,13,14,15 .....

如果我想要的映射是:

塊(0,0)- 0,1,2,3,4,5,6,7;塊(0,1)- 8,9,10,11,12,13,14,15 ....

應該如何修改參數並啓動內核?

P.S. 我的環境是 Windows 8.1 與 VS 2012,硬體爲 i7 處理器和 GTX 860M 顯示卡。

謝謝。

回答

0
nx=8, ny=6. 

idx=ix+iy*nx:以座標爲(5,2)的線程爲例。 ix=5, iy=2,所以ival=5+2*8=21

若要以不同的參數啓動內核,你應該修改 dim3 block(a,b) 和 dim3 grid(c,d) 這兩個聲明。

例如,要實現你所舉的例子,你應該使用:

dim3 block(8,1); 
dim3 grid(1,6);