發送3D陣列CUDA內核

我把給出的答案代碼How can I add up two 2d (pitched) arrays using nested for loops?，並試圖將它用於3D代替2D和改變其他部分稍過，現在它看起來如下：發送3D陣列CUDA內核

__global__ void doSmth(int*** a) { 
    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    for(int k=0; k<2; k++) 
    a[i][j][k]=i+j+k; 
} 

int main() { 
    int*** h_c = (int***) malloc(2*sizeof(int**)); 
    for(int i=0; i<2; i++) { 
    h_c[i] = (int**) malloc(2*sizeof(int*)); 
    for(int j=0; j<2; j++) 
    GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int))); 
    } 
    int*** d_c; 
    GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**))); 
    GPUerrchk(cudaMemcpy(d_c,h_c,2*sizeof(int**),cudaMemcpyHostToDevice)); 
    doSmth<<<1,1>>>(d_c); 
    GPUerrchk(cudaPeekAtLastError()); 

    int res[2][2][2]; 
    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    GPUerrchk(cudaMemcpy(&res[i][j][0], 
    h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost)); 

    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    for(int k=0; k<2; k++) 
    printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]);  
}

在上面的代碼使用2作爲h_c的每個維度的大小，在實際的實現中，我將針對「int ***」或更多維度的子陣列的每個部分，將這些大小設置爲非常大的數目和不同的大小。內核調用後，我嘗試將結果複製回res數組，從而出現問題。你能幫我解決這個問題嗎？你可以用我寫上面的方式展示解決方案。謝謝！

來源

2012-10-16 starter

首先，我認爲，當他發佈你提到的上一個問題的迴應時，他並不打算代表好的編碼。因此，弄清楚如何將其擴展到3D可能不是您最好的時間。例如，我們爲什麼要編寫只使用一個線程的程序？雖然這樣的內核可能有合法的用途，但這不是其中之一。你的內核有可能並行執行一系列獨立工作，但是你可以將它全部強制到一個線程上，然後序列化它。並行工作的定義是：

a[i][j][k]=i+j+k;

讓我們弄清楚如何在GPU上並行處理它。

我會做的另一個介紹性觀察是，由於我們正在處理的問題的大小事先知道，我們使用C來解決它們，並從語言中獲得儘可能多的好處。在某些情況下可能需要嵌套循環做cudaMalloc，但我不認爲這是其中之一。

下面是實現並行的工作代碼：

#include <stdio.h> 
#include <stdlib.h> 
// set a 3D volume 
// To compile it with nvcc execute: nvcc -O2 -o set3d set3d.cu 
//define the data set size (cubic volume) 
#define DATAXSIZE 100 
#define DATAYSIZE 100 
#define DATAZSIZE 20 
//define the chunk sizes that each threadblock will work on 
#define BLKXSIZE 32 
#define BLKYSIZE 4 
#define BLKZSIZE 4 

// for cuda error checking 
#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      return 1; \ 
     } \ 
    } while (0) 

// device function to set the 3D volume 
__global__ void set(int a[][DATAYSIZE][DATAXSIZE]) 
{ 
    unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; 
    unsigned idy = blockIdx.y*blockDim.y + threadIdx.y; 
    unsigned idz = blockIdx.z*blockDim.z + threadIdx.z; 
    if ((idx < (DATAXSIZE)) && (idy < (DATAYSIZE)) && (idz < (DATAZSIZE))){ 
     a[idz][idy][idx] = idz+idy+idx; 
     } 
} 

int main(int argc, char *argv[]) 
{ 
    typedef int nRarray[DATAYSIZE][DATAXSIZE]; 
    const dim3 blockSize(BLKXSIZE, BLKYSIZE, BLKZSIZE); 
    const dim3 gridSize(((DATAXSIZE+BLKXSIZE-1)/BLKXSIZE), ((DATAYSIZE+BLKYSIZE-1)/BLKYSIZE), ((DATAZSIZE+BLKZSIZE-1)/BLKZSIZE)); 
// overall data set sizes 
    const int nx = DATAXSIZE; 
    const int ny = DATAYSIZE; 
    const int nz = DATAZSIZE; 
// pointers for data set storage via malloc 
    nRarray *c; // storage for result stored on host 
    nRarray *d_c; // storage for result computed on device 
// allocate storage for data set 
    if ((c = (nRarray *)malloc((nx*ny*nz)*sizeof(int))) == 0) {fprintf(stderr,"malloc1 Fail \n"); return 1;} 
// allocate GPU device buffers 
    cudaMalloc((void **) &d_c, (nx*ny*nz)*sizeof(int)); 
    cudaCheckErrors("Failed to allocate device buffer"); 
// compute result 
    set<<<gridSize,blockSize>>>(d_c); 
    cudaCheckErrors("Kernel launch failure"); 
// copy output data back to host 

    cudaMemcpy(c, d_c, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost); 
    cudaCheckErrors("CUDA memcpy failure"); 
// and check for accuracy 
    for (unsigned i=0; i<nz; i++) 
     for (unsigned j=0; j<ny; j++) 
     for (unsigned k=0; k<nx; k++) 
      if (c[i][j][k] != (i+j+k)) { 
      printf("Mismatch at x= %d, y= %d, z= %d Host= %d, Device = %d\n", i, j, k, (i+j+k), c[i][j][k]); 
      return 1; 
      } 
    printf("Results check!\n"); 
    free(c); 
    cudaFree(d_c); 
    cudaCheckErrors("cudaFree fail"); 
    return 0; 
}

既然你問它的評論，這裏是改變我可以讓你的代碼才能正常工作的最小數量。我們還要回顧一下您以前提到的問題中的一些talonmies意見：

「對於代碼複雜性和性能方面的原因，您真的不想這麼做，在CUDA代碼中使用指針數組既困難又慢比使用線性存儲器的替代方案要多。「與使用線性內存相比，這是一個糟糕的主意」。

我不得不在紙上畫出來，以確保我的指針複製正確。

#include <cstdio> 
inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true) 
{ 
    if (code != 0) { 
     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line); 
     if (Abort) exit(code); 
    } 
} 

#define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); } 



__global__ void doSmth(int*** a) { 
    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    for(int k=0; k<2; k++) 
    a[i][j][k]=i+j+k; 
} 
int main() { 
    int*** h_c = (int***) malloc(2*sizeof(int**)); 
    for(int i=0; i<2; i++) { 
    h_c[i] = (int**) malloc(2*sizeof(int*)); 
    for(int j=0; j<2; j++) 
    GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int))); 
    } 
    int ***h_c1 = (int ***) malloc(2*sizeof(int **)); 
    for (int i=0; i<2; i++){ 
    GPUerrchk(cudaMalloc((void***)&(h_c1[i]), 2*sizeof(int*))); 
    GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], 2*sizeof(int*), cudaMemcpyHostToDevice)); 
    } 
    int*** d_c; 
    GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**))); 
    GPUerrchk(cudaMemcpy(d_c,h_c1,2*sizeof(int**),cudaMemcpyHostToDevice)); 
    doSmth<<<1,1>>>(d_c); 
    GPUerrchk(cudaPeekAtLastError()); 
    int res[2][2][2]; 
    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost)); 

    for(int i=0; i<2; i++) 
    for(int j=0; j<2; j++) 
    for(int k=0; k<2; k++) 
    printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]); 
}

概括地說，我們要做的連續序列：

的malloc指針的多維陣列（在主機上），一個尺寸小於該問題的大小，與最後維作爲一組指向區域cudaMalloc'ed到設備而不是主機。
創建另一個多維指針數組，其類型與上一步創建的類相同，但一個維度小於上一步創建的維度。該陣列還必須具有設備上的最終等級cudaMalloc。
將第二個上一步中的最後一組主機指針複製到上一步中設備上的區域cudaMalloced。
重複步驟2-3，直到我們得到指向多維指針數組的單個（主機）指針，所有指針都駐留在設備上。

來源

2012-10-16 23:31:41

謝謝，PLZ你能告訴我如何解決它，我目前正在做的方式。非常感謝！ – starter

你能提供一個完整的，可編輯的例子，你正在做什麼？對於那些試圖幫助你的人來說，這是一個方便的問題。 –

對不起，如果我沒有正確地問。上面的例子是完整的。如果我在那裏改變任何東西，那將是「h_c」的大小，它可能變成「int ****」，其中每個子陣列將具有非常大和不同的尺寸。謝謝！對不起，如果我問錯了。提前感謝您的幫助。 – starter

發送3D陣列CUDA內核

回答

相關問題