2013-07-05 76 views
1

首先,我很抱歉,如果我的語法是可怕的, 我有一個問題處理二維數組在不同維度(160x320)。聲明二維數組進程的維數和線程數

dim3 blocks(DIMX/16,DIMZ/32); 
dim3 threads(16,16); 

這段代碼編譯得很好,但不知何故只處理了160x160,剩下的數組仍然爲零。我做錯了嗎?

#include "cuda.h" 
#include "conio.h" 
#include <fstream> 
#include <sstream> 
#include <iostream> 
#include <assert.h> 
#include "../common/book.h" 
#define DIMX 160 
#define DIMZ 320 
#define PI 3.1415926535897932f 
#define dx 1.0 
#define dz 1.0 
#define dt 0.001 
#define samp 500 
#define nite 1000 


__global__ void txz_kernel(float *txz,float *vz) 
{ 
int x = threadIdx.x + blockIdx.x * blockDim.x; 
int y = threadIdx.y + blockIdx.y * blockDim.y; 
int offset = x + y * blockDim.x * gridDim.x; 
vz[offset]=txz[offset]+vz[offset]; 
} 

int main(void) 
{ 
    float    *txz; 
    float    *vz; 

     HANDLE_ERROR(cudaMalloc((void**)&txz, DIMX * DIMZ * sizeof(float))); 
     HANDLE_ERROR(cudaMalloc((void**)&vz, DIMX * DIMZ * sizeof(float))); 

     float *tempvz = (float*)malloc(sizeof(float)*(DIMX*DIMZ));  
     float *temptxz = (float*)malloc(sizeof(float)*(DIMX*DIMZ)); 

    for (int i=0; i<DIMX; i++) { 
     for (int j=0; j<DIMZ; j++) { 
     int ij=DIMX*j + i; 
     tempvz[ij]=0.0; 
     temptxz[ij]=100.0; 
     } 
    } 

    for (int i=0; i<DIMX; i++) { 
     for (int j=(121); j<DIMZ; j++) { 
     int ij=DIMX*j + i;  
     tempvz[ij]=0.0; 
     temptxz[ij]=150.0; 
     } 
    } 
      HANDLE_ERROR(cudaMemcpy(vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice)); 
      HANDLE_ERROR(cudaMemcpy(txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice)); 
       dim3 blocks(DIMX/16,DIMZ/32); 
       dim3 threads(16,16); 

      txz_kernel<<<blocks,threads>>>(txz,vz) ;    

      float *tempse = (float*)malloc(sizeof(float)*(DIMX*DIMZ)); 
      HANDLE_ERROR(cudaMemcpy(tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost)); 
      std::ofstream outseis("contour.ctxt"); // output, normal file 
      for (int jj=0; jj<DIMZ; jj++) 
      { 
       for (int ii=0; ii<DIMX; ii++) 
       { 
       int ij=DIMX*jj + ii;    
       outseis<<tempse[ij]<<" "; 
       } 
       outseis<<"\r\n"; 
      } 
} 

回答

1

該行塊(DIMX/16,DIMZ/32);應該是塊(DIMX/16,DIMZ/16);如果一切是很好..

還要檢查它應該是

int ij=DIMZ*i + j; 

如果妳在一個行優先順序處理索引。如果你在列主要訂單處理你寫的是正確的。

下面是我們的代碼被編譯和給出正確的結果,即,將所述2個陣列,並找到和的稍作修改的版本給我102400(160×320 + 160×320)

放烏爾自己數字和檢查..

注意:這是爲行主要秩序。

#include "cuda.h" 
#include <fstream> 
#include <sstream> 
#include <iostream> 
#include <assert.h> 
#define DIMX 160 
#define DIMZ 320 
#define PI 3.1415926535897932f 
#define dx 1.0 
#define dz 1.0 
#define dt 0.001 
#define samp 500 
#define nite 1000 

__global__ void txz_kernel(float *txz,float *vz) 
{ 
     int x = threadIdx.x + blockIdx.x * blockDim.x; 
     int y = threadIdx.y + blockIdx.y * blockDim.y; 
     int offset = (x * blockDim.x * gridDim.x) + y ; 


    if (offset < (DIMX * DIMZ)) 
    { 
      vz[offset]=txz[offset]+vz[offset]; 
    } 
    else 
    { 
      printf ("Offset going out of the bounds\n") ; 
    } 

}

int main(void) 
{ 
    float    *txz; 
float    *vz; 
float    sum = 0.0 ; 
float    *tempse ; 

    HANDLE_ERROR(cudaMalloc((void**)&txz, DIMX * DIMZ * sizeof(float))); 
    HANDLE_ERROR(cudaMalloc((void**)&vz, DIMX * DIMZ * sizeof(float))); 

    float *tempvz = (float*)malloc(sizeof(float)*(DIMX*DIMZ)); 
    float *temptxz = (float*)malloc(sizeof(float)*(DIMX*DIMZ)); 

for (int i=0; i<DIMX; i++) { 
    for (int j=0; j<DIMZ; j++) { 
    int ij=DIMZ*i + j; 
    tempvz[ij]=1.0; 
    temptxz[ij]=1.0; 
    } 
} 

    cudaMemcpy(txz, temptxz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice) ; 
    cudaMemcpy(vz, tempvz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyHostToDevice) ; 

      dim3 blocks(DIMX/16,DIMZ/16); 
      dim3 threads(16,16); 

     txz_kernel<<<blocks,threads>>>(txz,vz) ; 
     //cudaDeviceSynchronize() ; 

     tempse = (float*)malloc(sizeof(float)*(DIMX*DIMZ)); 
     HANDLE_ERROR(cudaMemcpy(tempse, vz,sizeof(float)*(DIMX*DIMZ),cudaMemcpyDeviceToHost)); 

     for (int jj=0; jj<DIMX; jj++) 
     { 
      for (int ii=0; ii<DIMZ; ii++) 
      { 
        int ij=DIMZ*jj + ii; 
        sum += tempse[ij] ; 
      } 
     } 

    printf ("The sum is %f\n", sum) ; 
}