2012-11-20 67 views
1

我正在嘗試創建一個數組結構,並將數組加載到GPU中。我想我按照步驟正確地做到了這一點。將數組結構數組加載到cuda

  1. 使用malloc在CPU上創建一個結構體。
  2. cudamalloc數組到結構。
  3. 使用cudamalloc在GPU上創建一個結構
  4. 將CPU結構複製到GPU結構中。

當我運行此代碼時,只要我不更改內核函數中的值p[i].c[0],它就會正常工作。如果我刪除行p[i].c[0] = 3.3;那麼它輸出預期的結果。當我離開它時,它會輸出所有值的隨機數。我希望能夠使用內核函數更新數組中的值。

什麼可能是錯的?

這裏是我的代碼:

#include <stdio.h> 
#include <cuda_runtime.h> 
#include <iostream> 
#include <fstream> 
#include <sstream> 
#include <cstdio> 
#include <fcntl.h> 
#include <unistd.h> 
#include <assert.h> 
#include <omp.h> 
#include <vector> 
#include <sys/time.h> 

    float cData[] 
       { 
         1, 
         2, 
         3, 
         4, 
         5, 
         6, 
         7, 
         8, 
         9, 
         10, 
         11, 
         12, 
         13, 
         14, 
         15, 
         16 
       }; 
    float dData[] 
       { 
         1, 
         2, 
         3, 
         4, 
         5, 
         6, 
         7, 
         8, 
         9, 
         10, 
         11, 
         12, 
         13, 
         14, 
         15, 
         16 
       }; 

    typedef struct 
      { 
       float a, b; 
       float* c; 
       float* d; 
      } point; 

__global__ void testKernel(point *p){ 
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    p[i].a = 1.1; 
    p[i].b = 2.2; 
    p[i].c[0] = 3.3; 
} 

void checkerror(cudaError_t error, char* descrp){ 
    if (error != 0){ 

     printf("%s error code: %d \n", descrp, error); 
    } 

} 

extern "C" int main() 
{ 
    printf("starting gpuCode\n"); 
    int *dev_a; 
      // set number of points 
     int numPoints = 16, 
      gpuBlockSize = 4, 
      pointSize = sizeof(point), 
      numBytes  = numPoints * pointSize, 
      gpuGridSize = numPoints/gpuBlockSize; 
    cudaError_t err = cudaSuccess; 
    printf("initialized variables\n"); 
      // allocate memory 
     point *cpuPointArray, 
       *gpuPointArray, 
       *outPointArray; 
     cpuPointArray = (point*)malloc(numBytes); //create the cpuPointArray struct on the cpu 
     outPointArray = (point*)malloc(numBytes); //create the outPointArray struct on the cpu 
     printf("load cpuPointArray struct with default values\n"); 

     for (int k=0; k<16; k++){ 
      err = cudaMalloc((void**)&cpuPointArray[k].c, 16*sizeof(float)); 
      checkerror(err, "assigning cuda pointer c"); 
      err = cudaMalloc((void**)&cpuPointArray[k].d, 16*sizeof(float)); 
      checkerror(err, "assigning cuda pointer d"); 
      cpuPointArray[k].a = 16; 
      cpuPointArray[k].b = 16; 
     } 


     for (int k=0; k<16; k++){ 
      printf("top loop %d\n", k); 
      err = cudaMemcpy(cpuPointArray[k].c, cData, 16*sizeof(float), cudaMemcpyHostToDevice); 
      printf("after cdata\n"); 
      checkerror(err, "copying cdata to gpu array c"); 
      err = cudaMemcpy(cpuPointArray[k].d, dData, 16*sizeof(float), cudaMemcpyHostToDevice); 
      printf("after ddata\n"); 
      checkerror(err, "copying ddata to gpu array d"); 
      printf("bottom of loop %d\n", k); 
     } 

     err = cudaMalloc((void**)&gpuPointArray, numBytes); //allocate memory on the gpu for the cpu point array 
     checkerror(err, "allocating memory for gpuPointArray"); 
     err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); //copy the cpu point array onto the gpu 
     checkerror(err, "copying cpuPointArray to gpuPointArray"); 

     printf("loaded the struct into the kernel\n"); 

     for(int i = 0; i < numPoints; ++i) 
       { 
        printf("point.a: %f, point.b: %f ************************\n",cpuPointArray[i].a,cpuPointArray[i].b); 

         printf("cuda mem location point.c: %d point.d: %d\n",&cpuPointArray[i].c, &cpuPointArray[i].d); 

       } 

      // launch kernel 
     testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray); 

     printf("returned the struct from the kernel\n"); 
     err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost); 
     checkerror(err, "copying gpuPointArray to cpuPointArray"); 
     printf("after gpu copy to cpu\n"); 
     for (int k=0; k<16; k++){ 
      printf("creating memory on cpu for array c\n"); 
      outPointArray[k].c = (float*)malloc(16*sizeof(float)); 
      printf("creating memory on cpu for array d\n"); 
      outPointArray[k].d = (float*)malloc(16*sizeof(float)); 
      printf("copying memory values onto cpu array c\n"); 
      err = cudaMemcpy(outPointArray[k].c, cpuPointArray[k].c, 16*sizeof(float), cudaMemcpyDeviceToHost); 
      checkerror(err, "copy array c from gpu to cpu"); 
      printf("copying memory values onto cpu array c\n"); 
      err = cudaMemcpy(outPointArray[k].d, cpuPointArray[k].d, 16*sizeof(float), cudaMemcpyDeviceToHost); 
      checkerror(err, "copy array d from gpu to cpu"); 
      printf("bottom of loop %d\n", k); 
     } 

      // retrieve the results 

     printf("testKernel results:\n"); 
     for(int i = 0; i < numPoints; ++i) 
     { 
      printf("point.a: %f, point.b: %f ************************\n",outPointArray[i].a,outPointArray[i].b); 
      for (int j=0; j<16; j++){ 
       printf("point.c: %f point.d: %f\n",outPointArray[i].c[j], outPointArray[i].d[j]); 
      } 
     } 

      // deallocate memory 
     free(cpuPointArray); 
     cudaFree(gpuPointArray); 

     return 0; 
    } 

回答

1

好像你可能會複製你的結構的數組到設備不正確。嘗試改變:

err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); 

err = cudaMemcpy(gpuPointArray,cpuPointArray,numBytes, cudaMemcpyHostToDevice); 

因爲cpuPointArray有型點*的sizeof(cpuPointArray)實際上將你的機器上返回一個指針的大小。你想要的是整個結構體系的大小。事實上,它甚至看起來像你從設備上覆制回來時所做的那樣:

err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost); 

希望有幫助!

+0

謝謝!這工作! – napl