2014-03-02 69 views
-2

我已經看到這個問題vector addition in CUDA using streams但這不是我的代碼的問題。儘管我遇到了同樣的錯誤,但根本原因卻不同。當我編譯時,我得到以下錯誤。向量添加流

解決方法不正確。該解決方案與第0行的預期結果不匹配。期望(1 + 0.5 = 1.5)但得到0.

我試圖在內核中打印值並發現計算是正確的。但是,當我從設備複製到主機時,我看到所有正在打印的零。

#include<wb.h> 

#define wbCheck(stmt) do {             \ 
     cudaError_t err = stmt;            \ 
     if (err != cudaSuccess) {            \ 
      wbLog(ERROR, "Failed to run stmt ", #stmt);      \ 
      wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 
      return -1;              \ 
     }                  \ 
    } while(0) 

#define NUM_STREAMS 2 

__global__ void vecAdd(float * in1, float * in2, float * out, int len) { 
    //@@ Insert code to implement vector addition here 
    int i = blockIdx.x*blockDim.x + threadIdx.x; 

    if(i< len) 
    { 

     out[i]= in1[i]+in2[i]; 
     printf("Thread %d %f %f out %f\n",i,in1[i],in2[i],out[i]); 
    } 
} 

int main(int argc, char ** argv) { 
    wbArg_t args; 
    int inputLength; 
    float * hostInput1; 
    float * hostInput2; 
    float * hostOutput; 
    float * deviceInput1; 
    float * deviceInput2; 
    float * deviceOutput; 

    args = wbArg_read(argc, argv); 

    wbTime_start(Generic, "Importing data and creating memory on host"); 
    hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
    hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
    hostOutput = (float *) malloc(inputLength * sizeof(float)); 
    wbTime_stop(Generic, "Importing data and creating memory on host"); 

    float *h_A, *h_B, *h_C; 
    float *d_A0, *d_B0, *d_C0; //Device memory for stream0 
    float *d_A1, *d_B1, *d_C1; //Device memory for stream1 

    cudaHostAlloc((void**)&h_A, inputLength*sizeof(float), cudaHostAllocDefault); 
    cudaHostAlloc((void**)&h_B, inputLength*sizeof(float), cudaHostAllocDefault); 
    cudaHostAlloc((void**)&h_C, inputLength*sizeof(float), cudaHostAllocDefault); 

    memcpy(h_A, hostInput1,inputLength*sizeof(float)); 
    memcpy(h_B, hostInput2,inputLength*sizeof(float)); 
    printf("%f %f\n", h_A[0],hostInput1[0]); 
    printf("%f %f \n",h_A[1],hostInput1[1]); 

    printf("Input length is %d\n", inputLength); 


    int nstreams = NUM_STREAMS; 
    cudaStream_t *streams = (cudaStream_t*) malloc(nstreams * sizeof(cudaStream_t)); 
    for(int i = 0; i < nstreams; i++) 
     cudaStreamCreate(&(streams[i])); 


    long segSize = 1024; 

    wbCheck(cudaMalloc((void **)&d_A0, segSize*sizeof(float))); 
    wbCheck(cudaMalloc((void **)&d_A1, segSize*sizeof(float))); 
    wbCheck(cudaMalloc((void **)&d_B0, segSize*sizeof(float))); 
    wbCheck(cudaMalloc((void **)&d_B1, segSize*sizeof(float))); 
    wbCheck(cudaMalloc((void **)&d_C0, segSize*sizeof(float))); 
    wbCheck(cudaMalloc((void **)&d_C1, segSize*sizeof(float))); 


    for(int i=0; i< inputLength; i+=segSize*2) 
    { 

     if(i+segSize <= inputLength) 
     { 
      cudaMemcpyAsync(d_A0,h_A+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]); 
      cudaMemcpyAsync(d_B0,h_B+i,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[0]); 

      if(i+2*segSize <= inputLength) 
      { 
       cudaMemcpyAsync(d_A1,h_A+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]); 
       cudaMemcpyAsync(d_B1,h_B+i+segSize,segSize*sizeof(float),cudaMemcpyHostToDevice,streams[1]); 
      } 
      else 
      { 
       cudaMemcpyAsync(d_A1,h_A+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]); 
       cudaMemcpyAsync(d_B1,h_B+i+segSize,(inputLength-i-segSize)*sizeof(float),cudaMemcpyHostToDevice,streams[1]); 

      } 
     } 
     else 
     { 
      cudaMemcpyAsync(d_A0,h_A+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]); 
      cudaMemcpyAsync(d_B0,h_B+i,(inputLength-i)*sizeof(float),cudaMemcpyHostToDevice,streams[0]); 
     } 


     if(i+segSize <= inputLength) 
     { 

      vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, segSize); 
      if(i+2*segSize <= inputLength) 
      { 
       vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, segSize); 
      } 
      else 
      { 
       vecAdd<<<segSize/256, 256, 1, streams[1]>>>(d_A1,d_B1,d_C1, inputLength-i-segSize); 
      } 

     } 
     else 
     { 
      vecAdd<<<segSize/256, 256, 1, streams[0]>>>(d_A0,d_B0,d_C0, inputLength-i); 
     } 


     if(i+segSize <= inputLength) 
     { 
      cudaMemcpyAsync(h_C+i,d_C0,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[0]); 

      if(i+2*segSize <= inputLength) 
      { 
            cudaMemcpyAsync(h_C+i+segSize,d_C1,segSize*sizeof(float),cudaMemcpyDeviceToHost,streams[1]); 
       printf("hello %f\n", h_C[0]); 
      } 
      else 
      { 
       cudaMemcpyAsync(h_C+i+segSize,d_C1,(inputLength-i-segSize)*sizeof(float),cudaMemcpyDeviceToHost,streams[1]); 
      } 
     } 
     else 
     { 
      cudaMemcpyAsync(h_C+i,d_C0,(inputLength-i)*sizeof(float),cudaMemcpyDeviceToHost,streams[0]); 
     } 
    } 

    memcpy(hostOutput, h_C, inputLength*sizeof(float)); 

    wbSolution(args, hostOutput, inputLength); //hostOutput and h_C contains all zeroes 

    free(hostInput1); 
    free(hostInput2); 
    free(hostOutput); 

    cudaFree(d_A0); 
    cudaFree(d_A1); 
    cudaFree(d_B0); 
    cudaFree(d_B1); 
    cudaFree(d_C0); 
    cudaFree(d_C1); 

    return 0; 
} 
+4

嘗試在調用memcpy(hostOutput,h_C,inputLength * sizeof(float));'之前添加'cudaDeviceSynchronize();'。由於'cudaMemcpyAsync'調用不會阻止主機,因此您無法保證所有數據都從設備複製到'h_C'。 – hubs

+0

「但是,當我從設備複製到主機時,我看到所有正在打印的零」。你的代碼總是打印出'h_C'的相同元素。爲什麼要打印出不同的東西? – talonmies

+0

我不知道使用cudaDeviceSynchronize();並按照建議它的工作。 –

回答

1

由於@hubs在他的評論中建議,我應該使用cudaDeviceSynchronize();在memcpy之前,建議起作用了。