使用流的CUDA中的矢量添加

-2

我是CUDA並行編程的初級用戶。我嘗試了使用CUDA流添加向量的程序。當我編譯時，我得到以下錯誤。使用流的CUDA中的矢量添加

解決方案不正確。該解決方案不匹配爲0行預期的結果期待（1 + 0.5 = 1.5），但得到0

我與實例的書，類似的問題CUDA在網上查了。找不到解決方案。任何人都可以幫我解決這個錯誤嗎？提前致謝。

#include <wb.h> 
#define wbCheck(stmt) do {      \ 
    cudaError_t err = stmt;      \ 
    if (err != cudaSuccess) {     \ 
     wbLog(ERROR, "Failed to run stmt ", #stmt);   \ 
     wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 
     return -1;       \ 
    }        \ 
} while(0) 


__global__ void vecAdd(float * in1, float * in2, float * out, int len) 
{ 
int i = threadIdx.x + blockDim.x * blockIdx.x; 
    if (i < len) 
out[i] = in1[i] + in2[i]; 
} 

int main(int argc, char ** argv) 
{ 
cudaStream_t stream0, stream1,stream2,stream3; 
cudaStreamCreate(&stream0); 
cudaStreamCreate(&stream1); 
cudaStreamCreate(&stream2); 
cudaStreamCreate(&stream3); 
wbArg_t args; 
int inputLength; 

float *h_A, *h_B, *h_C; 
float *d_A0, *d_B0, *d_C0; 
float *d_A1, *d_B1, *d_C1; 
float *d_A2, *d_B2, *d_C2; 
float *d_A3, *d_B3, *d_C3; 

args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
h_C = (float *) malloc(inputLength * sizeof(float)); 
wbTime_stop(Generic, "Importing data and creating memory on host"); 
wbLog(TRACE, "The input length is ", inputLength); 
wbLog(TRACE, "h_A ", *h_A); 
wbLog(TRACE, "h_B", *h_B); 


int size = inputLength * sizeof(float); 
int SegSize = inputLength/4; 


wbCheck(cudaMalloc((void **) &d_A0, size)); 
wbCheck(cudaMalloc((void **) &d_B0, size)); 
wbCheck(cudaMalloc((void **) &d_C0, size)); 

wbCheck(cudaMalloc((void **) &d_A1, size)); 
wbCheck(cudaMalloc((void **) &d_B1, size)); 
wbCheck(cudaMalloc((void **) &d_C1, size)); 

wbCheck(cudaMalloc((void **) &d_A2, size)); 
wbCheck(cudaMalloc((void **) &d_B2, size)); 
wbCheck(cudaMalloc((void **) &d_C2, size)); 

wbCheck(cudaMalloc((void **) &d_A3, size)); 
wbCheck(cudaMalloc((void **) &d_B3, size)); 
wbCheck(cudaMalloc((void **) &d_C3, size)); 


cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault); 


dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1); 
dim3 DimBlock(256 , 1, 1); 


for (int i=0; i<size; i+=inputLength*4) 
{ 
cudaMemcpyAsync(d_A0, h_A+i, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream0); 
cudaMemcpyAsync(d_B0, h_B+i, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream0); 

cudaMemcpyAsync(d_A1, h_A+i+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice,stream1); 
cudaMemcpyAsync(d_B1, h_B+i+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice,stream1); 

cudaMemcpyAsync(d_A2, h_A+i+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream2); 
cudaMemcpyAsync(d_B2, h_B+i+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream2); 

cudaMemcpyAsync(d_A3, h_A+i+SegSize+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream3); 
cudaMemcpyAsync(d_B3, h_B+i+SegSize+SegSize+SegSize, SegSize*sizeof(float),cudaMemcpyHostToDevice, stream3); 

vecAdd<<<DimGrid, DimBlock, 0, stream0>>>(d_A0, d_B0, d_C0,inputLength); 
vecAdd<<<DimGrid, DimBlock, 0, stream1>>>(d_A1, d_B1, d_C1,inputLength); 
vecAdd<<<DimGrid, DimBlock, 0, stream2>>>(d_A2, d_B2, d_C2,inputLength); 
vecAdd<<<DimGrid, DimBlock, 0, stream3>>>(d_A3, d_B3, d_C3,inputLength); 


cudaDeviceSynchronize(); 


cudaMemcpyAsync(h_C+i, d_C0, SegSize*sizeof(float),cudaMemcpyDeviceToHost, stream0); 
cudaMemcpyAsync(h_C+i+SegSize, d_C1, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream1); 
cudaMemcpyAsync(h_C+i+SegSize+SegSize, d_C2, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream2); 
cudaMemcpyAsync(h_C+i+SegSize+SegSize+SegSize, d_C3, SegSize*sizeof(float),cudaMemcpyDeviceToHost,stream3); 
    wbLog(TRACE, "on addition is ", *h_C); 

} 

cudaFree(d_A0); 
cudaFree(d_B0); 
cudaFree(d_C0); 

cudaFree(d_A1); 
cudaFree(d_B1); 
cudaFree(d_C1); 

cudaFree(d_A2); 
cudaFree(d_B2); 
cudaFree(d_C2); 

cudaFree(d_A3); 
cudaFree(d_B3); 
cudaFree(d_C3); 

wbSolution(args, h_C, inputLength); 
cudaFreeHost(h_A); 
cudaFreeHost(h_B); 
cudaFreeHost(h_C); 

return 0; 
}

來源

2014-03-01 Kaushik Velusamy

我也在coursera課程中嘗試相同的問題。你沒有檢查for循環中的邊界條件嗎？如果inputLength不是segmentSize的倍數呢？ –

的一個問題是，你是如何處理h_A，h_B和h_C：

h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength);

上面的代碼行是爲h_A和h_B創造的分配和導入一些數據（可能）。

這些代碼行：

cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault);

是不是做你的想法。他們正在爲h_A,h_B和h_C創建新的分配。不管這些指針先前引用的數據是否不再可以從這些指針訪問（即，對於所有意圖和目的，它都會丟失）。

CUDA應該可以工作的很好的指針和分配在這裏被創造：

h_A = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
h_B = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
h_C = (float *) malloc(inputLength * sizeof(float));

因此刪除這些代碼：

cudaHostAlloc((void **) &h_A, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_B, size, cudaHostAllocDefault); 
cudaHostAlloc((void **) &h_C, size, cudaHostAllocDefault);

和刪除這些：

cudaFreeHost(h_A); 
cudaFreeHost(h_B); 
cudaFreeHost(h_C);

你應該更接近解決方案。

來源

2014-03-01 23:00:56

謝謝Robert Crovella。您的解決方案奏效 –

我可能應該指出，這將打破異步複製功能，使事物同步。您將需要'cudaHostAlloc'內存分配來啓用異步複製。在這種情況下，您應該創建新的指針「h_Ap」，「h_Bp」，「h_Cp」等，並將數據複製到這些分配中。在異步副本中使用這些指針。在使用任何已被複制回主機的結果，即使用異步拷貝返回到「h_Cp」之前，你需要一個類似'cudaDeviceSynchronize（）'的同步步驟。 –

使用流的CUDA中的矢量添加

回答

相關問題