使用CUDA C添加兩個向量，啓動內核

我有作業要做異構並行編程。代碼由教學人員編寫，我們的職責是填寫//@@標記的區域。代碼應該使用CUDA C添加兩個向量。我已經嘗試了下面的解決方案，儘管程序執行時沒有錯誤，但反饋意味着代碼的輸出與預期結果不匹配。下面是代碼後，我加了什麼，我相信需要：使用CUDA C添加兩個向量，啓動內核

// MP 1 
#include <wb.h> 

__global__ void vecAdd(float* in1, float* in2, float* out, int len) { 
//@@ Insert code to implement vector addition here 
int i = threadIdx.x + blockDim.x * blockIdx.x; 
if (i < len) out[i] = in1[i] + in2[i]; 
} 



int main(int argc, char ** argv) { 
wbArg_t args; 
int inputLength; 
float * hostInput1; 
float * hostInput2; 
float * hostOutput; 
float * deviceInput1; 
float * deviceInput2; 
float * deviceOutput; 
//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ######## 
args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 

wbTime_stop(Generic, "Importing data and creating memory on host"); 

wbLog(TRACE, "The input length is ", inputLength); 

wbTime_start(GPU, "Allocating GPU memory."); 
//@@ Allocate GPU memory here 

cudaMalloc((void**)&deviceInput1 , size); 
cudaMalloc((void**)&deviceInput2 , size); 
cudaMalloc((void**)&deviceOutput , size); 
wbTime_stop(GPU, "Allocating GPU memory."); 

wbTime_start(GPU, "Copying input memory to the GPU."); 
//@@ Copy memory to the GPU here 
cudaMemcpy(deviceInput1, hostInput1, size, cudaMemcpyHostToDevice); 
cudaMemcpy(deviceInput2, hostInput2, size, cudaMemcpyHostToDevice); 
wbTime_stop(GPU, "Copying input memory to the GPU."); 

//@@ Initialize the grid and block dimensions here 
dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1); 
dim3 DimBlock(256 , 1, 1); 

wbTime_start(Compute, "Performing CUDA computation"); 
//@@ Launch the GPU Kernel here  
vecAdd<<<DimGrid , DimBlock>>>(deviceInput1 , deviceInput2 , deviceOutput , inputLength); 
cudaThreadSynchronize(); 
wbTime_stop(Compute, "Performing CUDA computation"); 

wbTime_start(Copy, "Copying output memory to the CPU"); 
//@@ Copy the GPU memory back to the CPU here 
cudaMemcpy(hostOutput, deviceOutput, size , cudaMemcpyDeviceToHost); 
wbTime_stop(Copy, "Copying output memory to the CPU"); 

wbTime_start(GPU, "Freeing GPU Memory"); 
//@@ Free the GPU memory here 
free(deviceInput1); 
free(deviceInput2); 
free(deviceOutput); 

wbTime_stop(GPU, "Freeing GPU Memory"); 

wbSolution(args, hostOutput, inputLength); 

free(hostInput1); 
free(hostInput2); 
free(hostOutput); 

return 0; 
}

來源

2012-12-09 mzn.rft

難道是coursera功課？ :) – ahmad

是的，它是和截止日期是今晚 –

使用'cudaFree（）'爲GPU指針，而不是'免費（）' – talonmies

向下移動你的代碼，其中inputLength變量已經得到了應有的價值。此外

args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 

//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ########

，做什麼建議在評論talonmies：更改此：

//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ######## 
args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float));

了這一點。

來源

2012-12-09 17:22:55 ahmad

謝謝你ahmad。其實我有兩個錯誤，第一個是talonmies提到我必須把cudaFree而不是免費的，另一個是我在你的答案中提到的int大小的位置，我認爲有一種表達我的感謝的方式是爲您的答案投票。 –

@ mazen.r.f享受雜項課程並與CUDA玩得開心:) – ahmad

@ mazen.r.f請接受答案。 –

感謝你們talonmies和艾哈邁德它們都有助於獲得其工作對我來說正確的答案，以及完整的答案（對於誰是有趣的）是以下幾點：

// MP 1 
#include <wb.h> 

__global__ void vecAdd(float* in1, float* in2, float* out, int len) { 
int i = threadIdx.x + blockDim.x * blockIdx.x; 
if (i < len) out[i] = in1[i] + in2[i]; 
} 



int main(int argc, char ** argv) { 
wbArg_t args; 
int inputLength; 
float * hostInput1; 
float * hostInput2; 
float * hostOutput; 
float * deviceInput1; 
float * deviceInput2; 
float * deviceOutput; 


args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 
int size = inputLength*sizeof(float); 

wbTime_stop(Generic, "Importing data and creating memory on host"); 

wbLog(TRACE, "The input length is ", inputLength); 

wbTime_start(GPU, "Allocating GPU memory."); 


cudaMalloc((void**)&deviceInput1 , size); 
cudaMalloc((void**)&deviceInput2 , size); 
cudaMalloc((void**)&deviceOutput , size); 
wbTime_stop(GPU, "Allocating GPU memory."); 

wbTime_start(GPU, "Copying input memory to the GPU."); 

cudaMemcpy(deviceInput1, hostInput1, size, cudaMemcpyHostToDevice); 
cudaMemcpy(deviceInput2, hostInput2, size, cudaMemcpyHostToDevice); 
wbTime_stop(GPU, "Copying input memory to the GPU."); 


dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1); 
dim3 DimBlock(256 , 1, 1); 

wbTime_start(Compute, "Performing CUDA computation"); 
//@@ Launch the GPU Kernel 

vecAdd<<<DimGrid , DimBlock>>>(deviceInput1 , deviceInput2 , deviceOutput , inputLength); 

cudaThreadSynchronize(); 
wbTime_stop(Compute, "Performing CUDA computation"); 

wbTime_start(Copy, "Copying output memory to the CPU"); 
//@@ Copy the GPU memory back to the CPU 
cudaMemcpy(hostOutput, deviceOutput, size , cudaMemcpyDeviceToHost); 
wbTime_stop(Copy, "Copying output memory to the CPU"); 

wbTime_start(GPU, "Freeing GPU Memory"); 
//@@ Free the GPU memory 
cudaFree(deviceInput1); 
cudaFree(deviceInput2); 
cudaFree(deviceOutput); 

wbTime_stop(GPU, "Freeing GPU Memory"); 

wbSolution(args, hostOutput, inputLength); 

free(hostInput1); 
free(hostInput2); 
free(hostOutput); 

return 0; 
}

來源

2012-12-12 18:38:30

使用CUDA C添加兩個向量，啓動內核

回答

相關問題