2012-12-09 48 views
0

我有作業要做異構並行編程。代碼由教學人員編寫,我們的職責是填寫//@@標記的區域。代碼應該使用CUDA C添加兩個向量。我已經嘗試了下面的解決方案,儘管程序執行時沒有錯誤,但反饋意味着代碼的輸出與預期結果不匹配。下面是代碼後,我加了什麼,我相信需要:使用CUDA C添加兩個向量,啓動內核

// MP 1 
#include <wb.h> 

__global__ void vecAdd(float* in1, float* in2, float* out, int len) { 
//@@ Insert code to implement vector addition here 
int i = threadIdx.x + blockDim.x * blockIdx.x; 
if (i < len) out[i] = in1[i] + in2[i]; 
} 



int main(int argc, char ** argv) { 
wbArg_t args; 
int inputLength; 
float * hostInput1; 
float * hostInput2; 
float * hostOutput; 
float * deviceInput1; 
float * deviceInput2; 
float * deviceOutput; 
//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ######## 
args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 

wbTime_stop(Generic, "Importing data and creating memory on host"); 

wbLog(TRACE, "The input length is ", inputLength); 

wbTime_start(GPU, "Allocating GPU memory."); 
//@@ Allocate GPU memory here 

cudaMalloc((void**)&deviceInput1 , size); 
cudaMalloc((void**)&deviceInput2 , size); 
cudaMalloc((void**)&deviceOutput , size); 
wbTime_stop(GPU, "Allocating GPU memory."); 

wbTime_start(GPU, "Copying input memory to the GPU."); 
//@@ Copy memory to the GPU here 
cudaMemcpy(deviceInput1, hostInput1, size, cudaMemcpyHostToDevice); 
cudaMemcpy(deviceInput2, hostInput2, size, cudaMemcpyHostToDevice); 
wbTime_stop(GPU, "Copying input memory to the GPU."); 

//@@ Initialize the grid and block dimensions here 
dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1); 
dim3 DimBlock(256 , 1, 1); 

wbTime_start(Compute, "Performing CUDA computation"); 
//@@ Launch the GPU Kernel here  
vecAdd<<<DimGrid , DimBlock>>>(deviceInput1 , deviceInput2 , deviceOutput , inputLength); 
cudaThreadSynchronize(); 
wbTime_stop(Compute, "Performing CUDA computation"); 

wbTime_start(Copy, "Copying output memory to the CPU"); 
//@@ Copy the GPU memory back to the CPU here 
cudaMemcpy(hostOutput, deviceOutput, size , cudaMemcpyDeviceToHost); 
wbTime_stop(Copy, "Copying output memory to the CPU"); 

wbTime_start(GPU, "Freeing GPU Memory"); 
//@@ Free the GPU memory here 
free(deviceInput1); 
free(deviceInput2); 
free(deviceOutput); 

wbTime_stop(GPU, "Freeing GPU Memory"); 

wbSolution(args, hostOutput, inputLength); 

free(hostInput1); 
free(hostInput2); 
free(hostOutput); 

return 0; 
} 
+0

難道是coursera功課? :) – ahmad

+0

是的,它是和截止日期是今晚 –

+3

使用'cudaFree()'爲GPU指針,而不是'免費()' – talonmies

回答

2

向下移動你的代碼,其中inputLength變量已經得到了應有的價值。此外

args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 

//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ######## 

,做什麼建議在評論talonmies:更改此:

//@@ i added ###### 
int size = inputLength*sizeof(float); 
//@@ ######## 
args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 

了這一點。

+0

謝謝你ahmad。其實我有兩個錯誤,第一個是talonmies提到我必須把cudaFree而不是免費的,另一個是我在你的答案中提到的int大小的位置,我認爲有一種表達我的感謝的方式是爲您的答案投票。 –

+0

@ mazen.r.f享受雜項課程並與CUDA玩得開心:) – ahmad

+0

@ mazen.r.f請接受答案。 –

1

感謝你們talonmies和艾哈邁德它們都有助於獲得其工作對我來說正確的答案,以及完整的答案(對於誰是有趣的)是以下幾點:

// MP 1 
#include <wb.h> 

__global__ void vecAdd(float* in1, float* in2, float* out, int len) { 
int i = threadIdx.x + blockDim.x * blockIdx.x; 
if (i < len) out[i] = in1[i] + in2[i]; 
} 



int main(int argc, char ** argv) { 
wbArg_t args; 
int inputLength; 
float * hostInput1; 
float * hostInput2; 
float * hostOutput; 
float * deviceInput1; 
float * deviceInput2; 
float * deviceOutput; 


args = wbArg_read(argc, argv); 

wbTime_start(Generic, "Importing data and creating memory on host"); 
hostInput1 = (float *) wbImport(wbArg_getInputFile(args, 0), &inputLength); 
hostInput2 = (float *) wbImport(wbArg_getInputFile(args, 1), &inputLength); 
hostOutput = (float *) malloc(inputLength * sizeof(float)); 
int size = inputLength*sizeof(float); 

wbTime_stop(Generic, "Importing data and creating memory on host"); 

wbLog(TRACE, "The input length is ", inputLength); 

wbTime_start(GPU, "Allocating GPU memory."); 


cudaMalloc((void**)&deviceInput1 , size); 
cudaMalloc((void**)&deviceInput2 , size); 
cudaMalloc((void**)&deviceOutput , size); 
wbTime_stop(GPU, "Allocating GPU memory."); 

wbTime_start(GPU, "Copying input memory to the GPU."); 

cudaMemcpy(deviceInput1, hostInput1, size, cudaMemcpyHostToDevice); 
cudaMemcpy(deviceInput2, hostInput2, size, cudaMemcpyHostToDevice); 
wbTime_stop(GPU, "Copying input memory to the GPU."); 


dim3 DimGrid((inputLength -1)/256 +1 , 1 , 1); 
dim3 DimBlock(256 , 1, 1); 

wbTime_start(Compute, "Performing CUDA computation"); 
//@@ Launch the GPU Kernel 

vecAdd<<<DimGrid , DimBlock>>>(deviceInput1 , deviceInput2 , deviceOutput , inputLength); 

cudaThreadSynchronize(); 
wbTime_stop(Compute, "Performing CUDA computation"); 

wbTime_start(Copy, "Copying output memory to the CPU"); 
//@@ Copy the GPU memory back to the CPU 
cudaMemcpy(hostOutput, deviceOutput, size , cudaMemcpyDeviceToHost); 
wbTime_stop(Copy, "Copying output memory to the CPU"); 

wbTime_start(GPU, "Freeing GPU Memory"); 
//@@ Free the GPU memory 
cudaFree(deviceInput1); 
cudaFree(deviceInput2); 
cudaFree(deviceOutput); 

wbTime_stop(GPU, "Freeing GPU Memory"); 

wbSolution(args, hostOutput, inputLength); 

free(hostInput1); 
free(hostInput2); 
free(hostOutput); 

return 0; 
}