-1
我是CUDA/GPU的新手,我在將數據從設備複製回主機時遇到問題。我正在開發帶有CUDA Toolkit 6.5的Jetson TK1。它成功構建,但在運行時發生錯誤。我的代碼如下:從設備到主機的cudaMemcpy中的無效參數錯誤
//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);
int main() {
int data_length = 1024000;
const int length=512;
const size_t size= length;
double signalA[length], signalB[length], signalC[length];
for (int i=0; i<data_length; i++)
{
double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
if(i==0)
{
for(int k=0; k<length; k++)
{
signalA[k]=v_ia[k];
signalB[k]=v_ib[k];
signalC[k]=v_ic[k];
}
i=length-1;
}
else
{
//allocate memory in GPU and kernel call for phase A
allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalA[length-1]=v_ia[i];
//allocate memory in GPU and kernel call for phase B
allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalB[length-1]=v_ib[i];
//allocate memory in GPU and kernel call for phase C;
allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost));
signalC[length-1]=v_ic[i];
//memory cleaning
checkCudaErrors(cudaFree(d_inputCurrentIa));
checkCudaErrors(cudaFree(d_inputCurrentIb));
checkCudaErrors(cudaFree(d_inputCurrentIc));
checkCudaErrors(cudaFree(d_outputCurrentIa));
checkCudaErrors(cudaFree(d_outputCurrentIb));
checkCudaErrors(cudaFree(d_outputCurrentIc));
}
而且我的內核和功能都很簡單,他們只是移動數組元素左邊每次:
__global__ void allocate_kernel(double* const d_in, double* const d_out, const size_t size) {
__shared__ double shared[512];
int tid = threadIdx.x;
if(tid < size)
shared[tid] = d_in[tid];
__syncthreads();
if(tid < size-1)
d_out[tid]=shared[tid+1];
__syncthreads();
}
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size) {
const dim3 blockSize(512);
const dim3 gridSize(1);
checkCudaErrors(cudaFree(0));
checkCudaErrors(cudaMalloc((void **)&d_inputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMalloc((void **)&d_outputCurrent, sizeof(double) * size));
checkCudaErrors(cudaMemset(d_outputCurrent, 0, sizeof(double) * size));
checkCudaErrors(cudaMemcpy(d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
allocate_kernel<<<gridSize, blockSize>>>(d_inputCurrent, d_outputCurrent, size);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
這是我的博士論文的一小部分,我正在用這段代碼練習CUDA,我知道它現在並不那麼有意義,但我無法進一步行動,因爲我對這個問題非常困惑。任何幫助將不勝感激,在此先感謝。
感謝您快速回復@Robert。顯然,我對編碼(特別是_pointers_ :))並不熟悉,只是想在很短的時間內找出一些GPU編程。我遵循你的步驟,它的工作,但這次我在運行時遇到「總線錯誤」。你所說的一切都是正確的,所以我刪除了'const',在主函數中分配了所有內容,並在最後釋放了指針。現在它工作得很好。如果有人感興趣,我可以發佈改進的答案。 – schloxy