2015-11-30 68 views
-1

我是CUDA/GPU的新手,在將數據從設備複製回主機時遇到了問題:從設備到主機的cudaMemcpy報出「無效參數」錯誤。我正在帶有CUDA Toolkit 6.5的Jetson TK1上進行開發。代碼可以成功構建,但在運行時發生錯誤。我的代碼如下:

//main.cu 
// Forward declaration of allocate(). Both device-pointer parameters are
// declared `double* const`, i.e. the pointers themselves are passed by value.
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size); 

// Sliding-window driver: keeps a `length`-sample window per phase (A, B, C).
// For each new sample index i it calls allocate() (which launches the
// left-shift kernel), copies the result back into the window, and then stores
// the newest sample in the window's last slot.
// NOTE(review): v_ia / v_ib / v_ic are not declared in this excerpt —
// presumably host arrays holding at least data_length samples each; confirm.
int main() { 
    int data_length = 1024000; 
    const int length=512; 
    // Same value as `length`, as size_t for the CUDA byte-count arithmetic.
    const size_t size= length; 

    // Host-side windows, one per phase.
    double signalA[length], signalB[length], signalC[length]; 

for (int i=0; i<data_length; i++) 
{ 

    // Device pointers are declared fresh (and uninitialized) on every iteration.
    double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc; 
    double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc; 

    if(i==0) 
    { 
     // First pass: fill the three windows with the first `length` samples.
     for(int k=0; k<length; k++) 
     { 
      signalA[k]=v_ia[k]; 
      signalB[k]=v_ib[k]; 
      signalC[k]=v_ic[k]; 
     } 
     // Skip ahead so that the loop's i++ resumes at i == length.
     i=length-1; 
    } 
    else 
    { 
     // NOTE(review): allocate() receives the device pointers by value, so the
     // addresses it obtains from cudaMalloc never reach the variables declared
     // above. Every later use of d_input*/d_output* in this branch therefore
     // operates on an uninitialized pointer.

     //allocate memory in GPU and kernel call for phase A 
     allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size); 
     cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 

     // Intended to copy the shifted window back to the host; with the pointer
     // still uninitialized (see note above) this is the call that fails at
     // run time with the invalid-argument error.
     checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost)); 
     // Append the newest phase-A sample at the end of the window.
     signalA[length-1]=v_ia[i]; 

     //allocate memory in GPU and kernel call for phase B 
     allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size); 
     cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 

     checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost)); 
     // Append the newest phase-B sample at the end of the window.
     signalB[length-1]=v_ib[i]; 

     //allocate memory in GPU and kernel call for phase C; 
     allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size); 
     cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 

     checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost)); 
     // Append the newest phase-C sample at the end of the window.
     signalC[length-1]=v_ic[i]; 

     //memory cleaning 
     // NOTE(review): these frees also receive the uninitialized pointers; the
     // buffers actually allocated inside allocate() are not released here.
     checkCudaErrors(cudaFree(d_inputCurrentIa)); 
     checkCudaErrors(cudaFree(d_inputCurrentIb)); 
     checkCudaErrors(cudaFree(d_inputCurrentIc)); 
     checkCudaErrors(cudaFree(d_outputCurrentIa)); 
     checkCudaErrors(cudaFree(d_outputCurrentIb)); 
     checkCudaErrors(cudaFree(d_outputCurrentIc)); 
    } 

我的內核和函數都很簡單,它們只是每次把數組元素向左移動一位:

// Shifts the input one element to the left: d_out[i] = d_in[i + 1] for every
// i with i + 1 < size. The last element, d_out[size - 1], is not written by
// this kernel.
//
// Launch layout: indexes with threadIdx.x only, so it expects a single 1-D
// block. Data is staged in a static 512-element shared array, so at most the
// first 512 elements of d_in are processed regardless of `size`.
//
// d_in  : device buffer to read, at least min(size, 512) doubles
// d_out : device buffer to write, at least min(size, 512) doubles
// size  : number of valid elements in d_in
__global__ void allocate_kernel(double* const d_in, double* const d_out, const size_t size) {

    __shared__ double shared[512];

    const size_t tid = threadIdx.x;

    // Never index past the staging buffer, even if size or blockDim.x exceeds 512.
    const size_t count = size < 512 ? size : 512;

    if (tid < count)
        shared[tid] = d_in[tid];
    // Barrier sits outside the branch so every thread in the block reaches it.
    __syncthreads();

    // Written as tid + 1 < count rather than tid < size - 1: size is unsigned,
    // so size - 1 would wrap to a huge value when size == 0 and let every
    // thread write d_out (and thread 511 read shared[512]).
    if (tid + 1 < count)
        d_out[tid] = shared[tid + 1];
}


// Host wrapper: allocates an input and an output device buffer of `size`
// doubles, zeroes the output buffer, uploads `signal` into the input buffer,
// and launches allocate_kernel with one block of 512 threads.
//
// NOTE(review): d_inputCurrent and d_outputCurrent are pointers passed by
// value. cudaMalloc stores the new addresses in these local copies only, so
// the caller's pointer variables are not updated, and nothing in this function
// frees the buffers it allocates.
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size) { 

    // Launch configuration: a single block of 512 threads.
    const dim3 blockSize(512); 
    const dim3 gridSize(1); 

    // Freeing a null pointer does no deallocation; presumably used here to
    // force CUDA context creation before the first real call — confirm intent.
    checkCudaErrors(cudaFree(0)); 

    // &d_inputCurrent / &d_outputCurrent are the addresses of this function's
    // own parameter copies, not of the caller's variables.
    checkCudaErrors(cudaMalloc((void **)&d_inputCurrent, sizeof(double) * size)); 
    checkCudaErrors(cudaMalloc((void **)&d_outputCurrent, sizeof(double) * size)); 

    // Zero the output buffer; the kernel does not write its last element.
    checkCudaErrors(cudaMemset(d_outputCurrent, 0, sizeof(double) * size)); 

    // Upload the host window to the device input buffer.
    checkCudaErrors(cudaMemcpy(d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice)); 

    allocate_kernel<<<gridSize, blockSize>>>(d_inputCurrent, d_outputCurrent, size); 
    // Wait for the kernel to finish, then report any error recorded so far.
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 
} 

這是我的博士論文的一小部分,我正在用這段代碼練習CUDA,我知道它現在並不那麼有意義,但我無法進一步行動,因爲我對這個問題非常困惑。任何幫助將不勝感激,在此先感謝。

回答

1

在C中,你不能把一個指針按值傳遞給函數,讓該函數修改這個指針,然後期望對指針的修改在調用環境中體現出來:

double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc; 
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc; 

... 
    //allocate memory in GPU and kernel call for phase A 

// at this point, d_inputCurrentIa and d_outputCurrentIa are pointing to nothing 
    allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size); 
// allocate modified those pointers internally, but the modified values don't show up here 
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError()); 

    checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost)); 
// therefore you will get an error here, because d_outputCurrentIa still points to nothing 

有很多方法可以解決這個問題。一種方法是傳遞你要修改和使用的指針的地址:

void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size); 

double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc; 
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc; 

... 
    //allocate memory in GPU and kernel call for phase A 
    allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size); 
... 
// Allocates two device buffers of `size` doubles, hands their addresses back
// to the caller, uploads `signal`, and runs allocate_kernel on them.
//
// d_inputCurrent  : address of the caller's input device pointer; receives a
//                   newly cudaMalloc'ed buffer that is filled from `signal`.
// d_outputCurrent : address of the caller's output device pointer; receives a
//                   newly cudaMalloc'ed buffer that is zeroed before the launch.
// signal          : host array of at least `size` doubles.
// size            : element count of each buffer.
//
// Nothing is freed here; releasing both buffers is left to the caller.
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {

    const size_t numBytes = sizeof(double) * size;

    // Launch configuration: one block of 512 threads.
    const dim3 threadsPerBlock(512);
    const dim3 blocksPerGrid(1);

    // Freeing a null pointer releases nothing; presumably kept to trigger
    // CUDA context creation up front — confirm intent.
    checkCudaErrors(cudaFree(0));

    // cudaMalloc writes through the caller-supplied addresses, so the new
    // device pointers are visible to the caller after this function returns.
    checkCudaErrors(cudaMalloc((void **)d_inputCurrent, numBytes));
    checkCudaErrors(cudaMalloc((void **)d_outputCurrent, numBytes));

    double* const devIn = *d_inputCurrent;
    double* const devOut = *d_outputCurrent;

    checkCudaErrors(cudaMemset(devOut, 0, numBytes));
    checkCudaErrors(cudaMemcpy(devIn, signal, numBytes, cudaMemcpyHostToDevice));

    allocate_kernel<<<blocksPerGrid, threadsPerBlock>>>(devIn, devOut, size);

    // Wait for the kernel to finish, then report any error recorded so far.
    cudaDeviceSynchronize();
    checkCudaErrors(cudaGetLastError());
}

注:

  1. 不知道你爲什麼要把這些指針標記爲const。它們在任何意義上都不是const(該函數會修改指針本身以及它所指向的數據)。

  2. 這段代碼是直接在瀏覽器中編寫的,你可能還需要修正一些其他地方。由於你沒有提供可供使用的完整代碼,我也沒有提供完整的代碼。但這應該可以作爲一個路線圖。

  3. 在函數中分配內存很可能埋下內存泄漏的隱患,你可能需要仔細考慮一下這一點。如果你要重複使用或反覆創建這些指針,一定要有釋放它們的計劃。

+0

感謝您快速回復@Robert。顯然,我對編碼(特別是_pointers_ :))並不熟悉,只是想在很短的時間內找出一些GPU編程。我遵循你的步驟,它的工作,但這次我在運行時遇到「總線錯誤」。你所說的一切都是正確的,所以我刪除了'const',在主函數中分配了所有內容,並在最後釋放了指針。現在它工作得很好。如果有人感興趣,我可以發佈改進的答案。 – schloxy

相關問題