我想分配設備內存,複製到它,在GPU上執行計算,將結果複製回來,然後釋放我分配的設備內存。我想確保我沒有超過極限,我想看看是否在共享內存空間中有足夠的內存來轉儲一些數組。爲什麼CudaFree似乎沒有釋放內存?
當我分配設備內存時,不會返回任何錯誤。當我使用cudaMemGetInfo
來檢查分配的內存量時,它看起來像一個cudaMalloc
尚未分配任何內存。 當我嘗試釋放內存時,它看起來只有一個指針被釋放。
我正在使用matlab Mexfunction
接口來設置GPU內存並啓動內核。在這一點上,我甚至沒有調用內核,只是返回一個單位矩陣的結果。
cudaError_t cudaErr;
size_t freeMem = 0;
size_t totalMem = 0;
size_t allocMem = 0;
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Memory avaliable: Free: %lu, Total: %lu\n",freeMem, totalMem);
/* Pointers for the device memory */
double *devicePulseDelay, *deviceTarDistance, *deviceScattDistance, *deviceScatterers;
double *deviceReceivedReal, *deviceReceivedImag;
/* Allocate memory on the device for the arrays. */
mexPrintf("Allocating memory.\n");
cudaErr = cudaMalloc((void **) &devicePulseDelay, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceTarDistance, sizeof(double)*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScattDistance, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceScatterers, sizeof(double)*999);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedReal, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMalloc((void **) &deviceReceivedImag, sizeof(double)*999*512);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not allocate memory to deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n", allocMem, totalMem,(freeMem - allocMem));
/* copy the input arrays across to the device */
mexPrintf("\nCopying memory.\n");
cudaErr = cudaMemcpy(devicePulseDelay, pulseDelay, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceTarDistance, tarDistance, sizeof(double)*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScattDistance, scattDistance, sizeof(double)*999*512,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(deviceScatterers, scatterers, sizeof(double)*999,cudaMemcpyHostToDevice);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* call the kernel */
// launchKernel<<<1,512>>>(........);
/* retireve the output */
cudaErr = cudaMemcpy(receivedReal, deviceReceivedReal, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedReal: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
cudaErr = cudaMemcpy(receivedImag, deviceReceivedImag, sizeof(double)*512*512,cudaMemcpyDeviceToHost);
if (cudaErr != cudaSuccess)
{
mexPrintf("could not copy to receivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("receivedImag: Memory avaliable: Free: %lu, Total: %lu, Consumed: %lu\n",allocMem, totalMem,(freeMem - allocMem));
/* free the memory. */
mexPrintf("\nFree'ing memory.\n");
cudaMemGetInfo(&freeMem, &totalMem);
mexPrintf("Before freeing: Free %lu, Total: %lu\n", freeMem, totalMem);
cudaErr = cudaFree(devicePulseDelay);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free devicePulseDelay\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("devicePulseDelay: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceTarDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceTarDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceTarDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScattDistance);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScattDistance\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScattDistance: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceScatterers);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceScatterers\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceScatterers: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedReal);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedReal\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedReal: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
cudaErr = cudaFree(deviceReceivedImag);
if (cudaErr != cudaSuccess)
{
mexPrintf("could free deviceReceivedImag\n");
mexPrintf("Error: %s\n",cudaGetErrorString(cudaErr));
}
cudaMemGetInfo(&allocMem, &totalMem);
mexPrintf("deviceReceivedImag: Memory avaliable: Free: %lu, Total: %lu, Free'd: %lu\n",allocMem, totalMem,(allocMem - freeMem));
下面是從這個輸出:
Memory avaliable: Free: 2523959296, Total: 2818572288 Allocating memory. devicePulseDelay: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceTarDistance: Memory avaliable: Free: 2522910720, Total: 2818572288, Consumed: 1048576 deviceScattDistance: Memory avaliable: Free: 2518716416, Total: 2818572288, Consumed: 5242880 deviceScatterers: Memory avaliable: Free: 2517667840, Total: 2818572288, Consumed: 6291456 deviceReceivedReal: Memory avaliable: Free: 2515570688, Total: 2818572288, Consumed: 8388608 deviceReceivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Copying memory. devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 deviceScatterers: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedReal: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 receivedImag: Memory avaliable: Free: 2513473536, Total: 2818572288, Consumed: 10485760 Free'ing memory. Before freeing: Free 2513473536, Total: 2818572288 devicePulseDelay: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceTarDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScattDistance: Memory avaliable: Free: 2513473536, Total: 2818572288, Free'd: 0 deviceScatterers: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedReal: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576 deviceReceivedImag: Memory avaliable: Free: 2514522112, Total: 2818572288, Free'd: 1048576
我覺得有一些明顯的我失蹤。任何人都可以幫助解釋發生了什麼?
編輯:平臺是Windows 7與特斯拉C2050 GPu卡。
你在哪個平臺上運行這個代碼? – talonmies
嘗試在每次調用cudaMemGetInfo()之前將allocMem和totalMem的值清零並檢查cudaMemGetInfo()的返回值。 –
在每次調用cudaMemGetInfo()之前清零allocMem和totalMem沒有任何區別。此外,cudaMemGetInfo調用沒有返回任何錯誤。僅供參考,我的平臺是Tesla C2050 GPU卡上的Windows 7。 –