我一直在使用cudaHostRegister和cudaHostUnregister函數,我注意到後者需要很長時間。即使將cudaHostUnregister與同一數據上的cudaMemcpy進行比較,也需要很長時間,即使沒有爲memcpy使用頁面鎖定內存。CUDA 4.0 - cudaHostUnregister很慢
我已經提出了以下的短節目:
#include <stdio.h>
#include <time.h>
#include <assert.h>
#include <stdlib.h>
static struct timespec tp;
static clockid_t clk = CLOCK_REALTIME;
static void tu_timer_start(void)
{
int res = clock_gettime(clk, &tp);
assert(!res);
}
static long long tu_timer_stop(void)
{
struct timespec tp_new;
long long elapsed;
int res = clock_gettime(clk, &tp_new);
assert(!res);
elapsed = 1000000000LL * (tp_new.tv_sec - tp.tv_sec) + tp_new.tv_nsec - tp.tv_nsec;
tp = tp_new;
return elapsed;
}
int main() {
const int length = 999424;
const int pagesize = 4096;
// Allocating page-aligned host data and filling it with zeroes.
int *paged, *locked;
posix_memalign((void**) &paged, pagesize, length * sizeof(int));
posix_memalign((void**) &locked, pagesize, length * sizeof(int));
memset(paged, 0, length * sizeof(int));
memset(locked, 0, length * sizeof(int));
// Allocating device data.
int *devPaged, *devLocked;
tu_timer_start();
printf("%20d\n", cudaMalloc(&devPaged, length * sizeof(int)));
printf("%20d\n", cudaMalloc(&devLocked, length * sizeof(int)));
printf("Initialization: %12lld ns\n", tu_timer_stop());
// Measure copy time with pageable data.
tu_timer_start();
printf("%20d\n", cudaMemcpy(devPaged, paged, length * sizeof(int), cudaMemcpyHostToDevice));
printf("Copy pageable: %12lld ns\n", tu_timer_stop());
// Measure time to page-lock host data.
tu_timer_start();
printf("%20d\n", cudaHostRegister(locked, length * sizeof(int), 0));
printf("Host register: %12lld ns\n", tu_timer_stop());
// Measure copy time with page-locked data.
tu_timer_start();
printf("%20d\n", cudaMemcpy(devLocked, locked, length * sizeof(int), cudaMemcpyHostToDevice));
printf("Copy page-locked: %12lld ns\n", tu_timer_stop());
// Measure time to release page-lock on host data.
tu_timer_start();
cudaHostUnregister(locked);
printf("Host unregister: %12lld ns\n", tu_timer_stop());
return 0;
}
這給出了在四核以下輸出英特爾I5 760不打印(每核心2.80千兆赫)用特斯拉C2050(具有CUDA返回代碼這裏):
Initialization: 81027005 ns
Copy pageable: 1263236 ns
Host register: 436132 ns
Copy page-locked: 706051 ns
Host unregister: 2139736 ns
這顯示我的問題。在我的實際程序中,情況更糟,我經常測量cudaHostUnregister,耗時約3460000 ns。這表明它在併發異步memcopies或內核運行方面表現不佳,除此之外,速度很慢。
爲什麼這個功能需要這麼長時間,有沒有辦法加快速度?它是否真的不能與memcopies和內核並行工作,如果是這樣,爲什麼不呢?
還是有一個更好的並行化memcopies和內核運行的方式?
交叉點http://forums.nvidia.com/index.php?showtopic=210296。 –