I looked at simpleMultiCopy.cu in the CUDA SDK 4.0 and wrote my own version; the code is below. simpleMultiCopy.cu is an example of overlapping operations inside a loop. Like mine, it sends a chunk of data to the GPU and computes on it in each iteration of the loop, and that is where I try to do the overlapping. This is just a test/demo, so please ignore the logic of the kernel (increment_kernel); it is only there to burn some time. The main logic is in the processWithStreams function. But the program does not work correctly and produces this output:
i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.
Line 132 is:
CUDA_SAFE_CALL(cudaMemcpyAsync(
    d_data_in[next_stream],
    h_data_in[next_stream],
    memsize,
    cudaMemcpyHostToDevice,
    stream[next_stream])); //this is line 132
I don't really understand how CUDA works under the hood, so please share any ideas you have. Any help would be greatly appreciated.
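For what it is worth, below is a small, stand-alone sketch of the kind of explicit error checking I understand can be added right after a kernel launch, so that an asynchronous failure shows up there instead of at a later, unrelated call such as the cudaMemcpyAsync on line 132. The checkCuda helper and dummy_kernel are just placeholders I made up for this sketch; they are not from the SDK.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Placeholder helper (my own, not cutil's CUDA_SAFE_CALL): report and abort on error.
static void checkCuda(cudaError_t err, const char *where)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

__global__ void dummy_kernel(int *g_data)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] += 1;
}

int main()
{
    const int n = 1 << 20;   // one buffer of 1M ints, like memsize in my code
    int *d_data = 0;
    checkCuda(cudaMalloc(&d_data, n * sizeof(int)), "cudaMalloc");

    dummy_kernel<<<n / 512, 512>>>(d_data);

    // Catches launch/configuration errors immediately...
    checkCuda(cudaGetLastError(), "kernel launch");
    // ...and forces execution errors (e.g. out-of-bounds accesses) to be
    // reported here rather than by whichever API call happens to run next.
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaFree(d_data), "cudaFree");
    return 0;
}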
Code:
#include <stdio.h>
#include <cutil_inline.h>
float processWithStreams(int streams_used);
#define STREAM_COUNT 2
int N = 1 << 24;
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
dim3 block(512);
dim3 grid;
int memsize;
__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    //g_data[idx] = g_data[idx] + inc_value;

    // Dummy loop that just burns some time; the kernel logic itself does not matter.
    int i = blockDim.x * gridDim.x;
    for(; i > 0; i /= 2)
    {
        if(idx > i)
            g_data[idx]++;
    }
}
int main(int argc, char *argv[])
{
    if(cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice(cutGetMaxGflopsDeviceId());

    h_data_source = (int *)malloc(sizeof(int) * N);
    memset(h_data_source, 0, sizeof(int) * N);

    int i;
    memsize = 1024 * 1024 * sizeof(int);
    for(i = 0; i < STREAM_COUNT; i++)
    {
        CUDA_SAFE_CALL(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
        CUDA_SAFE_CALL(cudaMalloc(&d_data_in[i], memsize));
        CUDA_SAFE_CALL(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
        CUDA_SAFE_CALL(cudaMalloc(&d_data_out[i], memsize));
        CUDA_SAFE_CALL(cudaStreamCreate(&stream[i]));
        CUDA_SAFE_CALL(cudaEventCreate(&cycleDone[i]));
        cudaEventRecord(cycleDone[i], stream[i]);
    }
    CUDA_SAFE_CALL(cudaEventCreate(&start));
    CUDA_SAFE_CALL(cudaEventCreate(&stop));

    grid.x = N / block.x;
    grid.y = 1;

    float time1 = processWithStreams(STREAM_COUNT);
    printf("time: %f\n", time1);

    free(h_data_source);
    free(h_data_sink);
    for(i = 0; i < STREAM_COUNT; ++i) {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);
        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}
float processWithStreams(int streams_used) {
    int current_stream = 0;
    float time;

    cudaEventRecord(start, 0);
    for(int i = 0; i < N/1024/1024; ++i) {
        int next_stream = (current_stream + 1) % streams_used;
        printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_in[current_stream], 1);

        // Upload next frame
        CUDA_SAFE_CALL(cudaMemcpyAsync(
            d_data_in[next_stream],
            h_data_in[next_stream],
            memsize,
            cudaMemcpyHostToDevice,
            stream[next_stream]));
        CUDA_SAFE_CALL(cudaEventRecord(
            cycleDone[next_stream],
            stream[next_stream]));

        // Download current frame
        CUDA_SAFE_CALL(cudaMemcpyAsync(
            h_data_out[current_stream],
            d_data_out[current_stream],
            memsize,
            cudaMemcpyDeviceToHost,
            stream[current_stream]));
        CUDA_SAFE_CALL(cudaEventRecord(
            cycleDone[current_stream],
            stream[current_stream]));

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);
    cudaEventElapsedTime(&time, start, stop);
    return time;
}
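Also, in case the overlap capability of my GPU matters here, this is a small stand-alone check I put together (not part of the SDK sample) that, as far as I understand, reports whether the device can overlap copies with kernel execution:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed\n");
        return 1;
    }
    printf("device 0: %s\n", prop.name);
    // Non-zero if the device can overlap a memcpy with kernel execution.
    printf("deviceOverlap: %d\n", prop.deviceOverlap);
    // Number of copy engines; 2 means host-to-device and device-to-host
    // copies can also run at the same time.
    printf("asyncEngineCount: %d\n", prop.asyncEngineCount);
    return 0;
}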
Agreed on the retag. Deleting. – ChrisV 2011-06-08 15:42:09
Awesome, thanks a lot. Sorry about the wrong tag; I did indeed mean overlapping. I'm not very good with English words, and "overload" often just looks like "overlap" to me. Many thanks @chrisv or @chrisV or @ChrisV. – user435657 2011-06-09 01:52:27