我正在努力解決一些內存管理問題。將結果複製到主機時,我一直收到「未指定的啓動失敗」。CUDA內存管理/類問題指針
我的代碼很簡單 - 它在每個線程中生成兩個提示並將它們相乘。 我有類提供一個隨機數:
class CuRandCuRandomNumberProvider :
{
public:
CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock);
CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock, unsigned int seed);
__device__ unsigned int GetRandomNumber();
~CuRandCuRandomNumberProvider();
protected:
curandState * states;
__device__ bool IsPrime(unsigned int number);
};
CuRandCuRandomNumberProvider::CuRandCuRandomNumberProvider(dim3 numBlocks, dim3 threadsPerBlock)
{
int numberOfThreads = threadsPerBlock.x * threadsPerBlock.y * numBlocks.x * numBlocks.y;
std::cout << numberOfThreads << std::endl;
cudaMalloc (&this->states, numberOfThreads*sizeof(curandState));
setup_kernel <<< numBlocks, threadsPerBlock >>> (this->states, time(NULL));
}
__device__ unsigned int CuRandCuRandomNumberProvider::GetRandomNumber()
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
register float r = curand_uniform(&this->states[offset]);
return 0 + ((double)UINT_MAX) * r;
}
setup_kernel存儲在頭文件,看起來像這樣:
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
curand_init (seed, offset, 0, &state[offset]);
}
我的主要核心是非常簡單的,看起來像這樣:
最後cudaMemcpy導致問題的主執行是:
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads);
memset(pqnH,0,sizeof(uint3) * numberOfThreads);
HANDLE_ERROR(cudaMalloc((void**)&pqnD, sizeof(uint3) * numberOfThreads));
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock);
InitKernel<<<numBlocks, threadsPerBlock>>>(pqnD, provider);
HANDLE_ERROR(cudaMemcpy(pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost)); // this line causes error
HANDLE_ERROR(cudaFree(pqnD));
如果我做的一切explicily,如:
uint3 * pqnD;
uint3 * pqnH = (uint3*)malloc(sizeof(uint3) * numberOfThreads);
memset(pqnH,0,sizeof(uint3) * numberOfThreads);
HANDLE_ERROR(cudaMalloc((void**)&pqnD, sizeof(uint3) * numberOfThreads));
curandState * states;
cudaMalloc (&states, numberOfThreads*sizeof(curandState));
setup_kernel <<< numBlocks, threadsPerBlock >>> (states, time(NULL));
CuRandCuRandomNumberProvider * provider = new CuRandCuRandomNumberProvider(numBlocks, threadsPerBlock, states);
InitKernel2<<<numBlocks, threadsPerBlock>>>(pqnD, states);
HANDLE_ERROR(cudaMemcpy(pqnH, pqnD, sizeof(uint3) * numberOfThreads, cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaFree(pqnD));
哪裏setup_kernel是完全一樣的,並InitKernel2樣子:
__global__ void InitKernel2(uint3 * ptr, curandState * states)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
ptr[offset].x = GetRandomNumber(states);
ptr[offset].y = GetRandomNumber(states);
ptr[offset].z = ptr[offset].x * ptr[offset].y;
}
和getRandomNumber的是:
__device__ unsigned int GetRandomNumber(curandState * states)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
register float r = curand_uniform(&states[offset]);
return 0 + ((double)UINT_MAX) * r;
}
一切正常作爲魅力。有沒有人有線索我做錯了什麼?我一直在掙扎幾個小時。我的事情可能是內存管理或指針傳遞的東西,但我不知道它會是什麼。
請幫忙:)!
你應該爲這樣的問題提供一個MCVE。 – 2014-12-07 23:18:49