我是CUDA的新手。當我乘1024x1024的矩陣,並啓動內核:爲什麼我在CUDA程序中得到「未指定的啓動失敗」,將兩個矩陣相乘
multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
但是,當我乘2048×2048矩陣, 爲dim3(64,64,1) 我得到這個錯誤:
cudaDeviceSynchronize returned error code 4 after launching addKernel!
unspecified launch failure
從代碼修修補補,我認爲錯誤是在此聲明
result += a[row * size + ind] * b[col + size * ind];
在部分
b[col+size*ind]
如果我拿出來,我沒有得到一個內核啓動錯誤(顯然是錯誤的答案)。我無法弄清楚什麼是錯的。任何建議將不勝感激。 我正在使用Visual Studio 2013.我正在使用調試器,但這並不能幫助我找到錯誤。
這似乎是一個類似的問題: cudaDeviceSynchronize returned error code 4 after launching
千恩萬謝,這裏是代碼:
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row > size || col > size) return;
// target field in 1-D
int z = row * size + col;
int result = 0;
for (int ind = 0; ind < size ; ++ind) {
result += a[row * size + ind] * b[col + size * ind];
}
c[z] = result;
}
int main(){
const int sizeMatrix = 2048;
int* a = new int[sizeMatrix * sizeMatrix];
int* b = new int[sizeMatrix * sizeMatrix];
int* c = new int[sizeMatrix * sizeMatrix];
for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
a[i] = rand() % 2;
b[i] = rand() % 2;
}
cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
return 1;
}
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
int *dev_a ;
int *dev_b;
int *dev_c;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
fprintf(stdout, "device set");
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for c allocated \n");
cudaStatus = cudaMalloc((void**)&dev_a, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for a allocated \n");
cudaStatus = cudaMalloc((void**)&dev_b, size * size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
fprintf(stdout, "buffer for b allocated \n");
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy a done \n");
cudaStatus = cudaMemcpy(dev_b, b, size * size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
fprintf(stdout, "cudaMemcpy b done\n");
fprintf(stdout, "about to launch kernel \n");
// Launch a kernel on the GPU with one thread for each element.
multiplyKernel << <dim3(64,64, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size);
fprintf(stdout, "kernel launched\n");
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
; fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
所以,你是否嘗試過這個類似的問題的解決方案? – Drop
您可能會在Windows上遇到WDDM [TDR超時](http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm)。您的代碼對我來說運行正常(即沒有運行時錯誤)。如果你將它作爲一個調試項目來構建(很可能,因爲你在調試中運行它),那麼內核需要更長的時間。 –
是的,就是這樣。我將nsight監視器中的WDDR TDR延遲更新爲10秒,現在運行良好。非常感謝你,我永遠不會找到它。 –