2016-02-05 17 views
1

我是CUDA的新手。當我乘1024x1024的矩陣,並啓動內核:爲什麼我在CUDA程序中得到「未指定的啓動失敗」,將兩個矩陣相乘

multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size); 

但是,當我乘2048×2048矩陣, 爲dim3(64,64,1) 我得到這個錯誤:

cudaDeviceSynchronize returned error code 4 after launching addKernel! 
unspecified launch failure 

從代碼修修補補,我認爲錯誤是在此聲明

result += a[row * size + ind] * b[col + size * ind]; 

在部分

b[col+size*ind] 

如果我拿出來,我沒有得到一個內核啓動錯誤(顯然是錯誤的答案)。我無法弄清楚什麼是錯的。任何建議將不勝感激。 我正在使用Visual Studio 2013.我正在使用調試器,但這並不能幫助我找到錯誤。

這似乎是一個類似的問題: cudaDeviceSynchronize returned error code 4 after launching

千恩萬謝,這裏是代碼:

cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Computes c = a * b for square size x size matrices of ints stored
// row-major in global memory, one output element per thread.
//
// Expected launch: a 2-D grid/block configuration covering at least
// size x size threads (extra threads are rejected by the bounds guard).
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Fix: the guard must be >=, not >. Valid indices are 0..size-1, so a
    // thread with row == size or col == size is already out of bounds; the
    // original `>` let such threads read and write one row/column past the
    // buffers whenever the grid overshot the matrix, which triggers
    // "unspecified launch failure".
    if (row >= size || col >= size) return;

    int result = 0;
    for (int ind = 0; ind < size; ++ind) {
        // a[row*size + ind] walks along row `row` of a (same address for the
        // whole warp when row is warp-uniform); b[col + size*ind] walks down
        // column `col` of b — adjacent threads (adjacent col) touch adjacent
        // addresses, so these loads coalesce.
        result += a[row * size + ind] * b[col + size * ind];
    }

    // Flattened 1-D position of element (row, col) in the output.
    c[row * size + col] = result;
}

// Host driver: builds two random 0/1 matrices, multiplies them on the GPU,
// and resets the device before exiting (so profilers see a complete trace).
int main() {

    const int sizeMatrix = 2048;

    // Host-side buffers for the two operands and the product.
    int* a = new int[sizeMatrix * sizeMatrix];
    int* b = new int[sizeMatrix * sizeMatrix];
    int* c = new int[sizeMatrix * sizeMatrix];

    // Fill the inputs with 0/1 values (keeps products small enough
    // that int accumulation cannot overflow: max element is sizeMatrix).
    for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    int exitCode = 0;

    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
    if (cudaStatus != cudaSuccess) {
        // Fix: the original message said "addWithCuda" (copy-paste from the
        // SDK template); the function actually called is multiplyWithCuda.
        fprintf(stderr, "multiplyWithCuda failed!\n");
        exitCode = 1;
    } else {
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!\n");
            exitCode = 1;
        }
    }

    // Fix: the original leaked all three buffers on every path (early
    // returns skipped cleanup, and even the success path never freed them).
    delete[] a;
    delete[] b;
    delete[] c;

    return exitCode;
}


// Allocates device buffers, copies a and b to the GPU, launches
// multiplyKernel over a size x size problem, and copies the product back
// into c. Returns cudaSuccess, or the first CUDA error encountered.
//
// Preconditions: a, b, and c each point to size*size ints. Any size works:
// the grid is computed by ceiling division over the fixed 32x32 block, and
// the kernel's bounds guard discards the overhang threads.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Fix: start the device pointers at nullptr so the unconditional
    // cudaFree calls at Error: are safe even when we jump there before the
    // corresponding cudaMalloc ran (the originals were uninitialized, so an
    // early failure freed garbage pointers). cudaFree(nullptr) is a no-op.
    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    cudaError_t cudaStatus;

    // Hoist the byte count; size_t avoids 32-bit overflow for large sizes.
    const size_t bytes = (size_t)size * size * sizeof(int);

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    fprintf(stdout, "device set\n");
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for c allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for a allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for b allocated \n");

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy a done \n");

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy b done\n");

    fprintf(stdout, "about to launch kernel \n");

    // Launch one thread per output element. Fix: the original hard-coded a
    // dim3(64,64,1) grid, which is only correct for size == 2048; compute
    // the grid from size with ceiling division instead.
    {
        dim3 block(32, 32, 1);
        dim3 grid((size + block.x - 1) / block.x,
                  (size + block.y - 1) / block.y, 1);
        multiplyKernel<<<grid, block>>>(dev_c, dev_a, dev_b, size);
    }

    fprintf(stdout, "kernel launched\n");

    // Check for launch-configuration errors (bad grid/block, etc.).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        // Fix: the original message named "addKernel" (template leftover).
        fprintf(stderr, "multiplyKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish and surface any execution errors.
    // NOTE(review): on Windows/WDDM a debug-build kernel this large can
    // exceed the TDR watchdog timeout and report "unspecified launch
    // failure" — raise the TDR delay in Nsight Monitor if that happens.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching multiplyKernel!\n", cudaStatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the product back from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    // Single cleanup path; all pointers are either valid or nullptr.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
+0

所以,你是否嘗試過這個類似的問題的解決方案? – Drop

+2

您可能會在Windows上遇到WDDM [TDR超時](http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm)。您的代碼對我來說運行正常(即沒有運行時錯誤)。如果你將它作爲一個調試項目來構建(很可能,因爲你在調試中運行它),那麼內核需要更長的時間。 –

+0

是的,就是這樣。我將nsight監視器中的WDDM TDR延遲更新爲10秒,現在運行良好。非常感謝你,我永遠不會找到它。 –

回答