2016-02-05 17 views
1

我是CUDA的新手。當我乘1024x1024的矩陣,並啓動內核:爲什麼我在CUDA程序中得到「未指定的啓動失敗」,將兩個矩陣相乘

multiplyKernel << <dim3(32,32, 1), dim3(32, 32, 1) >> >(dev_c, dev_a, dev_b, size); 

但是,當我乘2048×2048矩陣, 爲dim3(64,64,1) 我得到這個錯誤:

cudaDeviceSynchronize returned error code 4 after launching addKernel! 
unspecified launch failure 

從代碼修修補補,我認爲錯誤是在此聲明

result += a[row * size + ind] * b[col + size * ind]; 

在部分

b[col+size*ind] 

如果我拿出來,我沒有得到一個內核啓動錯誤(顯然是錯誤的答案)。我無法弄清楚什麼是錯的。任何建議將不勝感激。 我正在使用Visual Studio 2013.我正在使用調試器,但這並不能幫助我找到錯誤。

這似乎是一個類似的問題: cudaDeviceSynchronize returned error code 4 after launching

千恩萬謝,這裏是代碼:

cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size);

// Computes c = a * b for square size x size matrices of ints stored
// row-major in global memory, one output element per thread.
//
// Expected launch: a 2-D grid/block configuration covering at least
// size x size threads (extra threads are rejected by the bounds guard).
__global__ void multiplyKernel(int *c, const int *a, const int *b, unsigned int size)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Fix: the guard must be >=, not >. Valid indices are 0..size-1, so a
    // thread with row == size or col == size is already out of bounds; the
    // original `>` let such threads read and write one row/column past the
    // buffers whenever the grid overshot the matrix, which triggers
    // "unspecified launch failure".
    if (row >= size || col >= size) return;

    int result = 0;
    for (int ind = 0; ind < size; ++ind) {
        // a[row*size + ind] walks along row `row` of a (same address for the
        // whole warp when row is warp-uniform); b[col + size*ind] walks down
        // column `col` of b — adjacent threads (adjacent col) touch adjacent
        // addresses, so these loads coalesce.
        result += a[row * size + ind] * b[col + size * ind];
    }

    // Flattened 1-D position of element (row, col) in the output.
    c[row * size + col] = result;
}

// Host driver: builds two random 0/1 matrices, multiplies them on the GPU,
// and resets the device before exiting (so profilers see a complete trace).
int main() {

    const int sizeMatrix = 2048;

    // Host-side buffers for the two operands and the product.
    int* a = new int[sizeMatrix * sizeMatrix];
    int* b = new int[sizeMatrix * sizeMatrix];
    int* c = new int[sizeMatrix * sizeMatrix];

    // Fill the inputs with 0/1 values (keeps products small enough
    // that int accumulation cannot overflow: max element is sizeMatrix).
    for (int i = 0; i < sizeMatrix * sizeMatrix; i++) {
        a[i] = rand() % 2;
        b[i] = rand() % 2;
    }

    int exitCode = 0;

    cudaError_t cudaStatus = multiplyWithCuda(c, a, b, sizeMatrix);
    if (cudaStatus != cudaSuccess) {
        // Fix: the original message said "addWithCuda" (copy-paste from the
        // SDK template); the function actually called is multiplyWithCuda.
        fprintf(stderr, "multiplyWithCuda failed!\n");
        exitCode = 1;
    } else {
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!\n");
            exitCode = 1;
        }
    }

    // Fix: the original leaked all three buffers on every path (early
    // returns skipped cleanup, and even the success path never freed them).
    delete[] a;
    delete[] b;
    delete[] c;

    return exitCode;
}


// Allocates device buffers, copies a and b to the GPU, launches
// multiplyKernel over a size x size problem, and copies the product back
// into c. Returns cudaSuccess, or the first CUDA error encountered.
//
// Preconditions: a, b, and c each point to size*size ints. Any size works:
// the grid is computed by ceiling division over the fixed 32x32 block, and
// the kernel's bounds guard discards the overhang threads.
cudaError_t multiplyWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    // Fix: start the device pointers at nullptr so the unconditional
    // cudaFree calls at Error: are safe even when we jump there before the
    // corresponding cudaMalloc ran (the originals were uninitialized, so an
    // early failure freed garbage pointers). cudaFree(nullptr) is a no-op.
    int *dev_a = nullptr;
    int *dev_b = nullptr;
    int *dev_c = nullptr;
    cudaError_t cudaStatus;

    // Hoist the byte count; size_t avoids 32-bit overflow for large sizes.
    const size_t bytes = (size_t)size * size * sizeof(int);

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    fprintf(stdout, "device set\n");
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        goto Error;
    }

    // Allocate GPU buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for c allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_a, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for a allocated \n");

    cudaStatus = cudaMalloc((void**)&dev_b, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }
    fprintf(stdout, "buffer for b allocated \n");

    // Copy the input matrices from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy a done \n");

    cudaStatus = cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }
    fprintf(stdout, "cudaMemcpy b done\n");

    fprintf(stdout, "about to launch kernel \n");

    // Launch one thread per output element. Fix: the original hard-coded a
    // dim3(64,64,1) grid, which is only correct for size == 2048; compute
    // the grid from size with ceiling division instead.
    {
        dim3 block(32, 32, 1);
        dim3 grid((size + block.x - 1) / block.x,
                  (size + block.y - 1) / block.y, 1);
        multiplyKernel<<<grid, block>>>(dev_c, dev_a, dev_b, size);
    }

    fprintf(stdout, "kernel launched\n");

    // Check for launch-configuration errors (bad grid/block, etc.).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        // Fix: the original message named "addKernel" (template leftover).
        fprintf(stderr, "multiplyKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish and surface any execution errors.
    // NOTE(review): on Windows/WDDM a debug-build kernel this large can
    // exceed the TDR watchdog timeout and report "unspecified launch
    // failure" — raise the TDR delay in Nsight Monitor if that happens.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching multiplyKernel!\n", cudaStatus);
        fprintf(stderr, " %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the product back from the GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!\n");
        goto Error;
    }

Error:
    // Single cleanup path; all pointers are either valid or nullptr.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}
+0

所以,你是否嘗試過這個類似的問題的解決方案? – Drop

+2

您可能會在Windows上遇到WDDM [TDR超時](http://http.developer.nvidia.com/NsightVisualStudio/2.2/Documentation/UserGuide/HTML/Content/Timeout_Detection_Recovery.htm)。您的代碼對我來說運行正常(即沒有運行時錯誤)。如果你將它作爲一個調試項目來構建(很可能,因爲你在調試中運行它),那麼內核需要更長的時間。 –

+0

是的,就是這樣。我將nsight監視器中的WDDM TDR延遲更新爲10秒,現在運行良好。非常感謝你,我永遠不會找到它。 –

回答