cudaThreadSynchronise（）返回的錯誤碼6

我試圖運行CUDA中cudaThreadSynchronise（）返回的錯誤碼6

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
/* a is the array that holds the values and c is the array used to store the maximum in a block */ 
cudaError_t reduce_max(int *a,int *c,int size); 

/*The kernel that performs the reduction */ 
__global__ void global_max(int *d_c, int * d_a) 
{ 
    int myId=threadIdx.x+blockDim.x*blockIdx.x; 
    int tid=threadIdx.x; 
    for(int s=(blockDim.x)/2; s>0; s>>1) 
    { 
     if(tid<s) 
     { 
      d_a[myId]=max(d_a[myId],d_a[myId+s]); 
     } 
     __syncthreads(); 
    } 
    if(tid==0) 
    { 
     d_c[blockIdx.x]=d_a[myId]; 
    } 
} 

int main() 
{ 
    const int arraySize = 1024; 
    int i; 
    int a[arraySize]; 
    for(i=0;i<arraySize;i++) 
    { 
     a[i]=i; 
    } 
    int c[arraySize]; 
    cudaError_t cudaStatus = reduce_max(a,c,arraySize); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "The required operation failed"); 
     return 1; 
    } 
    cudaStatus = cudaThreadExit(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaThreadExit failed!"); 
     return 1; 
    } 

    return 0; 
} 

// Helper function for using CUDA to add vectors in parallel. 
cudaError_t reduce_max(int *a,int *c,int size) 
{ 
    int *dev_a = 0; 
    int *dev_c = 0; 
    /* 
    dev_a and dev_c are the arrays on the device 
    */ 
    cudaError_t cudaStatus; 
    const dim3 blockSize(64,1,1); 
    const dim3 gridSize(size/blockSize.x,1,1); 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    cudaStatus = cudaSetDevice(0); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
     goto Error; 
    } 

    /*Allocating the memory on the device */ 
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    /*Copying array from host to device */ 
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    /*Calling the kernel */ 
    global_max<<<gridSize,blockSize>>>(dev_c, dev_a); 

    cudaStatus = cudaThreadSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaThreadSynchronize returned error code %d\n", cudaStatus); 
     goto Error; 
    } 

    // Copy output vector from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

Error: 
    cudaFree(dev_c); 
    cudaFree(dev_a); 
    return cudaStatus; 
}

使用並行還原上執行上面的代碼查找陣列的最大元素。然而代碼我得到錯誤：的cudaThreadSynchronize返回的錯誤代碼6.

我無法弄清楚問題所在。

來源

2017-01-23 akshita007

錯誤代碼6不表示很多。您可以通過傳遞錯誤代碼通過解析器（'cudaGetErrorString'）來獲得更多有用的信息，如[這裏]所示（http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to -check換錯誤 - 使用最CUDA的運行時API）。你也可以用'cuda-memcheck'運行你的代碼，這可能會提供更多有用的信息。 –

關於使用cudaGetErrorString我得到:(cudaThreadSynchronize返回的錯誤代碼啓動超時並被終止） – akshita007

您的代碼將永久運行。結果你正在超時。

這條線斷了，和你的編譯器應該拋出一個警告：

for(int s=(blockDim.x)/2; s>0; s>>1)

s>>1不修改s變量。我敢肯定你的意思是s>>=1，它修改s。在不修改s的情況下，您的循環會永久運行，並因此導致內核超時。

而是執行此操作：

for(int s=(blockDim.x)/2; s>0; s>>=1)

來源

2017-01-23 18:49:13

是的，這工作。謝謝很多！ – akshita007

cudaThreadSynchronise（）返回的錯誤碼6

回答

相關問題