我的GPU是Quadro K1000M(移動計算能力3.0)。我與CUDA版本的工作8.

#include <iostream> 

// nvcc -ccbin g++ -g -m64 -gencode arch=compute_30,code=sm_30 -o Bug Bug.cu 

// Helper Functions Decl 
void allocateDeviceMemory(void* devPtr , unsigned size , int lineNumber); 
void copyDataToHost(void* hostPtr , void* devPtr , unsigned size , int lineNumber); 
void copyDataToDevice(void* devPtr , void* hostPtr , unsigned size , int lineNumber); 
void initializeDeviceMemory(void* devPtr , unsigned size , unsigned initValue , int lineNumber); 

__global__ void myKernel(const ushort* __restrict__ dataPtr , const ushort* __restrict__ proxyId , bool* unique , unsigned size , const ushort dim) 
    int N = threadIdx.x + (blockIdx.x * blockDim.x); 

    if(N < size - 1) 
     unsigned offset; 
     ushort countPtr = 0; 
     ushort id1 = proxyId[N]; 
     ushort id2 = proxyId[N + 1]; 

     for(ushort i = 0; i < dim; ++i) 
      if(dataPtr[offset + id1] == dataPtr[offset + id2]) ++countPtr; 
      offset += size; 

     unique[N + 1] = (countPtr != dim); // No crash if commented out 

int main(int argc, char** argv) 
    ushort dim = 2; 
    static const unsigned SIZE = 10; 

    ushort h_proxyId[SIZE] = { 6 , 3 , 1 , 0 , 7 , 4 , 2 , 8 , 5 , 9 }; 

    ushort h_dataPtr[ 2 * SIZE] = { 1 , 1 , 2 , 1 , 2 , 3 , 1 , 2 , 3 , 4 , 
            4 , 3 , 3 , 2 , 2 , 2 , 1 , 1 , 1 , 1 }; 

    ushort* d_proxyId = 0; 
    ushort* d_dataPtr = 0; 

    bool* d_unique = 0; 
    bool* h_unique = new bool[SIZE]; 

    allocateDeviceMemory(&d_unique , SIZE , __LINE__);  
    allocateDeviceMemory(&d_proxyId , SIZE * sizeof(ushort) , __LINE__); 
    allocateDeviceMemory(&d_dataPtr , SIZE * sizeof(ushort) * 2 , __LINE__); 

    copyDataToDevice(d_proxyId , h_proxyId , SIZE * sizeof(ushort) , __LINE__); 
    copyDataToDevice(d_dataPtr , h_dataPtr , SIZE * sizeof(ushort) * 2 , __LINE__); 

    initializeDeviceMemory(d_unique , SIZE , 1 , __LINE__); 

    myKernel<<<1,SIZE>>>(d_dataPtr , d_proxyId , d_unique , SIZE , dim); // No crash if commented out 

    copyDataToHost(h_unique , d_unique , SIZE , __LINE__); // Crashes here 

    return 0; 

// Helper Functions Impl 
void allocateDeviceMemory(void* devPtr , unsigned size , int lineNumber) 
    cudaError_t error = cudaMalloc((void**) devPtr , size); 
    if(error != cudaSuccess) 
     std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to allocate device memory] " << cudaGetErrorString(error) << std::endl; 

void initializeDeviceMemory(void* devPtr , unsigned size , unsigned initValue , int lineNumber ) 
    cudaError_t error = cudaMemset(devPtr , initValue , size); 
    if(error != cudaSuccess) 
     std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to initialize device memory to default value] " << cudaGetErrorString(error) << std::endl; 

void copyDataToHost(void* hostPtr , void* devPtr , unsigned size , int lineNumber ) 
    cudaError_t error = cudaMemcpy(hostPtr , devPtr , size , cudaMemcpyDeviceToHost); 
    if(error != cudaSuccess) 
     std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy device data to host memory] " << cudaGetErrorString(error) << std::endl; 

void copyDataToDevice(void* devPtr , void* hostPtr , unsigned size , int lineNumber ) 
    cudaError_t error = cudaMemcpy(devPtr , hostPtr , size , cudaMemcpyHostToDevice); 
    if(error != cudaSuccess) 
     std::cout << "[Line " << lineNumber << " -- Error " << error << " : Unable to copy host data to device memory] " << cudaGetErrorString(error) << std::endl; 


ThinkPad-W530:~/tmp/CUDA/Prototype$ cuda-memcheck ./Bug 
========= CUDA-MEMCHECK 
[Line 59 -- Error 4 : Unable to copy device data to host memory] unspecified launch failure 
========= Internal error (7) 
========= No CUDA-MEMCHECK results found 

你永遠不會將'offset'初始化爲內核中已知的值。 –


@RobertCrovella你完全正確,謝謝。請張貼您的評論作爲答案,我會標記它。 – Olumide


https://stackoverflow.com/q/10305715/681865 – talonmies



它實際上可能不是cudaMemcpy錯誤,但而不是你的內核啓動。 Cuda錯誤是持久的,你從內核啓動而不是內存拷貝中獲得了一個潛在的錯誤。你可以在內核之後運行cudaGetLastError()來驗證。

