CUDA非法內存訪問

我試圖讓這個代碼與3D類型結構一起工作。我正在使用Cuda的2D功能。所以主機端線性數據（'board'）的大小是width * height * depth，而2D mallocs是width x height * depth（這裏的寬度和高度都是DIMxDIM元素）。內核從A的數據以B.我得到一個非法的內存訪問錯誤（使用存儲器校驗），在該行CUDA非法內存訪問

dst[offset] = curr;

錯誤消失，如果我改變的malloc身高* 2，但大小似乎匹配。我錯過了什麼？其他批評也是受歡迎的，我對C++和CUDA都是新手。

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <iostream> 
#include <stdio.h> 
#include <stdlib.h> 

typedef signed int sint; 
typedef unsigned int uint; 

#define DIM 512 
#define TPB 32 // Threads per block 

#define CLEARANCE 5 
#define MAPLAYERS 2 
#define WIDTH (sizeof(sint) * DIM) 
#define HEIGHT (DIM * MAPLAYERS) 

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest); 
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index); 
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch); 
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff); 
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch); 

dim3 blocks(DIM/TPB, DIM/TPB, MAPLAYERS); 
dim3 threads(TPB, TPB); 

/** CUDA Error Check */ 
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     int tmp; 
     std::cin >> tmp; 
     exit(code); 
    } 
} 

int main(void) { 

    sint *A; 
    sint *B; 
    size_t pitchA, pitchB; 
    sint *board = new sint[WIDTH*HEIGHT]; 

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT)); 
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT)); 
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT)); 
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT)); 

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1); 

    CER(cudaFree(A)); 
    CER(cudaFree(B)); 
    delete[] board; 
} 

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) { 
    unsigned long *dev_index; 
    unsigned long index = NULL; 

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long))); 
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice)); 

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice)); 
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice)); 

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index); 
    CER(cudaPeekAtLastError()); 
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost)); 
    if (index != NULL) { 
     // break condition 
    } 

} 

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) { 
    unsigned int x = threadIdx.x + blockIdx.x * blockDim.x; 
    unsigned int y = threadIdx.y + blockIdx.y * blockDim.y; 
    unsigned int z = blockIdx.z + blockIdx.z * blockDim.z; 
    unsigned long long offset = calcOffset(x, y, z, 0, 0, 0, pitchDst); 

    sint curr; 

    if (!inBounds(x, y, z, 0, 0, 0)) 
     return; 

    curr = src[calcOffset(x, y, z, 0, 0, 0, pitchSrc)]; 
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) { 
     // Processing 
    } 
    else 
     dst[offset] = 1; 

    return; 
} 

/** Finds linear offset for a given pixel and offset. */ 
__device__ inline long long calcOffset(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) { 
    return (x + xoff) + (y + yoff) * pitch + ((z + zoff) * pitch * (HEIGHT/MAPLAYERS)); 
} 


/** Checks if position is valid on the map. */ 
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) { 
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS) 
     return false; 
    return true; 
} 


/** Returns true if a block has clearnace */ 
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) { 
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) { 
     for (int r = -CLEARANCE; r <= CLEARANCE; r++){ 
      if (inBounds(x, y, z, r, c, 0)){ 
       if (src[calcOffset(x, y, z, r, c, 0, pitch)] == 2 || src[calcOffset(x, y, z, r, c, 0, pitch)] == 1) 
        return false; 
      } 
      else { 
       return false; 
      } 
     } 
    } 
    return true; 
}

的CUDA調試器的輸出：

Memory Checker detected 384 access violations. 
error = access violation on load (global memory) 
gridid = 18 
blockIdx = {0,8,0} 
threadIdx = {0,4,0} 
address = 0x05d08000 
accessSize = 4

來源

2015-04-21 rgadra

請提供其他人可以編譯的代碼。如果您不確定這是什麼，請將這個問題中的代碼複製到一個全新的項目中，並且繼續修復所有編譯錯誤，直到沒有任何編譯錯誤。然後確保代碼演示您詢問的訪問衝突。然後將該固定代碼粘貼回問題中。 –

這看起來不正確：

sint *board = new sint[WIDTH*HEIGHT];

我想你的意思是這樣的：

sint *board = new sint[DIM*HEIGHT];

這並未看起來不錯：

unsigned int z = blockIdx.z + blockIdx.z * blockDim.z;

我想你的意思是這樣的：

unsigned int z = threadIdx.z + blockIdx.z * blockDim.z;

但問題的關鍵是，你正在使用的間距值的算術被計數指標爲（這是計算行寬的字節） sint陣列。當您以這種方式計算指數時，您需要通過sizeof(sint)縮放您的音高值。即使這不是相當正確。正確的做法是將指針轉換爲unsigned char指針，按照行間距（即字節）進行算術運算，然後將行開始指針從unsigned char重新投射到sint，然後通過(x+xoff)從那裏索引。實際上，這意味着您的calcOffset例程需要重寫，並且需要接受基礎指針作爲參數，並返回一個指針。

所以這個代碼有那些變化：

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <iostream> 
#include <stdio.h> 
#include <stdlib.h> 

typedef signed int sint; 
typedef unsigned int uint; 

#define DIM 512 
#define TPB 32 // Threads per block 

#define CLEARANCE 5 
#define MAPLAYERS 2 
#define WIDTH (sizeof(sint) * DIM) 
#define HEIGHT (DIM * MAPLAYERS) 

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest); 
__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index); 
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch); 
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff); 
__device__ inline sint * calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch); 

dim3 blocks(DIM/TPB, DIM/TPB, MAPLAYERS); 
dim3 threads(TPB, TPB); 

/** CUDA Error Check */ 
#define CER(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     int tmp; 
     std::cin >> tmp; 
     exit(code); 
    } 
} 

int main(void) { 

    sint *A; 
    sint *B; 
    size_t pitchA, pitchB; 
    sint *board = new sint[DIM*HEIGHT]; 

    CER(cudaMallocPitch(&A, &pitchA, WIDTH, HEIGHT)); 
    CER(cudaMallocPitch(&B, &pitchB, WIDTH, HEIGHT)); 
    CER(cudaMemset2D(A, pitchA, 0, WIDTH, HEIGHT)); 
    CER(cudaMemset2D(B, pitchA, 0, WIDTH, HEIGHT)); 

    route(A, pitchA, B, pitchB, board, 0, DIM*DIM - 1); 

    CER(cudaFree(A)); 
    CER(cudaFree(B)); 
    delete[] board; 
} 

void route(sint *A, size_t &pitchA, sint *B, size_t &pitchB, sint *board, int src, int dest) { 
    unsigned long *dev_index; 
    unsigned long index = 0; 

    CER(cudaMalloc((void**)&dev_index, sizeof(unsigned long))); 
    CER(cudaMemcpy(dev_index, &index, sizeof(unsigned long), cudaMemcpyHostToDevice)); 

    CER(cudaMemcpy2D(A, pitchA, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice)); 
    CER(cudaMemcpy2D(B, pitchB, board, WIDTH, WIDTH, HEIGHT, cudaMemcpyHostToDevice)); 

    map << <blocks, threads >> >(B, pitchB, A, pitchA, dev_index); 
    CER(cudaPeekAtLastError()); 
    CER(cudaMemcpy(&index, dev_index, sizeof(unsigned long), cudaMemcpyDeviceToHost)); 
    if (index != 0) { 
     // break condition 
    } 

} 

__global__ void map(sint *src, size_t pitchSrc, sint *dst, size_t pitchDst, unsigned long *index) { 
    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y; 
    int z = threadIdx.z + blockIdx.z * blockDim.z; 
    sint *dst_offset = calcOffset(dst, x, y, z, 0, 0, 0, pitchDst); 

    sint curr; 

    if (!inBounds(x, y, z, 0, 0, 0)) 
     return; 

    curr = *calcOffset(src, x, y, z, 0, 0, 0, pitchSrc); 
    if (z % 2 == 0 && curr == 0 && hasClearance(src, x, y, z, pitchSrc)) { 
     // Processing 
    } 
    else 
     *dst_offset = 1; 

    return; 
} 

/** Finds linear offset for a given pixel and offset. */ 
__device__ sint* calcOffset(sint *ptr, sint x, sint y, sint z, sint xoff, sint yoff, sint zoff, size_t pitch) { 
    unsigned char *my_ptr = reinterpret_cast<unsigned char *>(ptr); 
    return (x + xoff) + reinterpret_cast<sint *>(my_ptr + (((y + yoff) * pitch) + ((z + zoff) * pitch * (HEIGHT/MAPLAYERS)))); 
} 


/** Checks if position is valid on the map. */ 
__device__ bool inBounds(sint x, sint y, sint z, sint xoff, sint yoff, sint zoff) { 
    if (0 > (x + xoff) || (x + xoff) >= DIM || 0 > (y + yoff) || (y + yoff) >= DIM || 0 > (z + zoff) || (z + zoff) >= MAPLAYERS) 
     return false; 
    return true; 
} 


/** Returns true if a block has clearnace */ 
__device__ bool hasClearance(sint* src, sint x, sint y, sint z, size_t pitch) { 
    for (int c = -CLEARANCE; c <= CLEARANCE; c++) { 
     for (int r = -CLEARANCE; r <= CLEARANCE; r++){ 
      if (inBounds(x, y, z, r, c, 0)){ 
       if ((*calcOffset(src, x, y, z, r, c, 0, pitch) == 2) || (*calcOffset(src, x, y, z, r, c, 0, pitch)) == 1) 
        return false; 
      } 
      else { 
       return false; 
      } 
     } 
    } 
    return true; 
}

在未來，你可能希望得到您的代碼中使用非音調分配工作。一旦你有事情的工作，你可以看到增加傾斜分配給你任何性能優勢。

它也發生，我認爲即使是這樣，如果(x+xoff)爲負將無法正常工作（或者如果(x+xoff)原因索引到下一個行）。您不能以這種方式在一個傾斜的分配中從一行向後索引到前一行（或到下一行）。首先需要將(x+xoff)解析爲引用的實際行，然後在該行中開發一個索引，然後針對該行進行傾斜計算。

來源

2015-04-21 04:41:06

CUDA非法內存訪問

回答

相關問題