I am trying to do a simple matrix multiplication on CUDA. I know arrays can be flattened for passing to the device. However, I am using cudaMallocPitch and cudaMemcpy2D to do the multiplication. While executing the code below, I get an "illegal memory access was encountered" error when I try to copy the result back onto the host. I would greatly appreciate advice on where I am going wrong. Thanks! The error is: CUDA_SAFE_CALL: an illegal memory access was encountered

weights - first matrix, dim: 30x784

input - second matrix, dim: 784x100

results_d - result on the device (GPU)

result - result copied back onto the host

#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void MatrixMulKernel(double *input, double *weights, double *results_d, size_t in_pitch, size_t w1_pitch, size_t result_pitch)
{
    int row = threadIdx.x;
    int col = threadIdx.y;
    double value;
    double *result_matrix;

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col));

    printf("%d", threadIdx);

    for (int i = 0; i < in_pitch; i++)
    {
        double *element1 = ((double*)((char*)input + row*in_pitch) + i);
        double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);

        value =+ (*element1) * (*element2);
    }

    *result_matrix = value;
}

int main()
{
    static double arr1[30][784];
    static double arr2[784][100];
    static double result[30][100];

    for (int i = 0; i < 30; i++)
    {
        for (int j = 0; j < 784; j++)
            arr1[i][j] = 5;
    }

    for (int i = 0; i < 784; i++)
    {
        for (int j = 0; j < 100; j++)
            arr2[i][j] = 3;
    }

    double *input;
    double *weights;
    double *results_d;

    size_t in_pitch, w1_pitch, result_pitch;

    //allocating memory in GPU for 2 inputs and result
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&input, &in_pitch, 100*sizeof(double), 784));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights, &w1_pitch, 784*sizeof(double), 30));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d, &result_pitch, 100*sizeof(double), 30));

    //Copy matrix from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(input, in_pitch, arr2, 100*sizeof(double), 100*sizeof(double), 784, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(weights, w1_pitch, arr1, 784*sizeof(double), 784*sizeof(double), 30, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(results_d, result_pitch, result, 100*sizeof(double), 100*sizeof(double), 30, cudaMemcpyHostToDevice));

    //using GPU
    dim3 dimGrid(1,1,1);
    dim3 dimBlock(32,32,1);
    printf("before kernel function");
    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch, w1_pitch, result_pitch);
    printf("after kernel function");
    cudaThreadSynchronize();

    //copying back to host
    CUDA_SAFE_CALL(cudaMemcpy2D(result, result_pitch, results_d, 100*sizeof(double), 100*sizeof(double), 30, cudaMemcpyDeviceToHost));

    //printing and seeing whether the result matrix has been updated
    for (int i = 0; i < 100; i++)
    {
        for (int j = 0; j < 30; j++)
        {
            printf("%f", result);
        }
        printf("\n");
    }

    CUDA_SAFE_CALL(cudaFree(input));
    CUDA_SAFE_CALL(cudaFree(weights));
    CUDA_SAFE_CALL(cudaFree(results_d));

    return 0;
}

Please format your code properly. It is completely unreadable. – talonmies


Please read the [documentation](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c) for 'cudaMallocPitch'. The 'pitch' value returned by that function is a value in **bytes**. You cannot sensibly use that as a loop index for matrix multiplication. Furthermore, the 'pitch' gives the width of the entire allocation; your loop variable should only traverse the defined row/column length (i.e. 784 in this case), which you do not appear to be passing to the kernel. –
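
As a side note, here is a minimal sketch (my illustration, not from the thread) of the addressing pattern that comment describes: the pitch is a byte stride between rows, so you step to a row through a char* cast and only then index columns in element units:

    // Hypothetical device helper: pitch is in BYTES, so row stepping
    // happens in char* arithmetic; column indexing is in elements.
    __device__ double *pitched_elem(double *base, size_t pitch_bytes, int row, int col)
    {
        return (double*)((char*)base + row * pitch_bytes) + col;
    }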

Answer


There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about fast matrix multiply performance, you should use CUBLAS.
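
For comparison, here is a minimal CUBLAS sketch of the same 30x784 by 784x100 product (my own illustration, not part of the original answer). cublasDgemm is column-major, so computing B*A in column-major order yields the row-major product A*B; compile with -lcublas:

    #include <cstdio>
    #include <cublas_v2.h>

    int main()
    {
        const int m = 30, k = 784, n = 100;
        static double A[m*k], B[k*n], C[m*n];   // row-major host arrays
        for (int i = 0; i < m*k; i++) A[i] = 5;
        for (int i = 0; i < k*n; i++) B[i] = 3;

        double *dA, *dB, *dC;
        cudaMalloc(&dA, m*k*sizeof(double));
        cudaMalloc(&dB, k*n*sizeof(double));
        cudaMalloc(&dC, m*n*sizeof(double));
        cudaMemcpy(dA, A, m*k*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dB, B, k*n*sizeof(double), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);
        const double alpha = 1.0, beta = 0.0;
        // column-major (n x m) = (n x k) * (k x m)  ==  row-major C = A*B
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
                    &alpha, dB, n, dA, k, &beta, dC, n);
        cublasDestroy(handle);

        cudaMemcpy(C, dC, m*n*sizeof(double), cudaMemcpyDeviceToHost);
        printf("C[0] = %f\n", C[0]);            // expect 5*3*784 = 11760
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
        return 0;
    }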

Issues:

  1. You don't seem to understand pitched allocations. The returned pitch value is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitched allocation; it does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.

  2. Your code does not do a matrix multiply over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing the matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.

  3. This construct for pitched access is not correct:

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col)); 
    

    It does not match the constructs you have for the 2 input matrices, and it has a misplaced close parenthesis.

  4. You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weights matrix, and vice versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.

  5. Your final cudaMemcpy2D operation has the pitch values reversed:

    cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
                        ^^^^^^^^^^^^           ^^^^^^^^^^^^^^^^^^
  6. You forgot to initialize your loop sum variable to zero:

    double value; 
    
  7. I'm not sure what you intended here; it should be += not =+:

    value =+ ... 
    
  8. The following code has these issues addressed, and it seems to run without error for me:

    $ cat t104.cu 
    #include <stdio.h> 
    #include <math.h> 
    #include <cstdio> 
    #include <cstdlib> 
    
    const int d1 = 30; 
    const int d2 = 784; 
    const int d3 = 100; 
    
    double arr1[d1][d2]; 
    double arr2[d2][d3]; 
    double result[d1][d3]; 
    
    
    #define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
    
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 
    { 
        if (code != cudaSuccess) 
        { 
         fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line); 
         if (abort) exit(code); 
        } 
    } 
    
    __global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol) 
    { 
        int col = threadIdx.x + blockDim.x*blockIdx.x; 
        int row= threadIdx.y + blockDim.y*blockIdx.y; 
    
        if ((row >= rrow) || (col >= rcol)) return; 
    
        double value = 0; 
        double *result_matrix; 
    
        result_matrix = ((double*)((char*)results_d + row*result_pitch) + col); 
    
        for(int i =0 ; i < dim ; i++) 
    
        { 
    
        double *element1 = ((double*)((char*)input + i*in_pitch) + col) ; 
        double *element2 = ((double*)((char*)weights + row*w1_pitch) + i); 
    
        value += (*element1) * (*element2); 
    
        } 
    
        *result_matrix = value; 
    
    } 
    
    
    
    
    
    int main() 
    { 
    
    
        for (int i = 0 ; i < d1; i++) 
    
        { 
        for(int j =0;j <d2 ; j ++) 
         arr1[i][j] = 5; 
    
        } 
    
        for (int i =0 ; i < d2; i ++) 
        { 
    
        for(int j=0;j < d3 ; j++) 
         arr2[i][j] = 3; 
    
        } 
    
    
    
        double *input; 
        double *weights; 
        double *results_d; 
    
        size_t in_pitch,w1_pitch,result_pitch; 
    
    
    
    //allocating memory in GPU for 2 inputs and result 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2)); 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1)); 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1)); 
    
    //Copy matrix from host to device 
        CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice)); 
        CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice)); 
        CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice)); 
    
    
    //using GPU 
    
    
        dim3 dimBlock(32,32,1); 
        dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1); 
        MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3); 
    
    //copying back to host 
        CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost)); 
    
    
    //printing and seeing whether the result matrix has been updated 
        for (int i =0 ; i < d3; i ++) 
        { 
    
        for(int j=0;j < d1 ; j++) 
        { 
         printf("%f", result[j][i]); 
    
        } 
        printf("\n"); 
        } 
    
        CUDA_SAFE_CALL(cudaFree(input)); 
        CUDA_SAFE_CALL(cudaFree(weights)); 
        CUDA_SAFE_CALL(cudaFree(results_d)); 
    
    
        return 0; 
    } 
    $ nvcc -arch=sm_61 -o t104 t104.cu 
    $
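
One small addition worth considering (my note, not part of the original answer): the kernel launch itself is not error-checked above. The macro already defined in the code can be reused right after the launch:

    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch, w1_pitch, result_pitch, d2, d1, d3);
    CUDA_SAFE_CALL(cudaGetLastError());      // reports invalid launch configurations
    CUDA_SAFE_CALL(cudaDeviceSynchronize()); // reports errors raised while the kernel ran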