I am trying to do a simple matrix multiplication on CUDA. I know arrays can be flattened for passing to the device. However, I am using cudaMallocPitch and cudaMemcpy2D to do the multiplication. While executing the code below, I get an "illegal memory access was encountered" error when I try to copy the result back onto the host. I would greatly appreciate advice on where I am going wrong. Thanks! The error is: CUDA_SAFE_CALL: an illegal memory access was encountered

weights - first matrix, dim: 30x784

input - second matrix, dim: 784x100

results_d - result on the device (GPU)

result - result copied back onto the host

#include <stdio.h>
#include <math.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

__global__ void MatrixMulKernel(double *input, double *weights, double *results_d, size_t in_pitch, size_t w1_pitch, size_t result_pitch)
{
    int row = threadIdx.x;
    int col = threadIdx.y;
    double value;
    double *result_matrix;

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col));

    printf("%d", threadIdx);

    for (int i = 0; i < in_pitch; i++)
    {
        double *element1 = ((double*)((char*)input + row*in_pitch) + i);
        double *element2 = ((double*)((char*)weights + i*w1_pitch) + col);

        value =+ (*element1) * (*element2);
    }

    *result_matrix = value;
}

int main()
{
    static double arr1[30][784];
    static double arr2[784][100];
    static double result[30][100];

    for (int i = 0; i < 30; i++)
    {
        for (int j = 0; j < 784; j++)
            arr1[i][j] = 5;
    }

    for (int i = 0; i < 784; i++)
    {
        for (int j = 0; j < 100; j++)
            arr2[i][j] = 3;
    }

    double *input;
    double *weights;
    double *results_d;

    size_t in_pitch, w1_pitch, result_pitch;

    //allocating memory in GPU for 2 inputs and result
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&input, &in_pitch, 100*sizeof(double), 784));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights, &w1_pitch, 784*sizeof(double), 30));
    CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d, &result_pitch, 100*sizeof(double), 30));

    //Copy matrix from host to device
    CUDA_SAFE_CALL(cudaMemcpy2D(input, in_pitch, arr2, 100*sizeof(double), 100*sizeof(double), 784, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(weights, w1_pitch, arr1, 784*sizeof(double), 784*sizeof(double), 30, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy2D(results_d, result_pitch, result, 100*sizeof(double), 100*sizeof(double), 30, cudaMemcpyHostToDevice));

    //using GPU
    dim3 dimGrid(1,1,1);
    dim3 dimBlock(32,32,1);
    printf("before kernel function");
    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch, w1_pitch, result_pitch);
    printf("after kernel function");
    cudaThreadSynchronize();

    //copying back to host
    CUDA_SAFE_CALL(cudaMemcpy2D(result, result_pitch, results_d, 100*sizeof(double), 100*sizeof(double), 30, cudaMemcpyDeviceToHost));

    //printing and seeing whether the result matrix has been updated
    for (int i = 0; i < 100; i++)
    {
        for (int j = 0; j < 30; j++)
        {
            printf("%f", result);
        }
        printf("\n");
    }

    CUDA_SAFE_CALL(cudaFree(input));
    CUDA_SAFE_CALL(cudaFree(weights));
    CUDA_SAFE_CALL(cudaFree(results_d));

    return 0;
}

Please format your code properly. It is completely unreadable. – talonmies


Please read the [documentation](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY_1g32bd7a39135594788a542ae72217775c) for 'cudaMallocPitch'. The 'pitch' value returned by that function is a value in **bytes**. You cannot sensibly use that as a loop index for matrix multiplication. Furthermore, the 'pitch' gives the width of the entire allocation; your loop variable should only traverse the defined row/column length (i.e. 784 in this case), which you do not appear to be passing to the kernel. –
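
As a side note, here is a minimal sketch (my illustration, not from the thread) of the addressing pattern that comment describes: the pitch is a byte stride between rows, so you step to a row through a char* cast and only then index columns in element units:

    // Hypothetical device helper: pitch is in BYTES, so row stepping
    // happens in char* arithmetic; column indexing is in elements.
    __device__ double *pitched_elem(double *base, size_t pitch_bytes, int row, int col)
    {
        return (double*)((char*)base + row * pitch_bytes) + col;
    }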

Answer


There are a number of errors in this code. First of all, it's not clear that doing pitched allocations is going to give any benefit here. Second, if you're serious about fast matrix multiply performance, you should use CUBLAS.
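
For comparison, here is a minimal CUBLAS sketch of the same 30x784 by 784x100 product (my own illustration, not part of the original answer). cublasDgemm is column-major, so computing B*A in column-major order yields the row-major product A*B; compile with -lcublas:

    #include <cstdio>
    #include <cublas_v2.h>

    int main()
    {
        const int m = 30, k = 784, n = 100;
        static double A[m*k], B[k*n], C[m*n];   // row-major host arrays
        for (int i = 0; i < m*k; i++) A[i] = 5;
        for (int i = 0; i < k*n; i++) B[i] = 3;

        double *dA, *dB, *dC;
        cudaMalloc(&dA, m*k*sizeof(double));
        cudaMalloc(&dB, k*n*sizeof(double));
        cudaMalloc(&dC, m*n*sizeof(double));
        cudaMemcpy(dA, A, m*k*sizeof(double), cudaMemcpyHostToDevice);
        cudaMemcpy(dB, B, k*n*sizeof(double), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);
        const double alpha = 1.0, beta = 0.0;
        // column-major (n x m) = (n x k) * (k x m)  ==  row-major C = A*B
        cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k,
                    &alpha, dB, n, dA, k, &beta, dC, n);
        cublasDestroy(handle);

        cudaMemcpy(C, dC, m*n*sizeof(double), cudaMemcpyDeviceToHost);
        printf("C[0] = %f\n", C[0]);            // expect 5*3*784 = 11760
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
        return 0;
    }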

Issues:

  1. You don't seem to understand pitched allocations. The returned pitch value is a value in bytes. You cannot sensibly use that for a loop index for matrix multiply. Also, the pitch value is the overall width of the pitched allocation; it does not correspond to the valid data area. For that, you should use the appropriate matrix dimension.

  2. Your code does not do a matrix multiply over the entire matrix area. You are only creating a single block of 32x32 threads, but you need enough blocks/threads to cover the entire matrix area. This requires changes to your grid dimensions, passing the matrix dimensions to your kernel, as well as a "thread check" in your kernel to prevent out-of-bounds access.

  3. This construct for pitched access is not correct:

    result_matrix = ((double*)((char*)results_d + row*result_pitch + col)); 
    

    It does not match the constructs you have for the 2 input matrices, and it has a misplaced close parenthesis.

  4. You have the sense of your two input matrices reversed. You are indexing into the input matrix as if it were the weights matrix, and vice versa. We need to swap the sense of row, column and i to make these match the actual matrix dimensions.

  5. Your final cudaMemcpy2D operation has the pitch values reversed:

    cudaMemcpy2D(result,result_pitch,results_d,100*sizeof(double),100*sizeof(double),30,cudaMemcpyDeviceToHost)
                        ^^^^^^^^^^^^           ^^^^^^^^^^^^^^^^^^
  6. You forgot to initialize your loop sum variable to zero:

    double value; 
    
  7. I'm not sure what you intended here; it should be += not =+:

    value =+ ... 
    
  8. The following code has these issues addressed, and it seems to run without error for me:

    $ cat t104.cu 
    #include <stdio.h> 
    #include <math.h> 
    #include <cstdio> 
    #include <cstdlib> 
    
    const int d1 = 30; 
    const int d2 = 784; 
    const int d3 = 100; 
    
    double arr1[d1][d2]; 
    double arr2[d2][d3]; 
    double result[d1][d3]; 
    
    
    #define CUDA_SAFE_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
    
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 
    { 
        if (code != cudaSuccess) 
        { 
         fprintf(stderr,"CUDA_SAFE_CALL: %s %s %d\n", cudaGetErrorString(code), file, line); 
         if (abort) exit(code); 
        } 
    } 
    
    __global__ void MatrixMulKernel(double *input,double *weights,double *results_d,size_t in_pitch,size_t w1_pitch,size_t result_pitch, int dim, int rrow, int rcol) 
    { 
        int col = threadIdx.x + blockDim.x*blockIdx.x; 
        int row= threadIdx.y + blockDim.y*blockIdx.y; 
    
        if ((row >= rrow) || (col >= rcol)) return; 
    
        double value = 0; 
        double *result_matrix; 
    
        result_matrix = ((double*)((char*)results_d + row*result_pitch) + col); 
    
        for(int i =0 ; i < dim ; i++) 
    
        { 
    
        double *element1 = ((double*)((char*)input + i*in_pitch) + col) ; 
        double *element2 = ((double*)((char*)weights + row*w1_pitch) + i); 
    
        value += (*element1) * (*element2); 
    
        } 
    
        *result_matrix = value; 
    
    } 
    
    
    
    
    
    int main() 
    { 
    
    
        for (int i = 0 ; i < d1; i++) 
    
        { 
        for(int j =0;j <d2 ; j ++) 
         arr1[i][j] = 5; 
    
        } 
    
        for (int i =0 ; i < d2; i ++) 
        { 
    
        for(int j=0;j < d3 ; j++) 
         arr2[i][j] = 3; 
    
        } 
    
    
    
        double *input; 
        double *weights; 
        double *results_d; 
    
        size_t in_pitch,w1_pitch,result_pitch; 
    
    
    
    //allocating memory in GPU for 2 inputs and result 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&input,&in_pitch,d3*sizeof(double),d2)); 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&weights,&w1_pitch,d2*sizeof(double),d1)); 
        CUDA_SAFE_CALL(cudaMallocPitch((void**)&results_d,&result_pitch,d3*sizeof(double),d1)); 
    
    //Copy matrix from host to device 
        CUDA_SAFE_CALL(cudaMemcpy2D(input,in_pitch,arr2,d3*sizeof(double),d3*sizeof(double),d2,cudaMemcpyHostToDevice)); 
        CUDA_SAFE_CALL(cudaMemcpy2D(weights,w1_pitch,arr1,d2*sizeof(double),d2*sizeof(double),d1,cudaMemcpyHostToDevice)); 
        CUDA_SAFE_CALL(cudaMemcpy2D(results_d,result_pitch,result,d3*sizeof(double),d3*sizeof(double),d1,cudaMemcpyHostToDevice)); 
    
    
    //using GPU 
    
    
        dim3 dimBlock(32,32,1); 
        dim3 dimGrid(((d3+dimBlock.x-1)/dimBlock.x),((d1+dimBlock.y-1)/dimBlock.y),1); 
        MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights,results_d,in_pitch,w1_pitch,result_pitch, d2, d1, d3); 
    
    //copying back to host 
        CUDA_SAFE_CALL(cudaMemcpy2D(result,d3*sizeof(double),results_d,result_pitch,d3*sizeof(double),d1,cudaMemcpyDeviceToHost)); 
    
    
    //printing and seeing whether the result matrix has been updated 
        for (int i =0 ; i < d3; i ++) 
        { 
    
        for(int j=0;j < d1 ; j++) 
        { 
         printf("%f", result[j][i]); 
    
        } 
        printf("\n"); 
        } 
    
        CUDA_SAFE_CALL(cudaFree(input)); 
        CUDA_SAFE_CALL(cudaFree(weights)); 
        CUDA_SAFE_CALL(cudaFree(results_d)); 
    
    
        return 0; 
    } 
    $ nvcc -arch=sm_61 -o t104 t104.cu 
    $
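
One small addition worth considering (my note, not part of the original answer): the kernel launch itself is not error-checked above. The macro already defined in the code can be reused right after the launch:

    MatrixMulKernel<<<dimGrid, dimBlock>>>(input, weights, results_d, in_pitch, w1_pitch, result_pitch, d2, d1, d3);
    CUDA_SAFE_CALL(cudaGetLastError());      // reports invalid launch configurations
    CUDA_SAFE_CALL(cudaDeviceSynchronize()); // reports errors raised while the kernel ran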