2014-09-28

I have 200 matrices A[i] (each of dimension 4096*48) and 48 vectors v[j] (each of dimension 48*1). I want to compute A[i]*v[j] for i = 0:199 and j = 0:47. How should I arrange the grid size and block size?

I have been thinking about how to arrange my grid size and block size since yesterday, but I still don't have an answer. Can anyone give me some advice?

The maximum number of threads per block is 512. That is my working environment.

Below is my code. It works correctly (I have checked), but it is slower than Matlab :(

#include <iostream> 
#include <cstdio> 
#include <cstdlib> 
#include <ctime> 
#include <cuda_runtime.h> 

using std::cout; 
using std::endl; 

#define kernel_size 48 

//////////////////////////////////////////// 

typedef struct { 
    int width; 
    int height; 
    int stride; 
    float* elements; 
} Matrix; 



// Forward declaration of the matrix multiplication kernel 
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix); 
// Matrix multiplication - Host code 
// Matrix dimensions are assumed to be multiples of kernel_size 
void MatMul(const Matrix A, const Matrix B, Matrix C) 
{ 
    // Load A and B to device memory 
    Matrix d_A; 
    d_A.width = d_A.stride = A.width; d_A.height = A.height; 
    size_t size = A.width * A.height * sizeof(float); 
    cudaMalloc(&d_A.elements, size); 
    cudaMemcpy(d_A.elements, A.elements, size, 
     cudaMemcpyHostToDevice); 
    Matrix d_B; 
    d_B.width = d_B.stride = B.width; d_B.height = B.height; 
    size = B.width * B.height * sizeof(float); 
    cudaMalloc(&d_B.elements, size); 
    cudaMemcpy(d_B.elements, B.elements, size, 
     cudaMemcpyHostToDevice); 
    // Allocate C in device memory 
    Matrix d_C; 
    d_C.width = d_C.stride = C.width; d_C.height = C.height; 
    size = C.width * C.height * sizeof(float); 
    cudaMalloc(&d_C.elements, size); 
    // Invoke kernel 
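    // (grid: one block per element of C, A.height x C.width blocks; 
    // block: 1 x 48 threads, one per element of the dot product) 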
    dim3 dimBlock(1,B.height); 
    dim3 dimGrid(A.height, C.width); 
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C); 
    // Read C from device memory 
    cudaMemcpy(C.elements, d_C.elements, size, 
     cudaMemcpyDeviceToHost); 
    // Free device memory 
    cudaFree(d_A.elements); 
    cudaFree(d_B.elements); 
    cudaFree(d_C.elements); 
} 
// Matrix multiplication kernel called by MatMul() 
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C) 
{ 
    // Block row and column 
    int blockCol = blockIdx.y; 
    int blockRow = blockIdx.x; 

    float Cvalue = 0; 
    // Thread row and column within Csub 
    int row = threadIdx.y; 
    int col = threadIdx.x; 
    // Each block computes one element of C: the dot product of one 
    // 48-element chunk of a row of A with the vector B 


    // Shared memory used to store Asub and Bsub respectively 
    __shared__ float As[1][kernel_size]; 
    __shared__ float Bs[kernel_size][1]; 
    // Load Asub and Bsub from device memory to shared memory 
    // Each thread loads one element of each sub-matrix 


    As[0][row] = A.elements[blockRow * A.stride + row+B.height*blockCol]; 
    Bs[row][0] = B.elements[row]; 
    // Synchronize to make sure the sub-matrices are loaded 
    // before starting the computation 
    __syncthreads(); 
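    // Note: with blockDim = (1, 48), every thread in this block computes 
    // the identical 48-element dot product below and writes the same 
    // Cvalue, so 47 of the 48 threads do redundant work 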
    // Multiply Asub and Bsub together 
    for (int e = 0; e < B.height; ++e) 
    { 
     Cvalue += As[0][e] * Bs[e][0]; 

    } 
    // Synchronize to make sure the computation is done before the 
    // result is written (there is no next iteration here, so this 
    // barrier is not strictly required) 
    __syncthreads(); 

    // Write Csub to device memory 
    // Each thread writes one element 
    C.elements[blockRow * C.stride +blockCol]= Cvalue; 
} 

////////////////// 





float * gen_matrix(int n /*row*/, int m /*col*/){ 

    float *A; 
    //srand(1023); 
    A = (float *) malloc(n*m*sizeof(float)); 

    for(int row = 0;row < n;row++) 
     for(int col = 0;col < m;col++) { 
      A[row*m+col] = rand()%10; 
     } 

     /* 
     // print matrix elements 
     for (int i = 0; i < n; ++i) { 
      for (int j = 0; j < m; ++j) 
       cout << " [" << i << "," << j << "] " << A[i*m+j]; 
      cout << endl; 
     } 
     */ 
     return A; 
} 



int main() 
{ 
    int k=kernel_size; 
    int s=2000; 
    int m =4096; 
    //int m=2; 
    //int s=1; 
    int n = k*s; 
    float *Ae = gen_matrix(m,n); 
    float *Be = gen_matrix(k,1); 
    float *Ce=(float *) malloc(m*s*sizeof(float)); 

    Matrix A ={n,m,n,Ae}; 
    Matrix B ={1,k,1,Be}; 
    Matrix C ={s,m,s,Ce}; 

    const clock_t begin_time = clock(); 
    MatMul(A, B, C); 
    std::cout << float(clock() - begin_time)/CLOCKS_PER_SEC << " s" << endl; 

    for (int i = 0; i < 3; ++i) { 
     for (int j = 0; j < 7; ++j) 
      cout << " [" << i << "," << j << "] " << Ce[i*s+j]; 
     cout << endl; 
    } 


    //check 
    float *Ce2=(float *) malloc(s*m*sizeof(float)); 
    for (int i = 0; i < m; i++) 
    { 
     for (int j = 0; j < s; j++) 
     { 
      Ce2[i*s+j]=0; 
     } 
    } 
    for (int i = 0; i < m; i++) 
    { 
     for (int j = 0; j < s; j++) 
     { 
      for (int ind = 0; ind < k; ind++) 
      { 
       Ce2[i*s+j]=Ce2[i*s+j]+Ae[j*k+ind+i*k*s]*Be[ind]; 
      // printf("%f---****%f\n",Ae[j*k+ind+i*k*s],Be[ind]); 
      } 
      if (Ce2[i*s+j]!= Ce[i*s+j]) 
      { 
       printf("%f----%f\n",Ce2[i*s+j],Ce[i*s+j]); 
      } 

     } 

    } 





    free(Ae); 
    free(Be); 
    free(Ce); 
    free(Ce2); 
    return 0; 
} 

Forget the Matrix structure and think about how to arrange your data in a single one-dimensional array. Once you do that, the grid and block sizes in this case look almost arbitrary. – 2014-09-29 20:40:40
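For illustration, here is a minimal sketch of one way to read that suggestion (my interpretation; the kernel name and launch parameters are placeholders, not from the comment). With the layout in the question, the 48 inputs of each output element are already contiguous (chunk j of row i starts at offset (i*2000 + j)*48), so a flat 1-D kernel with one thread per output element suffices, and the block size becomes an almost arbitrary tuning choice:

__global__ void flat_matvec(const float *A, const float *v, float *C, int n_out) 
{ 
    // one thread per output element; n_out = 4096 * 2000 
    int idx = blockIdx.x * blockDim.x + threadIdx.x; 
    if (idx >= n_out) return; 
    float acc = 0.0f; 
    for (int e = 0; e < 48; ++e)   // one 48-element dot product per output 
     acc += A[(size_t)idx * 48 + e] * v[e]; 
    C[idx] = acc; 
} 

// launch with any reasonable block size, e.g.: 
// flat_matvec<<<(n_out + 255) / 256, 256>>>(d_A, d_v, d_C, n_out); 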

Answer


This is just a matrix-matrix multiplication problem. If you want it to run fast, you should not write your own matrix multiplication code. Use CUBLAS SGEMM.

Conceptually, if you arrange your A matrices like this:

[A0] 
[A1] 
[A2] 
... 
[A199] 

then you will have a new matrix AA that is (4096*200) rows by 48 columns.

Arrange your 48 V vectors (each 48x1) in a single 48x48 matrix VV:

[V0][V1][V2]...[V47] 

(each V vector becomes one column of the new matrix VV)

You now have a single matrix multiplication problem (AA * VV): (4096*200)x48 times 48x48, giving a (4096*200)x48 result. Each column of this result is a vector of length 4096*200 that contains 200 of the individual matrix-vector results you were trying to compute. The 200 results per column, times 48 columns, combine to give you all the results the original problem would have produced. The first column will contain the results of [V0] multiplied by each of the 200 A matrices, the second column will contain the results of [V1] multiplied by each of the 200 A matrices, and so on.
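As a sketch of the packing step (the names pack_for_sgemm, h_A, and h_v are placeholders of mine, not from your code), assuming each A is a row-major 4096x48 block and each v is 48 floats; AA and VV are written column-major, which is what CUBLAS expects:

#include <cstdlib> 

void pack_for_sgemm(float *h_A[200], float *h_v[48], float **AA_out, float **VV_out) 
{ 
    const int M = 4096, K = 48, NUM_A = 200; 
    float *AA = (float *) malloc((size_t)NUM_A * M * K * sizeof(float)); 
    float *VV = (float *) malloc((size_t)K * K * sizeof(float)); 

    // Ai occupies rows i*M .. i*M + M - 1 of AA (column-major, ld = NUM_A*M) 
    for (int i = 0; i < NUM_A; ++i) 
     for (int r = 0; r < M; ++r) 
      for (int c = 0; c < K; ++c) 
       AA[(size_t)c * NUM_A * M + (size_t)i * M + r] = h_A[i][r * K + c]; 

    // v_j becomes column j of VV 
    for (int j = 0; j < K; ++j) 
     for (int r = 0; r < K; ++r) 
      VV[(size_t)j * K + r] = h_v[j][r]; 

    *AA_out = AA; 
    *VV_out = VV; 
} 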

Once you have arranged your data this way, using CUBLAS Sgemm should be the fastest approach on the GPU. Note that CUBLAS expects the underlying storage to be column-major, so you may want to keep that in mind if you are rearranging your data. There is a CUDA sample code for CUBLAS matrix multiplication.
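A minimal sketch of the single Sgemm call for this arrangement (assuming d_AA, d_VV, and d_C are device buffers already filled with the column-major data from above):

#include <cublas_v2.h> 

// C = AA * VV in one call: ((4096*200) x 48) * (48 x 48) -> (4096*200) x 48 
void stacked_sgemm(const float *d_AA, const float *d_VV, float *d_C) 
{ 
    const int M = 4096 * 200;   // rows of AA (all A matrices stacked) 
    const int N = 48;           // columns of VV (one per v vector) 
    const int K = 48;           // columns of AA == rows of VV 
    const float alpha = 1.0f, beta = 0.0f; 

    cublasHandle_t handle; 
    cublasCreate(&handle); 
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, 
                M, N, K, 
                &alpha, d_AA, M,   // lda = M 
                d_VV, K,           // ldb = K 
                &beta, d_C, M);    // ldc = M 
    cublasDestroy(handle); 
} 

Column j of d_C then holds, stacked, the 200 results A[i]*v[j].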

In your code it looks like you actually have 2000 A matrices, but your question refers to 200. I used 200 in my answer, but the concept is the same with 2000 A matrices.