1
#include <iostream>
#include <assert.h>
#include <sys/time.h>
#define BLOCK_SIZE 32 // CUDA block size
__device__ inline int getValFromMatrix(int* matrix, int row, int col,int matSize) {
if (row<matSize && col<matSize) {return matrix[row*matSize + col];}
return 0;
}
__device__ inline int getValFromVector(int* vector, int row, int matSize) {
if (row<matSize) {return vector[row];}
return 0;
}
__global__ void matVecMultCUDAKernel(int* aOnGPU, int* bOnGPU, int* cOnGPU, int matSize) {
__shared__ int aRowShared[BLOCK_SIZE];
__shared__ int bShared[BLOCK_SIZE];
__shared__ int myRow;
__shared__ double rowSum;
int myIndexInBlock = threadIdx.x;
myRow = blockIdx.x;
rowSum = 0;
for (int m = 0; m < (matSize/BLOCK_SIZE + 1);m++) {
aRowShared[myIndexInBlock] = getValFromMatrix(aOnGPU,myRow,m*BLOCK_SIZE+myIndexInBlock,matSize);
bShared[myIndexInBlock] = getValFromVector(bOnGPU,m*BLOCK_SIZE+myIndexInBlock,matSize);
__syncthreads(); // Sync threads to make sure all fields have been written by all threads in the block to cShared and xShared
if (myIndexInBlock==0) {
for (int k=0;k<BLOCK_SIZE;k++) {
rowSum += aRowShared[k] * bShared[k];
}
}
}
if (myIndexInBlock==0) {cOnGPU[myRow] = rowSum;}
}
static inline void cudaCheckReturn(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr <<"CUDA Runtime Error: " << cudaGetErrorString(result) << std::endl;
assert(result == cudaSuccess);
}
}
static void matVecMultCUDA(int* aOnGPU,int* bOnGPU, int* cOnGPU, int* c, int sizeOfc, int matSize) {
matVecMultCUDAKernel<<<matSize,BLOCK_SIZE>>>(aOnGPU,bOnGPU,cOnGPU,matSize); // Launch 1 block per row
cudaCheckReturn(cudaMemcpy(c,cOnGPU,sizeOfc,cudaMemcpyDeviceToHost));
}
static void matVecMult(int** A,int* b, int* c, int matSize) {
// Sequential implementation:
for (int i=0;i<matSize;i++) {
c[i]=0;
for (int j=0;j<matSize;j++) {
c[i]+=(A[i][j] * b[j]);
}
}
}
int main() {
int matSize = 1000;
int** A,* b,* c;
int* aOnGPU,* bOnGPU,* cOnGPU;
A = new int*[matSize];
for (int i = 0; i < matSize;i++) {A[i] = new int[matSize]();}
b = new int[matSize]();
c = new int[matSize]();
int aSizeOnGPU = matSize * matSize * sizeof(int), bcSizeOnGPU = matSize * sizeof(int);
cudaCheckReturn(cudaMalloc(&aOnGPU,aSizeOnGPU)); // cudaMallocPitch?
cudaCheckReturn(cudaMalloc(&bOnGPU,bcSizeOnGPU));
cudaCheckReturn(cudaMalloc(&cOnGPU,bcSizeOnGPU));
srand(time(NULL));
for (int i=0;i<matSize;i++) {
b[i] = rand()%100;
for (int j=0;j<matSize;j++) {
A[i][j] = rand()%100;
}
}
for (int i=0;i<matSize;i++) {cudaCheckReturn(cudaMemcpy((aOnGPU+i*matSize),A[i],bcSizeOnGPU,cudaMemcpyHostToDevice));}
cudaCheckReturn(cudaMemcpy(bOnGPU,b,bcSizeOnGPU,cudaMemcpyHostToDevice));
int iters=1;
timeval start,end;
// Sequential run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMult(A,b,c,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
// CUDA run:
gettimeofday(&start,NULL);
for (int i=0;i<iters;i++) {matVecMultCUDA(aOnGPU,bOnGPU,cOnGPU,c,bcSizeOnGPU,matSize);}
gettimeofday(&end,NULL);
std::cout << (end.tv_sec*1000000 + end.tv_usec) - (start.tv_sec*1000000 + start.tv_usec) << std::endl;
cudaCheckReturn(cudaFree(aOnGPU));
cudaCheckReturn(cudaFree(bOnGPU));
cudaCheckReturn(cudaFree(cOnGPU));
for (int i = 0; i < matSize; ++i) {
delete[] A[i];
}
delete[] A;
delete[] b;
delete[] c;
}
267171
580253
我跟着指南http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory,如何做一個矩陣乘法。我對矩陣(A)和向量(B)都使用共享內存,但無論選擇什麼樣的矩陣大小(100 * 100-20000 * 20000)或塊大小(32-1024),順序執行總是優於在CUDA實施速度方面,速度大約快兩倍。
由於我使用矩陣*向量乘法,共享數組和塊的處理方式有點不同;我在矩陣的每一行使用一個塊,而不是在矩陣的一部分上使用二維塊。
是我的執行錯誤,或者簡直是CUDA不比CPU快?