我有200個矩陣A [i](其維數爲4096 * 48)和48個向量v [j](維數爲48 * 1)。我想計算A [i] * v [j],(i = 0:199,j = 1:47)。排列網格大小和塊大小
我想到如何安排我的網格大小和塊大小從昨天。但我現在不知道答案。任何人都可以給我一些建議嗎?
每塊的最大數量是512.這是我的工作環境。
以下是我的代碼。它的作品正確。我檢查過。但它比Matlab的慢:(
#include<iostream>
#include <mat.h>
#include <time.h>
#include <cuda_runtime.h>
#include "cuda.h"
using std::cout;
using std::endl;
using namespace cv;
using namespace std;
#include <limits>
#include <iostream>
#include <cstdlib>
using namespace std;
#define kernel_size 48
////////////////////////////////////////////
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc(&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(1,B.height);
dim3 dimGrid(A.height, C.width);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockCol = blockIdx.y;
int blockRow = blockIdx.x;
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[1][kernel_size];
__shared__ float Bs[kernel_size][1];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[0][row] = A.elements[blockRow * A.stride + row+B.height*blockCol];
Bs[row][0] = B.elements[row];
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < B.height; ++e)
{
Cvalue += As[0][e] * Bs[e][0];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// Write Csub to device memory
// Each thread writes one element
C.elements[blockRow * C.stride +blockCol]= Cvalue;
}
//////////////////
float * gen_matrix(int n /*row*/, int m /*col*/){
float *A;
//srand(1023);
A = (float *) malloc(n*m*sizeof(float));
for(int row = 0;row < n;row++)
for(int col = 0;col < m;col++) {
A[row*m+col] = rand()%10;
}
/*
// print matrix elements.
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j)
cout << " [" << i << "," << j << "] " << A[i*m+j] ;
cout << endl;
}
*/
return A;
}
int main()
{
int k=kernel_size;
int s=2000;
int m =4096;
//int m=2;
//int s=1;
int n = k*s;
float *Ae = gen_matrix(m,n);
float *Be= gen_matrix(k,1);00
float *Ce=(float *) malloc(m*s*sizeof(float));
Matrix A ={n,m,n,Ae};
Matrix B ={1,k,1,Be};
Matrix C ={s,m,s,Ce};
const clock_t begin_time = clock();
MatMul(A, B, C);
std::cout << float(clock() - begin_time)/CLOCKS_PER_SEC;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j <7; ++j)
cout << " [" << i << "," << j << "] " << Ce[i*m+j] ;
cout << endl;
}
//check
float *Ce2=(float *) malloc(s*m*sizeof(float));
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
Ce2[i*s+j]=0;
}
}
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
for (int ind = 0; ind < k; ind++)
{
Ce2[i*s+j]=Ce2[i*s+j]+Ae[j*k+ind+i*k*s]*Be[ind];
// printf("%f---****%f\n",Ae[j*k+ind+i*k*s],Be[ind]);
}
if (Ce2[i*s+j]!= Ce[i*s+j])
{
printf("%f----%f\n",Ce2[i*s+j],Ce[i*s+j]);
}
}
}
free(Ae);
free(Be);
free(Ce);
}
忘掉Matrix架構,並考慮如何將數據安排在單維數組中。一旦你這樣做,在這種情況下,網格和塊的大小看起來幾乎是任意的。 – 2014-09-29 20:40:40