2013-11-09 109 views
0

我是 CUDA 編程新手。在我的程序中(使用共享內存的矩陣乘法),我定義了 BLOCK_SIZE = 20。當矩陣爲 1200 * 1200 時,程序在元素類型爲 double 時可以正常運行,但在元素類型爲 float 時卻無法運行(元素爲 float 時,840 * 840 的矩陣可以正常運行)。我的問題是:既然 float 類型比 double 小,爲什麼會發生這種情況?在 CUDA 編程中,float 和 double 類型有什麼區別?

// Matrices are stored in row-major order: 
// M(row, col) = *(M.elements + row * M.stride + col) 
#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 20 
// Row-major matrix descriptor shared by host and device code.
// Element (row, col) lives at elements[row * stride + col].
typedef struct Matrix {
    int width;        // number of columns
    int height;       // number of rows
    int stride;       // distance, in elements, between the starts of consecutive rows
    float* elements;  // pointer to the first element of the (sub-)matrix
} Matrix;
// Read the element at (row, col) of A, honouring A's row stride.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return *(A.elements + offset);
}
// Store value at (row, col) of A, honouring A's row stride.
// A is passed by value, but its elements pointer still refers to the
// caller's storage, so the write is visible through the original matrix.
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    float* const slot = A.elements + (row * A.stride + col);
    *slot = value;
}
// Build a descriptor for the BLOCK_SIZE x BLOCK_SIZE tile of A whose
// upper-left corner sits `row` tiles down and `col` tiles to the right of
// A's own upper-left corner. The tile aliases A's storage (no copy is made)
// and keeps A's stride so that row indexing inside the tile stays valid.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    const int firstElement = A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col;

    Matrix tile;
    tile.elements = A.elements + firstElement;
    tile.stride   = A.stride;
    tile.height   = BLOCK_SIZE;
    tile.width    = BLOCK_SIZE;
    return tile;
}
// Forward declaration of the matrix multiplication kernel, so that the host
// wrapper MatMul() can launch it before the kernel's definition appears.
// The kernel is launched with BLOCK_SIZE x BLOCK_SIZE thread blocks.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix); 
// Report a failed CUDA runtime call on stderr.
// Returns 1 when `status` is cudaSuccess, 0 otherwise.
static int cudaCallSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return 1;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return 0;
}

// Matrix multiplication - Host code: computes C = A * B on the device.
//
// A, B : host matrices to multiply (A.width must equal B.height).
// C    : host matrix receiving the product (A.height x B.width); its
//        elements buffer must already be allocated by the caller.
//
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE.
// The host `stride` fields are not read; device copies are densely packed
// (stride == width).
//
// Every CUDA call is checked. On failure a diagnostic is written to stderr,
// the remaining steps are skipped, device memory is released, and C is left
// unmodified.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    if (A.width != B.height || C.height != A.height || C.width != B.width) {
        fprintf(stderr, "MatMul: incompatible matrix dimensions\n");
        return;
    }

    // Device-side descriptors; pointers start as NULL so the cleanup below
    // is valid no matter where a failure occurs.
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    d_A.elements = NULL;
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    d_B.elements = NULL;
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    d_C.elements = NULL;

    // Widen to size_t before multiplying so the byte count is not computed
    // in (overflow-prone) int arithmetic.
    const size_t sizeA = (size_t)A.width * (size_t)A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * (size_t)B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * (size_t)C.height * sizeof(float);

    // Load A and B to device memory, allocate C in device memory.
    int ok =
        cudaCallSucceeded(cudaMalloc((void **)&d_A.elements, sizeA),
                          "cudaMalloc(A)") &&
        cudaCallSucceeded(cudaMemcpy(d_A.elements, A.elements, sizeA,
                                     cudaMemcpyHostToDevice),
                          "cudaMemcpy(A, host to device)") &&
        cudaCallSucceeded(cudaMalloc((void **)&d_B.elements, sizeB),
                          "cudaMalloc(B)") &&
        cudaCallSucceeded(cudaMemcpy(d_B.elements, B.elements, sizeB,
                                     cudaMemcpyHostToDevice),
                          "cudaMemcpy(B, host to device)") &&
        cudaCallSucceeded(cudaMalloc((void **)&d_C.elements, sizeC),
                          "cudaMalloc(C)");

    if (ok) {
        // Invoke kernel: one BLOCK_SIZE x BLOCK_SIZE thread block per tile of C.
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid((B.width + dimBlock.x - 1) / dimBlock.x,
                     (A.height + dimBlock.y - 1) / dimBlock.y);
        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

        // A launch returns no status itself: configuration errors show up in
        // cudaGetLastError(), while faults during execution (for example the
        // kernel being killed by the display watchdog) are reported by the
        // blocking device-to-host copy that follows.
        ok = cudaCallSucceeded(cudaGetLastError(), "MatMulKernel launch") &&
             cudaCallSucceeded(cudaMemcpy(C.elements, d_C.elements, sizeC,
                                          cudaMemcpyDeviceToHost),
                               "cudaMemcpy(C, device to host)");
    }

    // Free device memory (cudaFree on a NULL pointer performs no operation).
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul(): C = A * B.
//
// Launch layout: 2-D thread blocks of exactly BLOCK_SIZE x BLOCK_SIZE
// threads, in a 2-D grid with at least ceil(C.width / BLOCK_SIZE) blocks in x
// and ceil(C.height / BLOCK_SIZE) blocks in y. Each block produces one
// BLOCK_SIZE x BLOCK_SIZE tile of C; each thread produces one element.
// Shared memory: two BLOCK_SIZE x BLOCK_SIZE float tiles (static).
//
// Matrix dimensions need not be multiples of BLOCK_SIZE: loads outside A or
// B contribute zeros to the sum and stores outside C are skipped, so the
// rounded-up grid used by the host wrapper never touches memory out of
// bounds.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Shared memory used to store the current tiles of A and B respectively
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Thread row and column within the tile
    const int row = threadIdx.y;
    const int col = threadIdx.x;

    // Row and column of the element of C this thread is responsible for
    const int globalRow = blockIdx.y * BLOCK_SIZE + row;
    const int globalCol = blockIdx.x * BLOCK_SIZE + col;

    // Each thread accumulates one element of C into Cvalue
    float Cvalue = 0.0f;

    // Number of tiles along the shared dimension, rounded up so that a
    // trailing partial tile is not dropped.
    const int numTiles = (A.width + BLOCK_SIZE - 1) / BLOCK_SIZE;

    for (int m = 0; m < numTiles; ++m) {
        // Each thread loads one element of each tile; positions outside the
        // matrices are filled with zero so they do not affect the sum.
        const int aCol = m * BLOCK_SIZE + col;
        const int bRow = m * BLOCK_SIZE + row;
        As[row][col] = (globalRow < A.height && aCol < A.width)
                           ? GetElement(A, globalRow, aCol)
                           : 0.0f;
        Bs[row][col] = (bRow < B.height && globalCol < B.width)
                           ? GetElement(B, bRow, globalCol)
                           : 0.0f;

        // Synchronize to make sure the tiles are fully loaded before any
        // thread starts the computation. All threads reach this barrier:
        // the bounds checks above select values, they do not skip the loop.
        __syncthreads();

        // Multiply the two tiles together
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[row][e] * Bs[e][col];

        // Synchronize to make sure the preceding computation is done before
        // the tiles are overwritten in the next iteration
        __syncthreads();
    }

    // Each in-range thread writes its element of C to device memory
    if (globalRow < C.height && globalCol < C.width)
        SetElement(C, globalRow, globalCol, Cvalue);
}
// print_matrix: write a row-major matrix to stdout.
//   c   - pointer to row * col floats, stored row after row
//   row - number of rows to print
//   col - number of values per row
// Each value is printed with "%f " and each row is followed by a blank line.
void print_matrix(float *c,int row,int col){
    for (int r = 0; r < row; ++r) {
        const float *line = c + col * r;
        for (int k = 0; k < col; ++k)
            printf("%f ", line[k]);
        printf("\n\n");
    }
}
// random_init: fill the first `size` entries of `a` with pseudo-random
// whole numbers in the range 0..9, drawn from rand() in ascending index order.
// A non-positive `size` leaves the array untouched.
void random_init(float *a,int size){
    int idx = 0;
    while (idx < size) {
        const int digit = rand() % 10;
        a[idx] = (float)digit;
        ++idx;
    }
}
// Program entry point: multiplies two 1200 x 1200 all-ones matrices on the
// GPU via MatMul(), reports the elapsed time measured with CUDA events and
// prints three sample elements of the result.
// Returns 0 on success, 1 if host memory could not be allocated.
int main(void){

    cudaEvent_t start,stop;

    Matrix A,B,C;
    A.width=1200;
    A.height=1200;
    B.width=1200;
    B.height=1200;
    C.width=B.width;
    C.height=A.height;
    // The structs are passed by value, so every field must hold a
    // determinate value; host matrices are densely packed.
    A.stride=A.width;
    B.stride=B.width;
    C.stride=C.width;

    const size_t countA = (size_t)A.width * (size_t)A.height;
    const size_t countB = (size_t)B.width * (size_t)B.height;
    const size_t countC = (size_t)C.width * (size_t)C.height;

    A.elements = (float *)malloc(countA * sizeof(float));
    //random_init(A.elements,A.width * A.height);
    B.elements = (float *)malloc(countB * sizeof(float));
    //random_init(B.elements,B.width * B.height);
    // Zero-initialised so that the sample values printed below are
    // well-defined even if the multiplication does not write C.
    C.elements = (float *)calloc(countC, sizeof(float));
    if (A.elements == NULL || B.elements == NULL || C.elements == NULL) {
        fprintf(stderr, "Failed to allocate host matrices\n");
        free(A.elements);
        free(B.elements);
        free(C.elements);
        return 1;
    }

    for(int i=0;i<A.width*A.height;i++)
        A.elements[i]=1.0f;
    for(int i=0;i<B.width*B.height;i++)
        B.elements[i]=1.0f;
    printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.width,A.height,B.width,
           B.height,C.width,C.height);

    // Time the whole MatMul() call (allocation, transfers and kernel).
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start,0);

    MatMul(A,B,C);

    cudaEventRecord(stop,0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime,start,stop);
    printf("Time to genreat : %3.5f ms\n",elapsedTime);

    printf("\nC\n");
    //print_matrix(C.elements,C.height,C.width);

    // First element, last element of the first row, and last element overall.
    printf("C[%d]=%f\n",0,C.elements[0]);
    printf("C[%d]=%f\n",C.width -1,C.elements[C.width-1]);
    printf("C[%d]=%f\n",(C.width * C.height)-1,C.elements[(C.width * C.height)-1]);

    // Release the timing events and the host buffers.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(A.elements);
    free(B.elements);
    free(C.elements);

    // Keep the console window open until a key is pressed.
    getchar();
    return(0);
}
+0

也許問題出在「sizeof(float) 與 sizeof(cl_float)」的差異上:float 可能是 64 位,而 cl_float 的長度必須是 32 位。特別是當你的 CPU、操作系統和編譯器都是 64 位的時候。 –

+0

我的 CPU 和操作系統都是 64 位的,顯卡是 NVIDIA GeForce GT 555M,使用 CUDA 4.2,編譯器是 Visual Studio 2010,但它不能識別 cl_float。我需要包含什麼特殊的庫嗎? –

+3

我編譯並運行了你的代碼,使用 float 和 double 都得到了相同的結果。我用的是 CUDA 5.0 和 Tesla K20 卡。 –

回答

2

以下消息:

「顯示驅動程序已停止響應,並且已恢復」

表明你遇到了 Windows TDR(超時檢測與恢復)事件。

在windows下,花費太長時間執行的內核將導致windows顯示監視程序計時器重置顯示設備,這將導致CUDA代碼執行被終止。需要超過2秒鐘才能執行的內核可能會遇到這種情況。

如果你搜索「Windows TDR」,可以找到更多說明以及解決這個問題的可能方法。你也可以調查一下,爲什麼你的代碼在做了這個改動之後需要更長的執行時間。

相關問題