我從the book「CUDA通過示例學習CUDA」。第4章中有一個生成Julia分形的演示。該演示展示了CPU和GPU版本。我決定爲兩種情況增加一段時間來查看執行速度,並且令我驚喜的是發現CPU版本比GPU執行速度快3倍。CUDA內核與Julia集CPU版本的性能下降
CPU朱生成總時間:
745毫秒。
GPU朱代總時間:
2456毫秒。
那麼是怎麼回事?很明顯,至少從CUDA內核代碼來看,執行是平行的,分佈在1000個塊中,每個塊計算1000x1000分辨率最終圖像的像素。
這裏是執行的源代碼:
#define N 10
#define DIM 1000
typedef unsigned char byte;
struct cuComplex {
float r;
float i;
__host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}
__host__ __device__ float magnitude2(void) {
return r * r + i * i;
}
__host__ __device__ cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__host__ __device__ cuComplex operator+(const cuComplex& a) {
return cuComplex(r+a.r, i+a.i);
}
};
__device__ int juliaGPU(int x , int y){
const float scale =1.3;
float jx = scale * (float)(DIM/2 -x)/(DIM/2);
float jy= scale *(float)(DIM/2 -y)/(DIM/2);
cuComplex c(-0.8 ,0.156);
cuComplex a(jx ,jy);
int i = 0;
for(i=0; i <200;i++){
a = a * a +c;
if(a.magnitude2() >1000){
return 0;
}
}
return 1;
}
__global__ void kernelGPU(byte *ptr){
int x = blockIdx.x;
int y = blockIdx.y;
int offset =x + y * gridDim.x;
int juliaValue =juliaGPU(x , y);
ptr[offset * 4 + 0]=255 * juliaValue;
ptr[offset * 4 + 1]=0;
ptr[offset * 4 + 2]=0;
ptr[offset * 4 + 3]=255 ;
}
struct DataBlock {
unsigned char *dev_bitmap;
};
void juliaGPUTestSample(){
DataBlock data;
CPUBitmap bitmap(DIM,DIM);
byte *dev_bitmap; //memory on GPU
HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap , bitmap.image_size()));
data.dev_bitmap =dev_bitmap;
dim3 grid(DIM,DIM);
int starTime=glutGet(GLUT_ELAPSED_TIME);
kernelGPU<<<grid ,1 >>>(dev_bitmap);
HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr() , dev_bitmap ,bitmap.image_size() ,cudaMemcpyDeviceToHost));
int endTime=glutGet(GLUT_ELAPSED_TIME)-starTime;
printf("Total time %d\n:" ,endTime);
HANDLE_ERROR(cudaFree(dev_bitmap));
bitmap.display_and_exit();
}
int main(void){
juliaGPUTestSample();
return 1;
}
這裏是CPU版本:
///的 「cuComplex」 結構是從上方一樣。
int julia (int x , int y){
const float scale = 1.3;
float jx = scale * (float)(DIM/2 -x)/(DIM/2);
float jy = scale * (float)(DIM/2 -y)/(DIM/2);
cuComplex c(-0.8 ,0.156);
cuComplex a(jx ,jy);
int i = 0;
for(i=0; i <200;i++){
a = a * a +c;
if(a.magnitude2() >1000){
return 0;
}
}
return 1;
}
void kernel(unsigned char *ptr){
for(int y = 0 ; y <DIM ;++y){
for(int x = 0 ; x <DIM ; ++x){
int offset =x + y * DIM;
int juliaValue = julia(x , y);
ptr[offset * 4 + 0 ] = juliaValue * 125;
ptr[offset * 4 + 1 ] = juliaValue * x;
ptr[offset * 4 + 2 ] = juliaValue * y;
ptr[offset * 4 + 3 ] = 255 ;
}
}
}
void juliaCPUTestSample(){
CPUBitmap bitmap(DIM ,DIM);
unsigned char *ptr = bitmap.get_ptr();
int starTime=glutGet(GLUT_ELAPSED_TIME);
kernel(ptr);
int endTime=glutGet(GLUT_ELAPSED_TIME)-starTime;
printf("Total time %d\n:" ,endTime);
bitmap.display_and_exit();
}
更新 - 系統配置:
64位Windows 7
CPU - 英特爾酷睿i7 -3770CPU 3.40GHz,16GB RAM
GPU - 的NVIDIA Quadro 4000
您沒有顯示CPU樣本?我們如何解釋? – 2013-02-17 10:35:56
你是對的!有一刻...... – 2013-02-17 10:39:03