這是一個順序的Mandelbrot集合實現。CUDA Mandelbrot集合
void mandelbrot(PGMData *I)
{
float x0,y0,x,y,xtemp;
int i,j;
int color;
int iter;
int MAX_ITER=1000;
for(i=0; i<I->height; i++)
for(j=0; j<I->width; j++)
{
x0 = (float)j/I->width*(float)3.5-(float)2.5;
y0 = (float)i/I->height*(float)2.0-(float)1.0;
x = 0;
y = 0;
iter = 0;
while((x*x-y*y <= 4) && (iter < MAX_ITER))
{
xtemp = x*x-y*y+x0;
y = 2*x*y+y0;
x = xtemp;
iter++;
}
color = (int)(iter/(float)MAX_ITER*(float)I->max_gray);
I->image[i*I->width+j] = I->max_gray-color;
}
}
我想用CUDA將其並列化,但我似乎誤解了一些東西,現在我被卡住了。我試過搜索互聯網,但沒有什麼真正的好消息。
內核:
__global__ void calc(int *pos)
{
int row= blockIdx.y * blockDim.y + threadIdx.y; // WIDTH
int col = blockIdx.x * blockDim.x + threadIdx.x; // HEIGHT
int idx = row * WIDTH + col;
if(col > WIDTH || row > HEIGHT || idx > N) return;
float x0 = (float)row/WIDTH*(float)3.5-(float)2.5;
float y0 = (float)col/HEIGHT*(float)2.0-(float)1.0;
int x = 0, y = 0, iter = 0, xtemp = 0;
while((x*x-y*y <= 4) && (iter < MAX_ITER))
{
xtemp = x*x-y*y+x0;
y = 2*x*y+y0;
x = xtemp;
iter++;
}
int color = 255 - (int)(iter/(float)MAX_ITER*(float)255);
__syncthreads();
pos[idx] = color;//color;// - color;
}
內核啓動是這樣的:
dim3 block_size(16, 16);
dim3 grid_size((N)/block_size.x, (int) N/block_size.y);
calc<<<grid_size,block_size>>>(d_pgmData);
這裏是常量:
#define HEIGHT 512
#define WIDTH 512
#define N (HEIGHT*WIDTH)
整個GPU功能
void mandelbrotGPU(PGMData *I)
{
int *pos = (int *)malloc(HEIGHT*WIDTH*sizeof(int));
int *d_pgmData;
cudaMalloc((void **)&d_pgmData, sizeof(int)*WIDTH*HEIGHT);
cudaMemcpy(d_pgmData, pos ,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyHostToDevice);
dim3 block_size(16, 16);
dim3 grid_size((N)/block_size.x, (int) N/block_size.y);
calc<<<grid_size,block_size>>>(d_pgmData);
cudaMemcpy(pos,d_pgmData,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyDeviceToHost);
cudaFree(d_pgmData);
I->image = pos;
}
問題是:它要麼返回垃圾或驅動程序崩潰。我真的很感謝一些建議,因爲我很困難。
嗨,你可以發佈一下你用來構建項目的命令/工具嗎?我真的很感激。謝謝! – user3009269