CUDA Mandelbrot集合

這是一個順序（單線程）的Mandelbrot集合實現：

/*
 * Renders the Mandelbrot set into I->image as a grayscale picture,
 * one pixel at a time on the CPU.
 *
 * Pixel (j, i) is mapped to the complex point c = (x0, y0) with
 * x0 = j/width * 3.5 - 2.5 and y0 = i/height * 2.0 - 1.0. Starting from
 * z = 0, z = z^2 + c is iterated until |z|^2 = x*x + y*y exceeds 4 or the
 * iteration cap is reached. The iteration count is scaled to
 * [0, I->max_gray] and inverted, so points that never escape come out as 0.
 *
 * I: image descriptor. Reads I->width, I->height and I->max_gray; writes
 *    I->image[i * I->width + j] for every pixel.
 *    NOTE(review): assumes I->image already holds width * height entries;
 *    confirm against the caller.
 */
void mandelbrot(PGMData *I)
{
    const int max_iter = 1000;
    int i, j;

    for (i = 0; i < I->height; i++)
    {
        /* The imaginary part depends only on the row, so compute it once. */
        const float y0 = (float)i / I->height * 2.0f - 1.0f;

        for (j = 0; j < I->width; j++)
        {
            const float x0 = (float)j / I->width * 3.5f - 2.5f;
            float x = 0.0f;
            float y = 0.0f;
            int iter = 0;
            int color;

            /* Escape test is |z|^2 <= 4, i.e. x*x + y*y (a sum, not a difference). */
            while ((x * x + y * y <= 4.0f) && (iter < max_iter))
            {
                const float xtemp = x * x - y * y + x0;
                y = 2 * x * y + y0;
                x = xtemp;
                iter++;
            }

            color = (int)(iter / (float)max_iter * (float)I->max_gray);
            I->image[i * I->width + j] = I->max_gray - color;
        }
    }
}

我想用CUDA將其並行化，但我似乎誤解了一些東西，現在我被卡住了。我試過在互聯網上搜索，但沒有找到真正有用的資料。

內核：

// Kernel as posted in the question: each thread computes one Mandelbrot
// pixel and stores a gray value into pos[row * WIDTH + col].
// It is kept verbatim because the answers on this page refer to it; the
// NOTE(review) comments mark the defects visible in the code.
// NOTE(review): MAX_ITER is not among the constants shown with this snippet.
__global__ void calc(int *pos) 
{ 
    int row= blockIdx.y * blockDim.y + threadIdx.y; // row index, built from the y launch dimension
    int col = blockIdx.x * blockDim.x + threadIdx.x; // column index, built from the x launch dimension
    int idx = row * WIDTH + col; 

    // NOTE(review): '>' lets col == WIDTH, row == HEIGHT and idx == N through;
    // if pos holds N elements, idx == N is one past the end. '>=' would reject them.
    if(col > WIDTH || row > HEIGHT || idx > N) return; 

    // NOTE(review): row feeds x0 and col feeds y0 here, the opposite of the
    // sequential version (which derives x0 from the column and y0 from the row).
    float x0 = (float)row/WIDTH*(float)3.5-(float)2.5; 
    float y0 = (float)col/HEIGHT*(float)2.0-(float)1.0; 

    // NOTE(review): x, y and xtemp are declared int, so every float result
    // assigned to them inside the loop is truncated to an integer.
    int x = 0, y = 0, iter = 0, xtemp = 0; 
    // NOTE(review): the loop tests x*x - y*y; the squared magnitude of z is x*x + y*y.
    while((x*x-y*y <= 4) && (iter < MAX_ITER)) 
    { 
     xtemp = x*x-y*y+x0; 
     y = 2*x*y+y0; 
     x = xtemp; 
     iter++; 
    } 
    // Scale the iteration count to 0..255 and invert it.
    int color = 255 - (int)(iter/(float)MAX_ITER*(float)255); 
    // NOTE(review): threads that took the early return above never reach this
    // barrier, and nothing in this kernel shares data between threads.
    __syncthreads(); 
    pos[idx] = color;//color;// - color; 

}

內核啓動是這樣的：

// Launch configuration as posted in the question: 16x16 threads per block.
// NOTE(review): both grid dimensions are computed from N = HEIGHT*WIDTH, so
// this requests N/16 blocks along each axis (N threads per axis) rather than
// WIDTH/16 by HEIGHT/16 blocks.
dim3 block_size(16, 16); 
dim3 grid_size((N)/block_size.x, (int) N/block_size.y); 
calc<<<grid_size,block_size>>>(d_pgmData);

這裏是常量：

// Image dimensions in pixels.
#define HEIGHT 512 
#define WIDTH 512 
// Total number of pixels in the image.
#define N (HEIGHT*WIDTH)

整個GPU功能

// Host wrapper as posted in the question: allocates a host and a device
// buffer of HEIGHT*WIDTH ints, launches calc, copies the result back and
// stores the host buffer in I->image.
// It is kept verbatim because the answers on this page refer to it.
// NOTE(review): none of the CUDA calls below has its return value checked.
void mandelbrotGPU(PGMData *I) 
{ 
    int *pos = (int *)malloc(HEIGHT*WIDTH*sizeof(int)); 
    int *d_pgmData; 

    cudaMalloc((void **)&d_pgmData, sizeof(int)*WIDTH*HEIGHT); 


    // NOTE(review): pos has just been malloc'ed and never written, so this
    // uploads uninitialized host memory to the device.
    cudaMemcpy(d_pgmData, pos ,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyHostToDevice); 

    // NOTE(review): both grid dimensions are computed from N = HEIGHT*WIDTH,
    // giving N/16 blocks along each axis instead of WIDTH/16 by HEIGHT/16.
    dim3 block_size(16, 16); 
    dim3 grid_size((N)/block_size.x, (int) N/block_size.y); 
    calc<<<grid_size,block_size>>>(d_pgmData); 

    cudaMemcpy(pos,d_pgmData,HEIGHT*WIDTH*sizeof(int) ,cudaMemcpyDeviceToHost); 
    cudaFree(d_pgmData); 
    // Ownership of pos passes to the image; whatever I->image pointed to
    // before is not freed in this function.
    I->image = pos; 
}

問題是：它要麼返回垃圾數據，要麼導致驅動程序崩潰。我已經被卡住很久了，非常感謝任何建議。

來源

2013-11-25 Andro

下面是您的代碼的工作版本（使用OpenCV）：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>

using namespace cv; 
using namespace std; 

// Image dimensions in pixels.
#define HEIGHT 512 // must be multiple of block_size.y 
#define WIDTH 512 // must be multiple of block_size.x 
// Upper bound on the number of iterations calc performs per pixel.
#define MAX_ITER 10000 

// Host side: runs the kernel and copies WIDTH * HEIGHT bytes into the given buffer.
void mandelbrotGPU(char*); 
// Device side: writes one byte per pixel into image_buffer.
__global__ void calc(char* image_buffer); 

// Wraps a CUDA runtime call and terminates the program if it did not return
// cudaSuccess. The do/while(0) form makes the macro behave as one statement,
// so it is safe inside an unbraced if/else.
#define cudaAssertSuccess(ans)                                  \
    do {                                                        \
        _cudaAssertSuccess((ans), __FILE__, __LINE__);          \
    } while (0)

// Checks a CUDA status code; on failure prints the CUDA error string together
// with the source location to stderr and exits with the error code.
//   code: status returned by a CUDA runtime call
//   file: source file of the call site (const, because __FILE__ is a string literal)
//   line: source line of the call site
inline void _cudaAssertSuccess(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "_cudaAssertSuccess: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Program entry point: creates an 8-bit single-channel image of WIDTH x HEIGHT,
// lets the GPU fill it with the Mandelbrot picture, and shows it in a window
// titled "GPU" until waitKey(0) returns.
// NOTE(review): mandelbrotGPU copies WIDTH * HEIGHT contiguous bytes into
// imageData; this presumes the image rows carry no padding. Confirm.
int main(int argc, char** argv)
{
    const CvSize frame_size = cvSize(WIDTH, HEIGHT);
    IplImage* canvas = cvCreateImage(frame_size, IPL_DEPTH_8U, 1);

    // Render directly into the image's pixel buffer.
    mandelbrotGPU(canvas->imageData);

    cvShowImage("GPU", canvas);
    waitKey(0);

    cvReleaseImage(&canvas);
    return 0;
}

// Computes the Mandelbrot image on the GPU and copies it to the host.
//
//   image_buffer: host destination; receives WIDTH * HEIGHT bytes, one per
//                 pixel, in row-major order (index = row * WIDTH + col).
//
// A temporary device buffer is allocated, filled by the calc kernel, copied
// back and released. Any CUDA failure terminates the program through
// cudaAssertSuccess.
void mandelbrotGPU(char* image_buffer)
{
    const size_t image_bytes = (size_t)WIDTH * HEIGHT;

    char* d_image_buffer = 0;
    cudaAssertSuccess(cudaMalloc(&d_image_buffer, image_bytes));

    // Round the grid up so every pixel is covered even when WIDTH or HEIGHT
    // is not a multiple of the block size; calc discards the surplus threads
    // with its own bounds check.
    const dim3 block_size(16, 16);
    const dim3 grid_size((WIDTH + block_size.x - 1) / block_size.x,
                         (HEIGHT + block_size.y - 1) / block_size.y);

    calc<<<grid_size, block_size>>>(d_image_buffer);
    cudaAssertSuccess(cudaPeekAtLastError());   // launch-configuration errors
    cudaAssertSuccess(cudaDeviceSynchronize()); // errors raised while the kernel ran

    cudaAssertSuccess(cudaMemcpy(image_buffer, d_image_buffer, image_bytes, cudaMemcpyDeviceToHost));
    cudaAssertSuccess(cudaFree(d_image_buffer));
}

// Mandelbrot kernel: each thread shades one pixel of the WIDTH x HEIGHT image.
//
// Expects a 2D launch in which the x dimension spans columns and the y
// dimension spans rows; threads that fall outside the image return at once.
//
//   image_buffer: output, one byte per pixel, written at row * WIDTH + col.
__global__ void calc(char* image_buffer)
{
    const int px = blockIdx.x * blockDim.x + threadIdx.x;  // column
    const int py = blockIdx.y * blockDim.y + threadIdx.y;  // row

    // Surplus threads past the right or bottom edge have nothing to do.
    if (px >= WIDTH || py >= HEIGHT) {
        return;
    }

    // Map the pixel to the point c = (x0, y0) of the complex plane.
    const float x0 = ((float)px / WIDTH) * 3.5f - 2.5f;
    const float y0 = ((float)py / HEIGHT) * 3.5f - 1.75f;

    // Iterate z = z^2 + c from z = 0 until |z|^2 > 4 or the cap is reached.
    float x = 0.0f;
    float y = 0.0f;
    int iter;
    for (iter = 0; (x * x + y * y <= 4.0f) && (iter < MAX_ITER); iter++) {
        const float next_x = x * x - y * y + x0;
        y = 2.0f * x * y + y0;
        x = next_x;
    }

    // Five gray levels per iteration; anything that would exceed 255 is drawn as 0.
    const int shade = iter * 5;
    image_buffer[py * WIDTH + px] = (shade >= 256) ? 0 : shade;
}

輸出：

Mandelbrot output

最重要的變化：

刪除__syncthreads();。該算法不使用其他線程生成的數據，因此不需要同步線程。
刪除將主機緩衝區複製到設備。這是沒有必要的，因爲Mandelbrot算法寫入整個設備緩衝區。
修復了錯誤的網格大小計算。
刪除了主機內存的malloc，因爲結果直接複製到OpenCV映像緩衝區中。
將緩衝區更改爲使用字節而不是整數，當您擁有8位分辨率的單個灰色通道時更方便。
刪除了一些不必要的float強制轉換。當您將整數與浮點數一起使用時，整數會自動提升爲浮點數。
修復了Mandelbrot算法中的兩個問題：
- x和y被聲明爲int，而它們應該是float。
- while循環中的第一個表達式應包含+而不是-。

來源

2013-11-26 04:43:23

嗨，你可以發佈一下你用來構建項目的命令/工具嗎？我真的很感激。謝謝！ – user3009269

只是一些想法來看待：

沒有必要__syncthreads()。你的線程不能相互通信。
無需爲I_WIDTH和I_HEIGHT創建設備內存。你只需將它們作爲值傳遞（而不是指針或引用）。您確實需要設備內存pos。
您需要檢查所有CUDA功能的返回值（例如，cudaMalloc）並確保它們都可以。
當你啓動一個內核時，你的程序可以在GPU完成之前返回。有些情況下您需要明確地等待其完成；您可以在啓動後通過調用cudaDeviceSynchronize()來完成此操作。就你而言，你不必這樣做，因爲你的CUDA memcpy會等到內核完成。

來源

2013-11-26 00:05:15

您的第4點不正確。發佈到同一個流的CUDA操作（這裏它們全部在同一個默認流中）會被自動序列化。'cudaMemcpy'操作要等到'calc'內核完成之後纔會開始。如果只是爲了等待內核完成，則不需要在啓動後調用'cudaDeviceSynchronize()'。 –

@Robert Crovella，你是對的。更新。 –

這肯定是不正確的：

dim3 grid_size((N)/block_size.x, (int) N/block_size.y);

這會導致內核發生越界訪問。您想要啓動總共WIDTH x HEIGHT個線程，圖像中的每個像素對應一個線程。相反，您啓動的是N x N個線程（即N/16 x N/16個線程塊，每塊16 x 16個線程）。

而且看起來你在內核中有一行線程檢查（本應阻止多餘線程的越界訪問），但它的寫法不正確：

if(col > WIDTH || row > HEIGHT || idx > N) return;

例如，這允許idx = N通過線程檢查，但當內核的最後一行寫入時，這並不是一個有效的內存位置：

pos[idx] = color;

你可以這樣修復這個線程檢查：

if(col >= WIDTH || row >= HEIGHT || idx >= N) return;

其他一些意見：

你沒有做任何正確的CUDA錯誤檢查（proper cuda error checking）
如果你用cuda-memcheck運行你的代碼，就可以看到其中的錯誤

來源

2013-11-26 03:56:12

CUDA Mandelbrot集合

回答

相關問題