2016-12-30 41 views
0

我試圖使用cuda從圖像中分離通道。該程序輸出對應於通道的三個圖像。我得到了正確的輸出,但它只顯示了一部分圖像通道。使用cuda從RGBA圖像中分離通道

這是我的代碼:

// main.cpp 
    void separateHelper(const uchar4 *d_rgbaImage, uchar4 *d_channel, const int numRows, const int numCols,int channel); 

    std::string file_name = "test.jpg"; 
    cv::Mat image, rgbaImage; 
    int numRows(){ return rgbaImage.rows; }; 
    int numCols(){ return rgbaImage.cols; }; 

    int main(){ 

    uchar4 *h_rgbaImage, *h_red, *h_green, *h_blue; 
    uchar4 *d_rgbaImage, *d_red, *d_green, *d_blue; 
    cv::Mat red, green, blue; 
    cv::Mat redChannel, greenChannel, blueChannel; 

    image = cv::imread(file_name.c_str(),CV_LOAD_IMAGE_COLOR); 
    if (image.empty()){ 
     std::cerr << "error loading image"; 
     system("pause"); 
     exit(1); 
    } 

    cv::cvtColor(image,rgbaImage, CV_BGR2RGBA); 
    //create space for output 
    red.create(numRows(), numCols(), CV_8UC3); 
    cv::cvtColor(red, redChannel, CV_BGRA2RGBA); 
    green.create(numRows(), numCols(), CV_8UC3); 
    cv::cvtColor(green, greenChannel, CV_BGRA2RGBA); 
    blue.create(numRows(), numCols(), CV_8UC3); 
    cv::cvtColor(blue, blueChannel, CV_BGRA2RGBA); 

    h_rgbaImage = (uchar4*)rgbaImage.ptr<unsigned char>(0); 
    h_red = (uchar4*)redChannel.ptr<unsigned char>(0); 
    h_green = (uchar4*)greenChannel.ptr<unsigned char>(0); 
    h_blue = (uchar4*)blueChannel.ptr<unsigned char>(0); 

    //allocate memory on device 
    const int numPixels = numCols()*numRows(); 
    checkCudaErrors(cudaMalloc((void**)&d_rgbaImage,sizeof(uchar4) * (numPixels + 500))); 
    checkCudaErrors(cudaMalloc((void**)&d_red, sizeof(uchar4) * (numPixels + 500))); 
    checkCudaErrors(cudaMalloc((void**)&d_green, sizeof(uchar4) * (numPixels + 500))); 
    checkCudaErrors(cudaMalloc((void**)&d_blue, sizeof(uchar4) * (numPixels + 500))); 

    //copy image from host to device 
    checkCudaErrors(cudaMemcpy(d_rgbaImage, h_rgbaImage, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice)); 

    //call helper function of kernel 
    separateHelper(d_rgbaImage, d_red, numRows(), numCols(),1); 
    separateHelper(d_rgbaImage, d_green, numRows(), numCols(),2); 
    separateHelper(d_rgbaImage, d_blue, numRows(), numCols(),3); 

    //copy results back to host 
    checkCudaErrors(cudaMemcpy(h_red, d_red, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); 
    checkCudaErrors(cudaMemcpy(h_green, d_green, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); 
    checkCudaErrors(cudaMemcpy(h_blue, d_blue, sizeof(uchar4) * numPixels, cudaMemcpyDeviceToHost)); 

    //change RGBA to BGR 
    cv::cvtColor(redChannel,red,CV_RGBA2BGR); 
    cv::cvtColor(greenChannel,green,CV_RGBA2BGR); 
    cv::cvtColor(blueChannel,blue,CV_RGBA2BGR); 

    cv::namedWindow("RED", CV_WINDOW_AUTOSIZE); 
    cv::imshow("RED", red); 
    cv::namedWindow("GREEN", CV_WINDOW_AUTOSIZE); 
    cv::imshow("GREEN", green); 
    cv::namedWindow("BLUE", CV_WINDOW_AUTOSIZE); 
    cv::imshow("BLUE", blue); 
    cv::waitKey(0); 

    cudaFree(d_rgbaImage); 
    cudaFree(d_red); 
    cudaFree(d_green); 
    cudaFree(d_blue); 
    return 0; 
} 

這是我的GPU代碼:

// kernel.cu 
__global__ void separateChannels(const uchar4* d_rgbaImage,uchar4* d_channel, int numRows, int numCols, int channel){ 
    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y; 
    if (x >= numCols || y >= numRows) 
     return; 
    int index = numRows * y + x; 
    if (channel == 1){ 
     d_channel[index].x = d_rgbaImage[index].x; 
     d_channel[index].y = 0; 
     d_channel[index].z = 0; 
    } 
    else if (channel == 2){ 
     d_channel[index].x = 0; 
     d_channel[index].y = d_rgbaImage[index].y; 
     d_channel[index].z = 0; 
    } 
    else if (channel == 3){ 
     d_channel[index].x = 0; 
     d_channel[index].y = 0; 
     d_channel[index].z = d_rgbaImage[index].z; 
    } 
    d_channel[index].w = 255; 
} 

void separateHelper(const uchar4 *d_rgbaImage, uchar4 *d_channel, 
    const int numRows, const int numCols, int channel){ 


    //set grid and block size 
    int blockWidth = 32; 
    const dim3 blockSize(blockWidth, blockWidth, 1); 
    const dim3 gridSize(((numCols)/32 + 1), ((numRows)/32 + 1), 1); 
    //call kernel 
    separateChannels <<<gridSize, blockSize >>>(d_rgbaImage, d_channel, numRows, numCols, channel); 

    cudaDeviceSynchronize(); 
    checkCudaErrors(cudaGetLastError()); 
}   

錯誤:只有圖像(紅色,綠色和藍色通道圖像)的一部分被示出爲輸出。

+0

,我會建議您儘可能避免使用條件語句中的內核函數。 SM在運行期間將執行每個分支,這肯定會降低您的性能。 – pSoLT

+1

@pSoLT:該內核中沒有發散的代碼路徑。通道是運行時常量。內核基本上是最優的,只要它做到了。 – talonmies

+0

@talonmies我同意,我的壞 - 我沒有仔細閱讀:)但它不會傷害將通道作爲int3傳遞並避免所有這些條件。 – pSoLT

回答

1

我假設它沒有足夠的線程分配執行任務,或者你混淆了x和y座標。一般而言,y方向的條紋分配有列,x方向的條紋分配有行。每行包含numColumns元素,每列包含numRows元素。當您分配線程您按照邏輯:

int blockWidth = 32; 
const dim3 blockSize(blockWidth, blockWidth, 1); 
const dim3 gridSize(((numCols)/32 + 1), ((numRows)/32 + 1), 1); 

但是當你計算索引你不知道。不應

int index = numRows * y + x; 

是:

int index = numColumns * y + x; 

+0

內核中也有(可能)內存訪問越界 – talonmies

+0

如果參數定義良好,OP代碼中沒有越界問題,因爲早期檢查確保x和y值小於圖像的最大分辨率尺寸。在某些時候,一個線程總是會出界,但是OP看起來已經被覆蓋了。 – Andrew

-1

我很激動,我剛剛解決了我的問題!

我的情況是,C++代碼結果是正確的,但GPU代碼結果只顯示完整圖像的四分之一。 這是因爲當從設備到主機的cudaMemcpy時,我設置了錯誤的參數「大小」。

// cudaMemcpy(h_result,d_result,imagesize,cudaMemcpyDeviceToHost);

// cudaMemcpy(h_result,d_result,imagesize * sizeof(float),cudaMemcpyDeviceToHost);

sizeof(float)正好是4個字節!所以我只有四分之一的完整形象。

記得乘以sizeof(數據類型)。

希望我的回答是有用的:)