I am doing Harris corner detection on the GPU, and I am observing anomalous CPU behaviour: CPU performance drops after the GPU kernels are called.

Below is my Main.cpp. If I run it with the call "Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);" commented out (this is the function that launches my various GPU kernels), the function calls that read the next frame and convert it to grayscale (cap.read(Masrc) and cvtColor(Masrc, src, CV_BGR2GRAY)) take on average 0.003 and 0.004 seconds per frame.

Surprisingly, when I uncomment the call to my GPU kernel function "Harris_Algo", the very same CPU calls (cap.read(Masrc) and cvtColor(Masrc, src, CV_BGR2GRAY)) take on average 0.009 and 0.008 seconds per frame.

Since timing is very critical in my application, this change wipes out the advantage we gain from the GPU. These two function calls have nothing to do with the GPU, yet they take longer whenever I call my GPU function (the kernels).

What I think is that calling my GPU function (kernels) adds overhead on the CPU, so its utilisation rises and performance drops. But the change is huge. Is there any other suitable way to do this?

Any help is appreciated.

I am using a Jetson TK1 GPU board.

Main.cpp:
#include <iostream>
#include <time.h>
#include <fstream>
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"

using namespace std;
using namespace cv;

// Implemented in Kernal.cu
void Cuda_Free(unsigned char *d_a, unsigned char *d_g, int *dx_My, int *dy_My, int *dxdy_My, int *suppressed, int *corner_response);
void Harris_Algo(unsigned char *a, unsigned char *d_a, unsigned char *d_g, int *dx_My, int *dy_My, int *dxdy_My, int *suppressed, int *corner_response, int *Res, int Height, int length, int SIZE);
void MemAlloc(unsigned char *&d_a, unsigned char *&d_g, int *&dx_My, int *&dy_My, int *&dxdy_My, int *&suppressed, int *&corner_response, int SIZE);

int main(int argc, char** argv)
{
    cv::VideoCapture cap(argv[1]);
    if (!cap.isOpened())
    {
        std::cout << "!!! Failed to open file: " << argv[1] << std::endl;
        return -1;
    }

    double time_spent;
    clock_t begin3, end3, begin4;
    bool start = false;
    Mat src;
    unsigned char *a, *d_a, *d_g;
    int *dx_My, *Res;
    int *dy_My;
    int *dxdy_My;
    int *suppressed;
    int *corner_response;
    int length;
    int Height;
    int SIZE;
    Size S;
    VideoWriter outputVideo;
    Mat Masrc;

    for (;;)
    {
        begin4 = clock();

        // Read the next frame.
        begin3 = clock();
        if (!cap.read(Masrc))
            break;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Read Frame : " << time_spent << endl;

        // Convert to grayscale.
        begin3 = clock();
        cvtColor(Masrc, src, CV_BGR2GRAY);
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Gray Convert : " << time_spent << endl;

        // Allocate host and device buffers once, on the first frame.
        begin3 = clock();
        if (start == false)
        {
            length = src.cols;
            Height = src.rows;
            cout << "Width " << length << endl;
            cout << "Height " << Height << endl;
            SIZE = length * Height;
            Res = new int[SIZE];
            MemAlloc(d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response, SIZE);
            start = true;
        }
        a = src.data;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Initial Processing Time : " << time_spent << endl;

        Harris_Algo(a, d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response, Res, Height, length, SIZE);

        begin3 = clock();
        // imshow("Harris_OUT", Masrc);
        // char key = cvWaitKey(1);
        // if (key == 27) // ESC
        //     break;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Time After Displaying image on Output : " << time_spent << endl;

        time_spent = (double)(end3 - begin4) / CLOCKS_PER_SEC;
        cout << "Overall Time of entire program exec : " << time_spent << endl;
        cout << "-----------------------------------------------------------------------------" << endl;
    }

    Cuda_Free(d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response);
    delete[] Res;   // Res was allocated with new[], so it must be freed with delete[]
    cvWaitKey(0);
}
Kernal.cu:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
__global__ void Harris_Mat(int *corner_response,int* dx_My,int* dy_My,int* dxdy_My,int rows, int cols,int Size)
{
/*...*/
}
__global__ void Supress_Neighbour(int *input,int *output, int rows, int cols, int Size)
{
/* ... */
}
__global__ void VectorGauss(unsigned char *D, unsigned char *M,int Length, int size_m)
{
float Val;
int i = blockIdx . x * blockDim . x + threadIdx . x;
if(i>0 & i<size_m)
{
if ((i%Length) ==(0) || (i%Length) == (Length-1)|| (i<Length) || (i>(size_m-Length))){
M[i] = 0;
}
Val = ((D[i] +(D[Length+i]) + D[2*Length+i]) +(D[i]+ (D[Length+i])+ D[2*Length+i])
+(D[i+1] + D[i+Length+1] + D[2*Length+i+]));
}
}
__global__ void VectorAdd(unsigned char *D,int* dx,int* dy,int* dxdy,int Length, int size_m)
{
/* ... */
}
__host__ void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE)
{
cudaMalloc (&d_a,SIZE*sizeof(unsigned char));
cudaMalloc (&d_g,SIZE*sizeof(unsigned char));
cudaMalloc (&dx_My,SIZE*sizeof(int));
cudaMalloc (&dy_My,SIZE*sizeof(int));
cudaMalloc (&dxdy_My,SIZE*sizeof(int));
cudaMalloc (&suppressed,SIZE*sizeof(int));
cudaMalloc (&corner_response,SIZE*sizeof(int));
}
__host__ void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE)
{
double time_spent;
clock_t begin3, end3;
begin3 = clock();
cudaMemcpy(d_a,a,SIZE*sizeof(unsigned char),cudaMemcpyHostToDevice);
VectorGauss<<< SIZE/512+1,512>>>(d_a,d_g,length,SIZE);
VectorAdd<<< SIZE/512+1,512>>>(d_g,dx_My,dy_My,dxdy_My,length,SIZE);
Harris_Mat<<< SIZE/512+1,512>>>(corner_response,dx_My,dy_My,dxdy_My,Height,length,SIZE);
Supress_Neighbour<<< SIZE/512+1,512>>>(corner_response, suppressed,Height, length, SIZE);
cudaMemcpy(Res,suppressed,SIZE*sizeof(int),cudaMemcpyDeviceToHost);
end3 = clock();
time_spent = (double)(end3 - begin3)/CLOCKS_PER_SEC;
cout<<"Processsing Time of Algorithm : "<<time_spent<<endl;
}
__host__ void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response)
{
cudaFree(d_a);
cudaFree(d_g);
cudaFree(dx_My);
cudaFree(dy_My);
cudaFree(dxdy_My);
cudaFree(corner_response);
cudaFree(suppressed);
}
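As a side note on measuring the GPU section itself: the clock() pair inside Harris_Algo suffers the same problem as the host-side timers (see the comments below), since the final cudaMemcpy blocks while the CPU waits for the kernels. A device-side alternative is CUDA events. Here is a minimal sketch, assuming the same kernels and launch configuration as above (Harris_Algo_Timed is a hypothetical variant, not part of the original code):

__host__ void Harris_Algo_Timed(unsigned char *a, unsigned char *d_a, unsigned char *d_g,
                                int *dx_My, int *dy_My, int *dxdy_My,
                                int *suppressed, int *corner_response,
                                int *Res, int Height, int length, int SIZE)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);                 // timestamp recorded on the device

    cudaMemcpy(d_a, a, SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice);
    VectorGauss<<<SIZE / 512 + 1, 512>>>(d_a, d_g, length, SIZE);
    VectorAdd<<<SIZE / 512 + 1, 512>>>(d_g, dx_My, dy_My, dxdy_My, length, SIZE);
    Harris_Mat<<<SIZE / 512 + 1, 512>>>(corner_response, dx_My, dy_My, dxdy_My, Height, length, SIZE);
    Supress_Neighbour<<<SIZE / 512 + 1, 512>>>(corner_response, suppressed, Height, length, SIZE);
    cudaMemcpy(Res, suppressed, SIZE * sizeof(int), cudaMemcpyDeviceToHost);

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);                // wait until all work before 'stop' is done

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);    // elapsed device time in milliseconds
    cout << "GPU pipeline time : " << ms / 1000.0 << " s" << endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}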
I have compiled with nvcc alone and also with both (nvcc and g++), but with the same result.

Built and run using:
g++-4.8 -c Main.cpp
nvcc -c Kernal.cu
g++-4.8 -o Output Main.o Kernal.o -L/usr/local/cuda/lib -lcudart -lcuda `pkg-config opencv --cflags --libs`
I think your CUDA memory allocation is creating the overhead, since you define the memory only once and free it only once. Why not try cudaMalloc and cudaFree per frame (inside Harris_Algo), given that the algorithm does not need information from previous frames? –
Yes, CUDA memory allocation is expensive, which is exactly why I do it only once for the whole video rather than per frame. If I did it for every frame the overhead would be huge. I have tried that too: the overall time is longer when memory is allocated for every frame. –
Your whole time-measurement methodology is wrong. Please read the man page for clock(). You *cannot* use clock to time things the way you are doing. CPU seconds and seconds are not the same thing. – talonmies
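To illustrate that point: clock() returns CPU time consumed by the calling process, not elapsed wall-clock time, and the CUDA runtime may spin on the CPU while synchronizing with the device, so that CPU time gets billed to whatever happens to be timed around it. A monotonic wall-clock timer avoids this. Below is a minimal sketch (the helper name seconds_since is mine; g++-4.8 needs -std=c++11 for <chrono>):

#include <chrono>

// Returns wall-clock seconds elapsed since t0.
static double seconds_since(std::chrono::steady_clock::time_point t0)
{
    return std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count();
}

// Usage inside the frame loop of Main.cpp:
//     auto t0 = std::chrono::steady_clock::now();
//     cap.read(Masrc);
//     cout << "Read Frame : " << seconds_since(t0) << " s" << endl;
//
//     t0 = std::chrono::steady_clock::now();
//     cvtColor(Masrc, src, CV_BGR2GRAY);
//     cout << "Gray Convert : " << seconds_since(t0) << " s" << endl;

If the per-frame numbers are re-measured this way, the time attributed to cap.read and cvtColor should no longer depend on whether the GPU pipeline runs.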