我們已經給出了使用openmp優化結構不良的程序的任務。我是很新的節目,所以我希望每個人都可以闡明如何優化該功能的一些光(許多我需要優化的一個):對函數的OpenMP優化
void
entry_type3(F2D *sData, F2D *ones, F2D *quat, F2D **pos, F2D **vel)
{
//Observation
F2D *t;
t = fSetArray(1, 3, 0);
asubsref(t,2) = -9.8;
F2D *accl = fDeepCopyRange(sData, 0, 1, 0, 3);
F2D *gtemp = fMtimes(ones, t);
F2D *gravity = quatRot(gtemp, quat);
fFreeHandle(gtemp);
fFreeHandle(t);
t = fSetArray(3,3,0);
asubsref(t,0) = 1;
asubsref(t,4) = 1;
asubsref(t,8) = 1;
int n = ones->height;
int i;
for(i=0; i<(t->height*t->width); i++)
asubsref(t,i) = asubsref(t,i)/STDDEV_ACCL;
F2D *w = mcl(gravity, accl, t);
generateSample(w, quat, *vel, *pos);
fFreeHandle(t);
//Motion model
t = fMtimes(ones, accl);
fFreeHandle(accl);
accl = fMinus(t, gravity);
fFreeHandle(w);
fFreeHandle(gravity);
fFreeHandle(t);
F2D *is;
#pragma omp parallel sections
{
#pragma omp section
{
F2D *is = quatConj(quat);
F2D *s = quatRot(*vel, is);
fFreeHandle(is);
for(i=0; i<(s->height*s->width); i++)
{
asubsref(s,i) = asubsref(s,i)*acclTimeInterval;
}
is = fPlus(*pos, s);
fFreeHandle(*pos);
*pos = fDeepCopy(is);
fFreeHandle(is);
fFreeHandle(s);
}
/** pos_ above stores: pos+quatRot(vel,quatConj(quat))*acclTimeInterval **/
#pragma omp section
{
F2D *is = quatConj(quat);
F2D *s = quatRot(accl, is);
F2D* t = fDeepCopy(s);
for(i=0; i<(s->height*s->width); i++)
{
asubsref(t,i) = 1/2*asubsref(s,i)*acclTimeInterval*acclTimeInterval;
}
/** t_ above stores: 1/2*quatRot(accl,quatCong(quat))*acclTimeInterval^2 **/
fFreeHandle(s);
fFreeHandle(is);
s = randnWrapper(n,3);
for(i=0; i<(s->height*s->width); i++)
{
asubsref(s,i) = asubsref(s,i) * M_STDDEV_POS;
}
/** s_ above stores: randn(n,3)*M_STDDEV_POS **/
is = fPlus(*pos, t);
fFreeHandle(*pos);
*pos = fPlus(is, s);
fFreeHandle(s);
fFreeHandle(t);
fFreeHandle(is);
}
}
//vel=vel+accl*acclTimeInterval+randn(n,3)*M_STDDEV_VEL;
#pragma omp parallel sections
{
#pragma omp section
{
F2D *t = fDeepCopy(accl);
#pragma omp parallel for
for(i=0; i<(accl->height*accl->width); i++)
{
asubsref(t,i) = asubsref(accl,i) * acclTimeInterval;
}
is = fPlus(*vel, t);
fFreeHandle(accl);
fFreeHandle(t);
}
#pragma omp section
{
F2D *s = randnWrapper(n,3);
#pragma omp parallel for
for(i=0; i<(s->height*s->width); i++)
{
asubsref(s,i) = asubsref(s,i) * M_STDDEV_VEL;
}
fFreeHandle(*vel);
*vel = fPlus(is, s);
fFreeHandle(is);
fFreeHandle(s);
}
}
}
我已經添加了幾個OpenMP的相似之處,但它仍然是跑得非常緩慢,所以我希望如果你老練的老將能指出我應該看哪些位置來提高表現。
您正在使用OpenMP部分。你真的確定,這兩個代碼段(在每個'section'結構中)是否可以獨立執行,即它們之間沒有數據依賴關係?非常緩慢的執行通常是線程之間共享(虛假)的標誌。 –