2016-01-16 78 views
1

我有一個代碼。這裏,A,B,C,A1,B1,C1是3維的向量。 A,B,C是獨立的,A1,B1,C1也是獨立的。我想通過使用openmp進行並行計算。但是,我用openmp運行它,我得到「分段錯誤」錯誤。您能幫我解決這個問題嗎?先謝謝你。通過openmp並行化C++中的three_for_loop

#include <omp.h> 
#include<math.h> 
#include<cmath> 
#include<vector>  
#include<iostream> 

using namespace std; 
int main() 
{ 

int NX=801;    // NUmber of grid in X direction 
int NY=501;    
int NZ=401; 
float PI=3.14159265358979323846; 
unsigned int i,j,k; 
vector<vector<vector<float> > > A (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
vector<vector<vector<float> > > B (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
vector<vector<vector<float> > > C (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
vector<vector<vector<float> > > A1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
vector<vector<vector<float> > > B1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
vector<vector<vector<float> > > C1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 

cout<<"start"<<endl; 
#pragma omp parallel for private (j) shared(A,B,C,i,k,NX,NY,NZ) 
for (i=0;i<NX;i++) 
    for (j=0;j<NY;j++) 
     for (k=0;k<NZ;k++) 
     { 
      A[i][j][k]=sin(2.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      B[i][j][k]=cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      C[i][j][k]=sin(2.0*PI/float(NX*NY*NZ))*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
     } 

#pragma omp parallel for private (j) shared(A1,B1,C1,A,B,C,i,k,NX,NY,NZ) 
for (i=1;i<NX-1;i++) 
    for (j=1;j<NY-1;j++) 
     for (k=1;k<NZ-1;k++) 
     { 
      A1[i][j][k]=C[i+1][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      B1[i][j][k]=A[i][j][k]+B[i][j][k]+C[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      C1[i][j][k]=16.0*A[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
     } 
cout<<"finish"<<endl; 


return 0; 
} 

回答

2

該代碼非常容易與OpenMP並行化。但是,您在自己的嘗試中犯了一些錯誤,特別是通過嘗試聲明ikshared,而他們實際上應該是private。更好的是,不要事先聲明變量,只需在for循環內聲明它們即可。這樣,他們將自動擁有正確的範圍,防止您將其混淆。

下面是它會給:

#include <omp.h> 
#include<math.h> 
#include<cmath> 
#include<vector>  
#include<iostream> 

using namespace std; 
int main() 
{ 

    int NX=801;    // NUmber of grid in X direction 
    int NY=501;    
    int NZ=401; 
    float PI=3.14159265358979323846; 
    vector<vector<vector<float> > > A (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > B (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > C (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > A1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > B1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > C1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 

    cout<<"start"<<endl; 
    #pragma omp parallel for 
    for (int i=0;i<NX;i++) 
     for (int j=0;j<NY;j++) 
      for (int k=0;k<NZ;k++) 
      { 
       A[i][j][k]=sin(2.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
       B[i][j][k]=cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
       C[i][j][k]=sin(2.0*PI/float(NX*NY*NZ))*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      } 

    #pragma omp parallel for 
    for (int i=1;i<NX-1;i++) 
     for (int j=1;j<NY-1;j++) 
      for (int k=1;k<NZ-1;k++) 
      { 
       A1[i][j][k]=C[i+1][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
       B1[i][j][k]=A[i][j][k]+B[i][j][k]+C[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
       C1[i][j][k]=16.0*A[i][j][k]*cos(5.0*PI/float(NX*NY*NZ)*float(i*j*k)); 
      } 
    cout<<"finish"<<endl; 

    return 0; 
} 

現在,既然你問parallelising這個代碼,我猜你有興趣的表現。所以沒有什麼應該阻止你實現一個或兩個非常基本性能的優化是這樣的:

#include <omp.h> 
#include<math.h> 
#include<cmath> 
#include<vector>  
#include<iostream> 

using namespace std; 
int main() 
{ 

    int NX=801;    // NUmber of grid in X direction 
    int NY=501;    
    int NZ=401; 
    float PI=3.14159265358979323846; 
    vector<vector<vector<float> > > A (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > B (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > C (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > A1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > B1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 
    vector<vector<vector<float> > > C1 (NX,vector<vector<float> >(NY,vector <float>(NZ,0.0))); 

    const float PIOverSize = PI/(NX*NY*NZ); 
    const float sin2PIOverSize = sin(2.0f*PIOverSize); 
    cout<<"start"<<endl; 
    double tbeg = omp_get_wtime(); 
    #pragma omp parallel 
    { 
    #pragma omp for 
    for (int i=0;i<NX;i++) 
     for (int j=0;j<NY;j++) 
     { 
      float IJPIOverSize=i*j*PIOverSize; 
      for (int k=0;k<NZ;k++) 
      { 
       A[i][j][k]=sin(2.0f*IJPIOverSize*k); 
       B[i][j][k]=cos(5.0f*IJPIOverSize*k); 
       C[i][j][k]=sin2PIOverSize*cos(5.0f*IJPIOverSize*k); 
      } 
     } 
    #pragma omp for 
    for (int i=1;i<NX-1;i++) 
     for (int j=1;j<NY-1;j++) 
     { 
      float IJPIOverSize=i*j*PIOverSize; 
      for (int k=1;k<NZ-1;k++) 
      { 
       A1[i][j][k]=C[i+1][j][k]*cos(5.0f*IJPIOverSize*k); 
       B1[i][j][k]=A[i][j][k]+B[i][j][k]+C[i][j][k]*cos(5.0f*IJPIOverSize*k); 
       C1[i][j][k]=16.0f*A[i][j][k]*cos(5.0f*IJPIOverSize*k); 
      } 
     } 
    } 
    double time = omp_get_wtime() - tbeg; 
    cout<<"finish in "<<time<<" seconds"<<endl; 

    return 0; 
} 

有了這個,你的代碼應該已經快很多。

+0

現在,我完全理解你的並行化方式。當它在我的電腦上完成時,它的運行速度比沒有並行化時快五倍。我試圖儘可能多地計算代碼,因此看起來非常混亂。但是,您優化的方式非常美麗。非常感謝你。我從你的代碼中學到了很多東西。 –

+0

根據NX和NY的大小,並行for循環中的「collapse(2)」也可能有所幫助,因爲它會暴露更多的並行性。 –