與openmp並行化streamcluster

我想並行化一個名爲streamcluster的程序。更具體地說，名爲pgain的函數根據我使用的Scalasca工具花費程序的大部分時間，所以這是我應該並行化的函數。在這裏你可以看到這個功能和我的並行化工作。問題在於，我所取得的唯一成果是程序員花更多時間來執行。與openmp並行化streamcluster

原來pgain功能streamcluster：

double pgain (long x, Points *points, double z, long int *numcenters) 
{ 
int i; 
int number_of_centers_to_close = 0; 

static double *work_mem; 
static double gl_cost_of_opening_x; 
static int gl_number_of_centers_to_close; 

int stride = *numcenters + 2; 
//make stride a multiple of CACHE_LINE 
int cl = CACHE_LINE/sizeof (double); 
if (stride % cl != 0) { 
    stride = cl * (stride/cl + 1); 
} 
int K = stride - 2 ; // K==*numcenters 

//my own cost of opening x 
double cost_of_opening_x = 0; 

work_mem = (double*) malloc (2 * stride * sizeof (double)); 
gl_cost_of_opening_x = 0; 
gl_number_of_centers_to_close = 0; 

/* 
* For each center, we have a *lower* field that indicates 
* how much we will save by closing the center. 
*/ 
int count = 0; 
for (int i = 0; i < points->num; i++) { 
    if (is_center[i]) { 
     center_table[i] = count++; 
    } 
} 
work_mem[0] = 0; 

//now we finish building the table. clear the working memory. 
memset (switch_membership, 0, points->num * sizeof (bool)); 
memset (work_mem, 0, stride*sizeof (double)); 
memset (work_mem+stride,0,stride*sizeof (double)); 

//my *lower* fields 
double* lower = &work_mem[0]; 
//global *lower* fields 
double* gl_lower = &work_mem[stride]; 

for (i = 0; i < points->num; i++) { 
    float x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight; 
    float current_cost = points->p[i].cost; 

    if (x_cost < current_cost) { 

     // point i would save cost just by switching to x 
     // (note that i cannot be a median, 
     // or else dist(p[i], p[x]) would be 0) 

     switch_membership[i] = 1; 
     cost_of_opening_x += x_cost - current_cost; 

    } else { 

     // cost of assigning i to x is at least current assignment cost of i 

     // consider the savings that i's **current** median would realize 
     // if we reassigned that median and all its members to x; 
     // note we've already accounted for the fact that the median 
     // would save z by closing; now we have to subtract from the savings 
     // the extra cost of reassigning that median and its members 
     int assign = points->p[i].assign; 
     lower[center_table[assign]] += current_cost - x_cost; 
    } 
} 

// at this time, we can calculate the cost of opening a center 
// at x; if it is negative, we'll go through with opening it 

for (int i = 0; i < points->num; i++) { 
    if (is_center[i]) { 
     double low = z + work_mem[center_table[i]]; 
     gl_lower[center_table[i]] = low; 
     if (low > 0) { 
      // i is a median, and 
      // if we were to open x (which we still may not) we'd close i 

      // note, we'll ignore the following quantity unless we do open x 
      ++number_of_centers_to_close; 
      cost_of_opening_x -= low; 
     } 
    } 
} 
//use the rest of working memory to store the following 
work_mem[K] = number_of_centers_to_close; 
work_mem[K+1] = cost_of_opening_x; 

gl_number_of_centers_to_close = (int) work_mem[K]; 
gl_cost_of_opening_x = z + work_mem[K+1]; 

// Now, check whether opening x would save cost; if so, do it, and 
// otherwise do nothing 

if (gl_cost_of_opening_x < 0) { 
    // we'd save money by opening x; we'll do it 
    for (int i = 0; i < points->num; i++) { 
     bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ; 
     if (switch_membership[i] || close_center) { 
      // Either i's median (which may be i itself) is closing, 
      // or i is closer to x than to its current median 
      points->p[i].cost = points->p[i].weight * dist (points->p[i], points->p[x], points->dim); 
      points->p[i].assign = x; 
     } 
    } 
    for (int i = 0; i < points->num; i++) { 
     if (is_center[i] && gl_lower[center_table[i]] > 0) { 
      is_center[i] = false; 
     } 
    } 
    if (x >= 0 && x < points->num) { 
     is_center[x] = true; 
    } 

    *numcenters = *numcenters + 1 - gl_number_of_centers_to_close; 
} else { 
    gl_cost_of_opening_x = 0; // the value we'll return 
} 

free (work_mem); 

return -gl_cost_of_opening_x; 
}

而這正是我所做的並行化：

double pgain (long x, Points *points, double z, long int *numcenters) 
{ 
int i; 
int number_of_centers_to_close = 0; 

static double *work_mem; 
static double gl_cost_of_opening_x; 
static int gl_number_of_centers_to_close; 

int stride = *numcenters + 2; 
//make stride a multiple of CACHE_LINE 
int cl = CACHE_LINE/sizeof (double); 
if (stride % cl != 0) { 
    stride = cl * (stride/cl + 1); 
} 
int K = stride - 2 ; // K==*numcenters 

//my own cost of opening x 
double cost_of_opening_x = 0; 

work_mem = (double*) malloc (2 * stride * sizeof (double)); 
gl_cost_of_opening_x = 0; 
gl_number_of_centers_to_close = 0; 

/* 
* For each center, we have a *lower* field that indicates 
* how much we will save by closing the center. 
*/ 
int count = 0; 
for (int i = 0; i < points->num; i++) { 
    if (is_center[i]) { 
     center_table[i] = count++; 
    } 
} 
work_mem[0] = 0; 

//now we finish building the table. clear the working memory. 
memset (switch_membership, 0, points->num * sizeof (bool)); 
memset (work_mem, 0, stride*sizeof (double)); 
memset (work_mem+stride,0,stride*sizeof (double)); 

//my *lower* fields 
double* lower = &work_mem[0]; 
//global *lower* fields 
double* gl_lower = &work_mem[stride]; 
float x_cost=0.0; 
float current_cost=0.0; 

#pragma omp parallel for private(current_cost,x_cost) 
shared(cost_of_opening_x) 


for (i = 0; i < points->num; i++) { 

    x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight; 
    current_cost = points->p[i].cost; 

    if (x_cost < current_cost) { 

     // point i would save cost just by switching to    // x 
     // (note that i cannot be a median, 
     // or else dist(p[i], p[x]) would be 0) 

     switch_membership[i] = 1; 

     cost_of_opening_x += x_cost - current_cost; 
     { 
     #pragma omp flush(cost_of_opening_x) 
     } 
    } else { 

     // cost of assigning i to x is at least current assignment cost of i 

     // consider the savings that i's **current** median would realize 
     // if we reassigned that median and all its members to x; 
     // note we've already accounted for the fact that the median 
     // would save z by closing; now we have to subtract from the savings 
     // the extra cost of reassigning that median and its members 
     int assign = points->p[i].assign; 

     lower[center_table[assign]] += current_cost - x_cost; 
     { 
     #pragma omp flush(lower) 
     } 
    } 
#pragma omp barrier 
{ 
#pragma omp flush(lower,cost_of_opening_x) 
} 
} 

// at this time, we can calculate the cost of opening a center 
// at x; if it is negative, we'll go through with opening it 
for (int i = 0; i < points->num; i++) { 
    if (is_center[i]) { 
     double low = z + work_mem[center_table[i]]; 
     gl_lower[center_table[i]] = low; 
     if (low > 0) { 
      // i is a median, and 
      // if we were to open x (which we still may not) we'd close i 

      // note, we'll ignore the following quantity unless we do open x 
      ++number_of_centers_to_close; 
      cost_of_opening_x -= low; 
     } 
    } 
} 
//use the rest of working memory to store the following 
work_mem[K] = number_of_centers_to_close; 
work_mem[K+1] = cost_of_opening_x; 

gl_number_of_centers_to_close = (int) work_mem[K]; 
gl_cost_of_opening_x = z + work_mem[K+1]; 

// Now, check whether opening x would save cost; if so, do it, and 
// otherwise do nothing 

if (gl_cost_of_opening_x < 0) { 
    // we'd save money by opening x; we'll do it 
    #pragma omp parallel for 
    for (int i = 0; i < points->num; i++) { 
     bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 
; 
     if (switch_membership[i] || close_center) { 
      // Either i's median (which may be i itself) is closing, 
      // or i is closer to x than to its current median 
      points->p[i].cost = points->p[i].weight * dist (points->p[i], 
points->p[x], points->dim); 
      points->p[i].assign = x; 
     } 
    } 
    for (int i = 0; i < points->num; i++) { 
     if (is_center[i] && gl_lower[center_table[i]] > 0) { 
      is_center[i] = false; 
     } 
    } 
    if (x >= 0 && x < points->num) { 
     is_center[x] = true; 
    } 

    *numcenters = *numcenters + 1 - gl_number_of_centers_to_close; 
} else { 
    gl_cost_of_opening_x = 0; // the value we'll return 
} 

free (work_mem); 

return -gl_cost_of_opening_x; 
}

你能看到的任何改進或變更，這將使它更快嗎？先謝謝你。

來源

2017-05-31 Mike Dos

我沒有承擔你的代碼的邏輯（串行和並行）。但我試圖找出可以並行化的一些片段。我沒有編譯我所建議的代碼。所以，我的答案的想法是指出一些可能使你的代碼更快。自然，必須驗證併發貨幣，必須分析共享和私有變量等。基於此，我的建議是：

double pgain (long x, Points *points, double z, long int *numcenters) 
{ 
    int i; 
    int number_of_centers_to_close = 0; 

    static double *work_mem; 
    static double gl_cost_of_opening_x; 
    static int gl_number_of_centers_to_close; 

    int stride = *numcenters + 2; 
    //make stride a multiple of CACHE_LINE 
    int cl = CACHE_LINE/sizeof (double); 
    if (stride % cl != 0) { 
     stride = cl * (stride/cl + 1); 
    } 
    int K = stride - 2 ; // K==*numcenters 

    //my own cost of opening x 
    double cost_of_opening_x = 0; 
    work_mem = (double*) malloc (2 * stride * sizeof (double)); 
    gl_cost_of_opening_x = 0; 
    gl_number_of_centers_to_close = 0; 

    int count = 0; 
    //my *lower* fields 
    double* lower; 
    //global *lower* fields 
    double* gl_lower; 

    #pragma omp parallel 
    { 
     /* 
     * For each center, we have a *lower* field that indicates 
     * how much we will save by closing the center. 
     */ 

     int i; 
     #pragma omp for private(i) 
     for (i = 0; i < points->num; i++) { 
      if (is_center[i]) { 
       #pragma omp critical 
       center_table[i] = count++; 
      } 
     } 

     #pragma omp single 
     work_mem[0] = 0; 

     #pragma omp sections 
     { 
      //now we finish building the table. clear the working memory. 
      #pragma omp section 
      memset (switch_membership, 0, points->num * sizeof (bool)); 

      #pragma omp section 
      memset (work_mem, 0, stride*sizeof (double)); 

      #pragma omp section 
      memset (work_mem+stride,0,stride*sizeof (double)); 
     } 

     #pragma omp single 
     { 
      lower = &work_mem[0]; 
      gl_lower = &work_mem[stride]; 
     } 

     float x_cost, current_cost; 

     #pragma omp for private(i, x_cost, current_cost) 
     for (i = 0; i < points->num; i++) { 

      x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight; 
      current_cost = points->p[i].cost; 

      if (x_cost < current_cost) { 

       // point i would save cost just by switching to x 
       // (note that i cannot be a median, 
       // or else dist(p[i], p[x]) would be 0) 

       switch_membership[i] = 1; 

       #pragma omp critical 
       cost_of_opening_x += x_cost - current_cost; 

      } else { 

       // cost of assigning i to x is at least current assignment cost of i 

       // consider the savings that i's **current** median would realize 
       // if we reassigned that median and all its members to x; 
       // note we've already accounted for the fact that the median 
       // would save z by closing; now we have to subtract from the savings 
       // the extra cost of reassigning that median and its members 
       int assign = points->p[i].assign; 

       #pragma omp critical 
       lower[center_table[assign]] += current_cost - x_cost; 
      } 
     } 

     // at this time, we can calculate the cost of opening a center 
     // at x; if it is negative, we'll go through with opening it 

     double low; 

     #pragma omp for private(i, low) 
     for (int i = 0; i < points->num; i++) { 
      if (is_center[i]) { 
       low = z + work_mem[center_table[i]]; 
       #pragma omp critical 
       gl_lower[center_table[i]] = low; 
       if (low > 0) { 
        // i is a median, and 
        // if we were to open x (which we still may not) we'd close i 

        // note, we'll ignore the following quantity unless we do open x 
        #pragma omp atomic 
        ++number_of_centers_to_close; 
        #pragma omp critical 
        cost_of_opening_x -= low; 
       } 
      } 
     } 

     #pragma omp sections 
     { 
      //use the rest of working memory to store the following 
      #pragma omp section 
      work_mem[K] = number_of_centers_to_close; 

      #pragma omp section 
      work_mem[K+1] = cost_of_opening_x; 

      #pragma omp section 
      gl_number_of_centers_to_close = (int) work_mem[K]; 

      #pragma omp section 
      gl_cost_of_opening_x = z + work_mem[K+1]; 
     } 

     // Now, check whether opening x would save cost; if so, do it, and 
     // otherwise do nothing 
     bool close_center; 

     if (gl_cost_of_opening_x < 0) { 
      // we'd save money by opening x; we'll do it 
      #pragma omp for private(i) 
      for (i = 0; i < points->num; i++) { 
       close_center = gl_lower[center_table[points->p[i].assign]] > 0 ; 
       if (switch_membership[i] || close_center) { 
        // Either i's median (which may be i itself) is closing, 
        // or i is closer to x than to its current median 
        points->p[i].cost = points->p[i].weight * dist (points->p[i], points->p[x], points->dim); 
        points->p[i].assign = x; 
       } 
      } 
      #pragma omp for private(i) 
      for (i = 0; i < points->num; i++) { 
       if (is_center[i] && gl_lower[center_table[i]] > 0) { 
        is_center[i] = false; 
       } 
      } 
      if (x >= 0 && x < points->num) { 
       is_center[x] = true; 
      } 

      #pragma omp single 
      *numcenters = *numcenters + 1 - gl_number_of_centers_to_close; 
     } else { 
      #pragma omp single 
      gl_cost_of_opening_x = 0; // the value we'll return 
     } 

     #pragma omp single 
     free (work_mem); 
    } 

    return -gl_cost_of_opening_x; 
}

來源

2017-05-31 17:51:55 tnas

與openmp並行化streamcluster

回答

相關問題