0
我想並行化一個名爲streamcluster的程序。更具體地說,名爲pgain的函數根據我使用的Scalasca工具花費程序的大部分時間,所以這是我應該並行化的函數。在這裏你可以看到這個功能和我的並行化工作。問題在於,我所取得的唯一成果是程序員花更多時間來執行。與openmp並行化streamcluster
原來pgain功能streamcluster:
double pgain (long x, Points *points, double z, long int *numcenters)
{
int i;
int number_of_centers_to_close = 0;
static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;
int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof (double);
if (stride % cl != 0) {
stride = cl * (stride/cl + 1);
}
int K = stride - 2 ; // K==*numcenters
//my own cost of opening x
double cost_of_opening_x = 0;
work_mem = (double*) malloc (2 * stride * sizeof (double));
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;
/*
* For each center, we have a *lower* field that indicates
* how much we will save by closing the center.
*/
int count = 0;
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
center_table[i] = count++;
}
}
work_mem[0] = 0;
//now we finish building the table. clear the working memory.
memset (switch_membership, 0, points->num * sizeof (bool));
memset (work_mem, 0, stride*sizeof (double));
memset (work_mem+stride,0,stride*sizeof (double));
//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
for (i = 0; i < points->num; i++) {
float x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight;
float current_cost = points->p[i].cost;
if (x_cost < current_cost) {
// point i would save cost just by switching to x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
int assign = points->p[i].assign;
lower[center_table[assign]] += current_cost - x_cost;
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
double low = z + work_mem[center_table[i]];
gl_lower[center_table[i]] = low;
if (low > 0) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= low;
}
}
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;
gl_number_of_centers_to_close = (int) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];
// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing
if (gl_cost_of_opening_x < 0) {
// we'd save money by opening x; we'll do it
for (int i = 0; i < points->num; i++) {
bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
if (switch_membership[i] || close_center) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight * dist (points->p[i], points->p[x], points->dim);
points->p[i].assign = x;
}
}
for (int i = 0; i < points->num; i++) {
if (is_center[i] && gl_lower[center_table[i]] > 0) {
is_center[i] = false;
}
}
if (x >= 0 && x < points->num) {
is_center[x] = true;
}
*numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
gl_cost_of_opening_x = 0; // the value we'll return
}
free (work_mem);
return -gl_cost_of_opening_x;
}
而這正是我所做的並行化:
double pgain (long x, Points *points, double z, long int *numcenters)
{
int i;
int number_of_centers_to_close = 0;
static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;
int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof (double);
if (stride % cl != 0) {
stride = cl * (stride/cl + 1);
}
int K = stride - 2 ; // K==*numcenters
//my own cost of opening x
double cost_of_opening_x = 0;
work_mem = (double*) malloc (2 * stride * sizeof (double));
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;
/*
* For each center, we have a *lower* field that indicates
* how much we will save by closing the center.
*/
int count = 0;
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
center_table[i] = count++;
}
}
work_mem[0] = 0;
//now we finish building the table. clear the working memory.
memset (switch_membership, 0, points->num * sizeof (bool));
memset (work_mem, 0, stride*sizeof (double));
memset (work_mem+stride,0,stride*sizeof (double));
//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
float x_cost=0.0;
float current_cost=0.0;
#pragma omp parallel for private(current_cost,x_cost)
shared(cost_of_opening_x)
for (i = 0; i < points->num; i++) {
x_cost = dist (points->p[i], points->p[x], points->dim) * points->p[i].weight;
current_cost = points->p[i].cost;
if (x_cost < current_cost) {
// point i would save cost just by switching to // x
// (note that i cannot be a median,
// or else dist(p[i], p[x]) would be 0)
switch_membership[i] = 1;
cost_of_opening_x += x_cost - current_cost;
{
#pragma omp flush(cost_of_opening_x)
}
} else {
// cost of assigning i to x is at least current assignment cost of i
// consider the savings that i's **current** median would realize
// if we reassigned that median and all its members to x;
// note we've already accounted for the fact that the median
// would save z by closing; now we have to subtract from the savings
// the extra cost of reassigning that median and its members
int assign = points->p[i].assign;
lower[center_table[assign]] += current_cost - x_cost;
{
#pragma omp flush(lower)
}
}
#pragma omp barrier
{
#pragma omp flush(lower,cost_of_opening_x)
}
}
// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for (int i = 0; i < points->num; i++) {
if (is_center[i]) {
double low = z + work_mem[center_table[i]];
gl_lower[center_table[i]] = low;
if (low > 0) {
// i is a median, and
// if we were to open x (which we still may not) we'd close i
// note, we'll ignore the following quantity unless we do open x
++number_of_centers_to_close;
cost_of_opening_x -= low;
}
}
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;
gl_number_of_centers_to_close = (int) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];
// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing
if (gl_cost_of_opening_x < 0) {
// we'd save money by opening x; we'll do it
#pragma omp parallel for
for (int i = 0; i < points->num; i++) {
bool close_center = gl_lower[center_table[points->p[i].assign]] > 0
;
if (switch_membership[i] || close_center) {
// Either i's median (which may be i itself) is closing,
// or i is closer to x than to its current median
points->p[i].cost = points->p[i].weight * dist (points->p[i],
points->p[x], points->dim);
points->p[i].assign = x;
}
}
for (int i = 0; i < points->num; i++) {
if (is_center[i] && gl_lower[center_table[i]] > 0) {
is_center[i] = false;
}
}
if (x >= 0 && x < points->num) {
is_center[x] = true;
}
*numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
gl_cost_of_opening_x = 0; // the value we'll return
}
free (work_mem);
return -gl_cost_of_opening_x;
}
你能看到的任何改進或變更,這將使它更快嗎?先謝謝你。