C-pthreads似乎只使用一個核心

先說明一下：這是學校的作業，但我並不是來求人幫我完成作業的，我只是對自己得到的一些結果感到困惑。

我有一個簡單的程序，使用辛普森法則（Simpson's rule）逼近pi。在上一個作業中，我們必須通過產生4個子進程來完成此任務；而在這次作業中，我們必須使用4個內核級線程。我已經做出來了，但是使用子進程的那個版本運行起來似乎更快（我原本以爲應該會看到相反的結果）。

下面是使用pthread的程序：

#include <stdio.h> 
#include <unistd.h> 
#include <pthread.h> 
#include <stdlib.h> 

// Simpson's-rule term for sample index x: even indices contribute
// 2*func(x/32000) and odd indices contribute 4*func(x/32000).
// The argument is fully parenthesized so compound expressions expand
// correctly, and the divisor is a double so integer arguments are not
// truncated. Note that x is evaluated more than once: do not pass an
// expression with side effects.
#define TERN_STMT(x) ((((int)(x)) % 2 == 0) ? 2 * func((x) / 32000.0) : 4 * func((x) / 32000.0))

// Set to 0 to disable the timing loop. When non-zero, each of the 4 worker
// threads repeats its partial sum 25,000 times (100,000 repetitions in total).
#define SPEED_TEST 1

// Range of sample indices handed to one worker thread. Both bounds are
// inclusive (the worker loops while start <= end). The indices are stored as
// doubles because they are divided by 32000 to obtain the sample position.
struct func_range {
    double start; // first sample index of the range
    double end;   // last sample index of the range (inclusive)
};

// Integrand from the assignment: f(x) = 4 / (1 + x^2).
double func(double x)
{
    const double denominator = 1.0 + x * x;

    return 4.0 / denominator;
}

void *partial_sum(void *r) 
{ 
    double *ret = (double *)malloc(sizeof(double)); 
    struct func_range *range = r; 
#if SPEED_TEST 
    int k; 
    double begin = range->start; 
    for (k = 0; k < 25000; k++) 
    { 
    range->start = begin; 
    *ret = 0; 
#endif 
    for (; range->start <= range->end; ++range->start) 
     *ret += TERN_STMT(range->start); 
#if SPEED_TEST 
    } 
#endif 

    return ret; 
} 

int main() 
{ 
    // An array for our threads. 
    pthread_t threads[4]; 
    double total_sum = func(0); 
    void *temp; 
    struct func_range our_range; 
    int i; 

    for (i = 0; i < 4; i++) 
    { 
    our_range.start = (i == 0) ? 1 : (i == 1) ? 8000 : (i == 2) ? 16000 : 24000; 
    our_range.end = (i == 0) ? 7999 : (i == 1) ? 15999 : (i == 2) ? 23999 : 31999; 
    pthread_create(&threads[i], NULL, &partial_sum, &our_range); 
    pthread_join(threads[i], &temp); 
    total_sum += *(double *)temp; 
    free(temp); 
    } 

    total_sum += func(1); 

    // Final calculations 
    total_sum /= 3.0; 
    total_sum *= (1.0/32000.0); 

    // Print our result 
    printf("%f\n", total_sum); 

    return EXIT_SUCCESS; 
}

下面是一個使用子進程：

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

// Simpson's-rule term for sample index x: even indices contribute
// 2*func(x/32000) and odd indices contribute 4*func(x/32000).
// The argument is fully parenthesized so compound expressions expand
// correctly, and the divisor is a double so integer arguments are not
// truncated. Note that x is evaluated more than once: do not pass an
// expression with side effects.
#define TERN_STMT(x) ((((int)(x)) % 2 == 0) ? 2 * func((x) / 32000.0) : 4 * func((x) / 32000.0))

// Set to 0 to disable the timing loop. When non-zero, each of the 4 child
// processes repeats its partial sum 25,000 times (100,000 repetitions in total).
#define SPEED_TEST 1

// Integrand from the assignment: f(x) = 4 / (1 + x^2).
double func(double x)
{
    const double denominator = 1.0 + x * x;

    return 4.0 / denominator;
}

// Approximates pi with Simpson's rule over 32000 intervals using 4 child
// processes. Each child sums one quarter of the sample points and writes its
// partial sum (a raw double) to a shared pipe; the parent reads the four
// values back and combines them.
int main()
{
    // An array for our subprocesses.
    pid_t pids[4];
    // The pipe to pass-through information
    int mypipe[2];
    // Counter for subprocess loops
    double j;
    // Counter for outer loop
    int i;
    // Number of partial sums still expected from the children
    int n = 4;
    // The final sum
    double total_sum = 0;
    // Temporary variable holding the result from a subprocess
    double temp;
    // The partial sum tallied by a subprocess.
    double sum = 0;
#if SPEED_TEST
    int k;
#endif

    if (pipe(mypipe))
    {
        perror("pipe");
        return EXIT_FAILURE;
    }

    // Create the child processes
    for (i = 0; i < 4; i++)
    {
        // Abort if something went wrong
        if ((pids[i] = fork()) < 0)
        {
            perror("fork");
            abort();
        }
        else if (pids[i] == 0)
        {
            // Child: the braces around this branch matter. Without them only
            // the computation would belong to the child, and the parent
            // would also fall through to the write()/exit() below.
            close(mypipe[0]); // the child only writes

            // Depending on which child we are we'll only calculate
            // 1/4 of the total.
#if SPEED_TEST
            for (k = 0; k < 25000; ++k)
            {
                sum = 0;
#endif
                switch (i)
                {
                    case 0:
                        sum += func(0);
                        for (j = 1; j <= 7999; ++j)
                        {
                            sum += TERN_STMT(j);
                        }
                        break;
                    case 1:
                        for (j = 8000; j <= 15999; ++j)
                        {
                            sum += TERN_STMT(j);
                        }
                        break;
                    case 2:
                        for (j = 16000; j <= 23999; ++j)
                        {
                            sum += TERN_STMT(j);
                        }
                        break;
                    case 3:
                        for (j = 24000; j < 32000; ++j)
                        {
                            sum += TERN_STMT(j);
                        }
                        sum += func(1);
                        break;
                    default:
                        break;
                }
#if SPEED_TEST
            }
#endif
            // Write the data to the pipe
            if (write(mypipe[1], &sum, sizeof(sum)) != (ssize_t)sizeof(sum))
            {
                perror("write");
                exit(EXIT_FAILURE);
            }
            exit(0);
        }
    }

    // Parent: close the write end so that read() reports end-of-file instead
    // of blocking forever if a child exits without writing its result.
    close(mypipe[1]);

    while (n > 0)
    {
        int status;
        ssize_t got;

        // Wait for the calculations to finish
        if (wait(&status) < 0)
        {
            perror("wait");
            return EXIT_FAILURE;
        }
        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
        {
            fprintf(stderr, "a child process failed\n");
            return EXIT_FAILURE;
        }

        // Read one partial sum from the pipe
        got = read(mypipe[0], &temp, sizeof(temp));
        if (got != (ssize_t)sizeof(temp))
        {
            if (got < 0)
            {
                perror("read");
            }
            else
            {
                fprintf(stderr, "short read from pipe\n");
            }
            return EXIT_FAILURE;
        }

        // Add to the total
        total_sum += temp;
        n--;
    }

    close(mypipe[0]);

    // Final calculations: Simpson's rule scales the weighted sum by h/3,
    // with step size h = 1/32000.
    total_sum /= 3.0;
    total_sum *= (1.0/32000.0);

    // Print our result
    printf("%f\n", total_sum);

    return EXIT_SUCCESS;
}

下面是pthread版本運行10萬次時，time命令給出的結果：

real 11.15 
user 11.15 
sys 0.00

這裏是子進程版本：

real 5.99 
user 23.81 
sys 0.00

擁有23.81的用戶時間意味着這是每個內核執行代碼所花時間的總和。在pthread分析中，實際/用戶時間相同，這意味着僅使用一個內核。爲什麼不使用全部4核？我認爲默認情況下它可能比子進程更好。

希望這個問題是有道理的，這是我第一次用pthread編程，而且我對OS級編程一般都很陌生。

感謝您花時間閱讀這個冗長的問題。

來源

2014-02-14 LainIwakura

當您在pthread_create之後立即調用pthread_join時，實際上就把所有線程串行化了。應該等到所有線程都創建完畢、並且做完所有不依賴線程計算結果的其他工作之後，再去join這些線程。

來源

2014-02-14 00:24:51

我把'pthread_join'從for循環中移了出來，放到它自己的for循環裏，並把累加和free也一併移了過去，但是現在每次運行都得到不同的答案……顯然，我對線程還有一些不瞭解的地方，有沒有什麼資料可以讓我閱讀，說明如何解決這類問題？ – LainIwakura

@LainIwakura：你的意思是「請用兩行評論解釋多線程編程是如何運作的」？恐怕這有點超出範圍。你的課上沒有教這個嗎？ –

我不想這麼說，但是這門課很爛。我弄清楚了爲什麼會得到不同的值——各個線程共享了同一個範圍結構。我通過建立一個結構數組解決了這個問題……現在我得到了正確的答案，但程序反而更慢了！（實際時間16.52，用戶時間65.05！）看起來我的所有核心都被用上了，但性能並沒有超過使用fork()的那個版本 – LainIwakura

C-pthreads似乎只使用一個核心

回答

相關問題