Solaris 中的 Pthread 互斥鎖 VS 原子操作

我正在用一個簡單的程序做一些測試，測量使用 atomic_add_64 與使用互斥鎖兩種方法對一個64位的值做簡單原子增量的性能。令我百思不得其解的是，atomic_add 竟然比互斥鎖慢2倍！

編輯：我已經做了更多的測試。看起來原子操作比互斥鎖更快，並且可以擴展到8個併發線程。在那之後，原子操作的性能顯着降低。

我測試的平臺是：

SunOS 5.10 Generic_141444-09 sun4u sparc SUNW,Sun-Fire-V490

CC: Sun C++ 5.9 SunOS_sparc Patch 124863-03 2008/03/12

程序很簡單：

#include <stdio.h> 
#include <stdint.h> 
#include <pthread.h> 
#include <atomic.h> 

/* Number of increments each worker thread performs. */
uint64_t  g_Loops = 1000000; 
/* Shared counter that every worker increments (under g_Mutex or via atomic_add_64). */
volatile uint64_t  g_Counter = 0; 
/* Number of workers to start; each worker decrements it when done and main waits for 0. */
volatile uint32_t  g_Threads = 20; 

/* Guards g_Counter in the mutex variant of the benchmark. */
pthread_mutex_t g_Mutex; 
/* Guards g_Threads and is the mutex paired with g_Condition. */
pthread_mutex_t g_CondMutex; 
/* Signalled every time a worker finishes. */
pthread_cond_t g_Condition; 

/* Acquire the global counter mutex g_Mutex; the return code is not checked. */
void LockMutex() 
{ 
    (void)pthread_mutex_lock(&g_Mutex); 
} 

/* Release the global counter mutex g_Mutex; the return code is not checked. */
void UnlockMutex() 
{ 
    (void)pthread_mutex_unlock(&g_Mutex); 
} 

/*
 * Initialise the completion mutex and its condition variable, both with
 * default attributes. Return codes are not checked.
 */
void InitCond() 
{ 
    (void)pthread_mutex_init(&g_CondMutex, NULL); 
    (void)pthread_cond_init(&g_Condition, NULL); 
} 

/*
 * Report that one worker has finished: while holding g_CondMutex, decrement
 * the outstanding-thread count and signal g_Condition so the waiter can
 * re-check it.
 */
void SignalThreadEnded() 
{ 
    pthread_mutex_lock(&g_CondMutex); 
    g_Threads = g_Threads - 1; 
    pthread_cond_signal(&g_Condition); 
    pthread_mutex_unlock(&g_CondMutex); 
} 

void* ThreadFuncMutex(void* arg) 
{ 
    uint64_t counter = g_Loops; 
    while(counter--) 
    { 
     LockMutex(); 
     ++g_Counter; 
     UnlockMutex(); 
    } 
    SignalThreadEnded(); 
    return 0; 
} 

void* ThreadFuncAtomic(void* arg) 
{ 
    uint64_t counter = g_Loops; 
    while(counter--) 
    { 
     atomic_add_64(&g_Counter, 1); 
    } 
    SignalThreadEnded(); 
    return 0; 
} 


int main(int argc, char** argv) 
{ 
    pthread_mutex_init(&g_Mutex, 0); 
    InitCond(); 
    bool bMutexRun = true; 
    if(argc > 1) 
    { 
     bMutexRun = false; 
     printf("Atomic run!\n"); 
    } 
    else 
     printf("Mutex run!\n"); 

    // start threads 
    uint32_t threads = g_Threads; 
    while(threads--) 
    { 
     pthread_t thr; 
     if(bMutexRun) 
     pthread_create(&thr, 0,ThreadFuncMutex, 0); 
     else 
     pthread_create(&thr, 0,ThreadFuncAtomic, 0); 
    } 
    pthread_mutex_lock(&g_CondMutex); 
    while(g_Threads) 
    { 
     pthread_cond_wait(&g_Condition, &g_CondMutex); 
     printf("Threads to go %d\n", g_Threads); 
    } 
    printf("DONE! g_Counter=%ld\n", (long)g_Counter); 
}

在我們的機器上測試運行的結果是：

$ CC -o atomictest atomictest.C 
$ time ./atomictest 
Mutex run! 
Threads to go 19 
... 
Threads to go 0 
DONE! g_Counter=20000000 

real 0m15.684s 
user 0m52.748s 
sys  0m0.396s 

$ time ./atomictest 1 
Atomic run! 
Threads to go 19 
... 
Threads to go 0 
DONE! g_Counter=20000000 

real 0m24.442s 
user 3m14.496s 
sys  0m0.068s

你是否在 Solaris 上遇到過這種性能差異？有沒有人知道爲什麼會發生這種情況？

在 Linux 上，相同的代碼（使用 gcc 的 __sync_fetch_and_add）比互斥鎖版本的性能提升了5倍。

感謝， Octav

來源

2012-04-16 Octav Chiriac

這其實不應該讓人感到多麼意外。「無鎖」不一定意味着「更快」。 – 2012-04-16 15:40:38

我同意這一點，儘管通常認爲原子操作比互斥鎖更高效，至少對於像這樣的簡單情況是如此：例如 Windows 上的 InterlockedIncrement 比 CriticalSection + 增量快，Linux 上的 __sync_fetch_and_add 比 pthread_mutex_lock + 增量快得多。如果原子操作比互斥鎖慢兩倍，我們爲什麼還要在 Solaris 上使用它們？使用原子操作能產生更好性能的用例是什麼？ – 2012-04-16 15:49:55

請在發佈前正確地縮進您的代碼。 – 2012-04-16 15:54:19

你必須要小心，這裏發生了什麼。

1）它需要顯著的時間來創建一個線程。因此，它可能並不是所有的線程都在同時執行。作爲證據，我拿走了您的代碼並刪除了互斥鎖，並在每次運行時都得到了正確的答案。這意味着沒有任何線程正在同時執行！你不應該把時間計算在測試中創建/破壞線程。在開始測試之前，您應該等到所有線程都已創建並運行。

2）您的測試不公平。您的測試具有非常高的鎖爭用。不知出於何種原因，原子的 add_and_fetch 在這種情況下表現不佳。在現實生活中，你會在線程中做一些工作。一旦你添加了哪怕一點點工作，原子操作的表現就會更好。這是因爲發生競爭條件的機會顯着下降。沒有爭用時，原子操作的開銷較低；沒有爭用時，互斥鎖的開銷比原子操作更多。

3）線程數。運行的線程越少，爭用就越少。這就是爲什麼在這個測試中更少的線程對原子操作更有利。您的8個線程這個數字可能是您的系統支持的併發線程數。也可能不是，因爲你的測試嚴重偏向爭用。在我看來，你的測試可以擴展到允許的併發線程數量，然後趨於平穩。我不明白的一件事是，爲什麼當線程數超過系統可以同時處理的線程數時，我們看不到線程在持有互斥鎖時進入睡眠的跡象。也許確實發生了，只是我看不出來。

總之，在大多數現實情況下，原子操作要快得多。當你必須長時間持有鎖時，它們並不是很合適......不過這本來就是你應該避免的事情（至少在我看來！）

我改變了你的代碼，所以你可以測試沒有工作，幾乎沒有任何工作，多一點工作，以及更改線程數。

6sm = 6個線程，幾乎沒有任何工作，互斥鎖；6s = 6個線程，幾乎沒有任何工作，原子操作

使用大寫的 S 來完成更多的工作，不帶 s 則完全沒有工作。

這些結果表明，使用10個線程時，工作量會影響原子操作快多少。在第一種情況下，沒有工作，原子操作只快一點點。加一點點工作，差距擴大到6秒；加上大量工作，差距幾乎達到10秒。

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=10; a.out $t ; a.out "$t"m 
ATOMIC FAST g_Counter=10000000 13.6520 s 
MUTEX FAST g_Counter=10000000 15.2760 s 

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=10s; a.out $t ; a.out "$t"m 
ATOMIC slow g_Counter=10000000 11.4957 s 
MUTEX slow g_Counter=10000000 17.9419 s 

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=10S; a.out $t ; a.out "$t"m 
ATOMIC SLOW g_Counter=10000000 14.7108 s 
MUTEX SLOW g_Counter=10000000 23.8762 s

20個線程時，原子操作仍然更好，但優勢較小。沒有工作時，它們幾乎是一樣的速度。加上大量的工作，原子操作再次領先。

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=20; a.out $t ; a.out "$t"m 
ATOMIC FAST g_Counter=20000000 27.6267 s 
MUTEX FAST g_Counter=20000000 30.5569 s 

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=20S; a.out $t ; a.out "$t"m 
ATOMIC SLOW g_Counter=20000000 35.3514 s 
MUTEX SLOW g_Counter=20000000 48.7594 s

2個線程。原子操作占主導地位。

(2) /dev_tools/Users/c698174/temp/atomic 
[[email protected]] $ t=2S; a.out $t ; a.out "$t"m 
ATOMIC SLOW g_Counter=2000000 0.6007 s 
MUTEX SLOW g_Counter=2000000 1.4966 s

下面是代碼（RedHat Linux 上，使用 gcc 的原子操作）：

#include <stdint.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <time.h> 
#include <pthread.h> 

/*
 * Shared benchmark state. Each variable below is given 64-byte alignment.
 * NOTE(review): presumably so that each lands on its own cache line --
 * confirm against the target CPU's line size.
 */
/* Iterations each worker performs. */
volatile uint64_t __attribute__((aligned (64))) g_Loops = 1000000 ; 
/* Counter incremented by every worker (under g_Mutex or atomically). */
volatile uint64_t __attribute__((aligned (64))) g_Counter = 0; 
/* Number of worker threads; overwritten from argv[1] in main. */
volatile uint32_t __attribute__((aligned (64))) g_Threads = 7; 
/* Workers currently between Start() and End(). */
volatile uint32_t __attribute__((aligned (64))) g_Active = 0; 
/* Start flag: workers spin in Start() until main sets it to 1. */
volatile uint32_t __attribute__((aligned (64))) g_fGo = 0; 
/* Work mode: 0 = no extra work, 1 = 's' (read the counter back), 2 = 'S' (also call Work()). */
int g_fSlow = 0; 

/* Boolean constants; not referenced by the code visible in this listing. */
#define true 1 
#define false 0 
/* Convert a struct timespec to a nanosecond count. */
#define NANOSEC(t) (1000000000ULL * (t).tv_sec + (t).tv_nsec) 

/* Guards g_Counter in the mutex variant. */
pthread_mutex_t g_Mutex; 
/* Declared but not used by the code visible in this listing. */
pthread_mutex_t g_CondMutex; 
pthread_cond_t g_Condition; 

/* Acquire the global counter mutex g_Mutex; the return code is not checked. */
void LockMutex() 
{ 
    (void)pthread_mutex_lock(&g_Mutex); 
} 

/* Release the global counter mutex g_Mutex; the return code is not checked. */
void UnlockMutex() 
{ 
    (void)pthread_mutex_unlock(&g_Mutex); 
} 

void Start(struct timespec *pT) 
{ 
    int cActive = __sync_add_and_fetch(&g_Active, 1); 
    while(!g_fGo) {} 
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, pT); 
} 

uint64_t End(struct timespec *pT) 
{ 
    struct timespec T; 
    int cActive = __sync_sub_and_fetch(&g_Active, 1); 
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &T); 
    return NANOSEC(T) - NANOSEC(*pT); 
} 
/*
 * Small floating-point workload executed between counter updates.
 *
 * Adds z to *x, divides the sum by 27.6 and, when adding 0.5 moves the value
 * into the next whole number, adds a further 0.7. The result is written back
 * to *x.
 */
void Work(double *x, double z) 
{ 
    double v = *x + z; 

    v /= 27.6; 
    if ((uint64_t)(v + .5) != (uint64_t)v) 
        v += .7; 
    *x = v; 
} 
/*
 * Worker for the mutex variant of the benchmark.
 *
 * Waits at the start barrier, then increments g_Counter g_Loops times with
 * g_Mutex held around each increment. When g_fSlow is non-zero the counter
 * is also read back under the lock, and when it is 2 Work() is called before
 * each increment.
 *
 * arg: points to a uint64_t that receives this thread's elapsed CPU time in
 *      nanoseconds.
 * Returns x truncated to int and converted to a pointer; the visible caller
 * ignores it.
 */
void* ThreadFuncMutex(void* arg) 
{ 
    struct timespec T; 
    uint64_t counter = g_Loops; 
    double x = 0, z = 0; 
    int fSlow = g_fSlow; 

    Start(&T); 
    if (!fSlow) { 
        while(counter--) { 
            LockMutex(); 
            ++g_Counter; 
            UnlockMutex(); 
        } 
    } else { 
        while(counter--) { 
            if (fSlow==2) Work(&x, z); 
            LockMutex(); 
            ++g_Counter; 
            z = g_Counter; 
            UnlockMutex(); 
        } 
    } 
    *(uint64_t*)arg = End(&T); 
    /* Go through intptr_t so the integer-to-pointer cast is size-correct. */
    return (void*)(intptr_t)(int)x; 
} 

/*
 * Worker for the atomic variant of the benchmark.
 *
 * Waits at the start barrier, then increments g_Counter g_Loops times with
 * __sync_add_and_fetch. When g_fSlow is non-zero the updated value is kept
 * in z, and when it is 2 Work() is called before each increment.
 *
 * arg: points to a uint64_t that receives this thread's elapsed CPU time in
 *      nanoseconds.
 * Returns x truncated to int and converted to a pointer; the visible caller
 * ignores it.
 */
void* ThreadFuncAtomic(void* arg) 
{ 
    struct timespec T; 
    uint64_t counter = g_Loops; 
    double x = 0, z = 0; 
    int fSlow = g_fSlow; 

    Start(&T); 
    if (!fSlow) { 
        while(counter--) { 
            __sync_add_and_fetch(&g_Counter, 1); 
        } 
    } else { 
        while(counter--) { 
            if (fSlow==2) Work(&x, z); 
            z = __sync_add_and_fetch(&g_Counter, 1); 
        } 
    } 
    *(uint64_t*)arg = End(&T); 
    /* Go through intptr_t so the integer-to-pointer cast is size-correct. */
    return (void*)(intptr_t)(int)x; 
} 


int main(int argc, char** argv) 
{ 
    int i; 
    int bMutexRun = strchr(argv[1], 'm') != NULL; 
    pthread_t thr[1000]; 
    uint64_t aT[1000]; 
    g_Threads = atoi(argv[1]); 
    g_fSlow = (strchr(argv[1], 's') != NULL) ? 1 : ((strchr(argv[1], 'S') != NULL) ? 2 : 0); 

    // start threads 
    pthread_mutex_init(&g_Mutex, 0); 
    for (i=0 ; i<g_Threads ; ++i) 
     pthread_create(&thr[i], 0, (bMutexRun) ? ThreadFuncMutex : ThreadFuncAtomic, &aT[i]); 

    // wait 
    while (g_Active != g_Threads) {} 
    g_fGo = 1; 
    while (g_Active != 0) {} 

    uint64_t nTot = 0; 
    for (i=0 ; i<g_Threads ; ++i) 
    { 
     pthread_join(thr[i], NULL); 
     nTot += aT[i]; 
    } 
    // done 
    printf("%s %s g_Counter=%llu %2.4lf s\n", (bMutexRun) ? "MUTEX " : "ATOMIC", 
    (g_fSlow == 2) ? "SLOW" : ((g_fSlow == 1) ? "slow" : "FAST"), g_Counter, (double)nTot/1e9); 
}

來源

2012-04-28 04:37:13 johnnycrash

Solaris 中的 Pthread 互斥鎖 VS 原子操作

回答

相關問題