2012-08-31 102 views
3

我正在學習排序算法,並且作爲下一步,我試圖讓我的實現執行接近std::sort()。我很遠,到目前爲止.. :-)快速排序優化

我有快速排序的3個實現:

  • 標準快速排序(使用溫度陣列)。
  • 快速排序與下列優化:用於選擇僅高達分區施加中值
  • 尾遞歸
  • 快速排序
    • median3尺寸< 16.對於較小的分區被用於插入排序。
    • 插入排序一次應用於整個數組,而不是應用於每個分區,左側未通過快速排序排序。
  • 上面列出的所有優化+就地分區(無臨時數組)的快排。

我預計自下而上的表現最好,但最好自頂向下!

我的實現出了什麼問題?鑑於表演之間的巨大差異,我認爲有些事情是完全錯誤的。

一些數字給你的感覺如何壞的東西都是(N =在陣列元件的數目,數字是由每個ALGO在微秒所花費的時間): 排序vector<int>並且每個ALGO給出恰好相同的數字序列。

N   quick  mixed  mixed_inplace 
8   0   0   0 
16   0   1   1 
32   1   2   2 
64   1   3   3 
128   1   8   8 
256   3   16   17 
512   6   34   41 
1,024  16   84   87 
2,048  28.3  177.1  233.2 
4,096  48.5  366.6  410.1 
8,192  146.5  833.5  1,012.6 
16,384  408.4  1,855.6  1,964.2 
32,768  1,343.5  3,895.0  4,241.7 
65,536  2,661.1  7,927.5  8,757.8 

使用Visual Studio Express的2010

CODE:

// ------------ QUICK SORT ------------------ 
template<typename T, typename key_compare> 
void quicksort(T first, T last, const size_t pivot_index, key_compare comp) { 
    T saved_first = first; 
    size_t N = last - first; 
    if (N > 1) { 
     // create a temp new array, which contains all items less than pivot 
     // on left and greater on right. With pivot value in between. 
     // vector<typename decltype(*T)> temp(N); 
     typename iterator_traits<T>::pointer temp = (typename iterator_traits<T>::pointer) malloc(sizeof(T)*N); 
     size_t left_index = 0, right_index = N - 1 ; 
     iterator_traits<T>::value_type pivot_val = *(first + pivot_index); 
     for(; first < saved_first + pivot_index; first++) { 
      if(!comp(*first, pivot_val)) 
       temp[right_index--] = *first; 
      else 
       temp[left_index++] = *first; 
     } 
     // skip the pivot value 
     // TODO: swap the pivot to end so we can have a single loop instead. 
     ++first; 
     // do the rest 
     for(; first < last; first++) { 
      if(!comp(*first, pivot_val)) 
       temp[right_index--] = *first; 
      else 
       temp[left_index++] = *first; 
     } 
     if(right_index == left_index) 
      temp[left_index] = pivot_val; 
     else 
      temp[left_index+1] = pivot_val; 
     // recurse for left and right.. 
     quicksort(temp, temp+left_index, left_index/2, comp); 
     quicksort(temp+left_index+1, temp+N, (N-right_index)/2, comp); 

     // return a concat'd array.. 
     for(size_t i = 0; i < N; i++) 
      *saved_first++ = temp[i]; 

     free(temp); 
    } 
} 
/* 
** best, average, worst: n log n, n log n, n^2 
** space: log n 
*/ 
template<typename T, typename key_compare > 
void quicksort(T first, T last, key_compare comp) { 
    size_t pivot_index = (last - first)/2; 
    quicksort(first, last, pivot_index, comp); 
} 

// ------------ QUICK with optimizations ------------------ 
/* 
quicksort partition on range [first, last[ using predicate function as the comparator. 
"mid" is in-out param, function uses mid as mid, and updates it before returning it with 
current/new mid position after partitioning. 
*/ 
template<typename T, typename key_compare > 
void _partial_quicksort_partition(T first, T last, T& mid, key_compare comp) { 
    T savedFirst = first; 
    typedef typename iterator_traits<T>::value_type _val_type; 
    size_t N = last - first; 
    _val_type *temp = (_val_type *) malloc((N*sizeof(_val_type))); 

    // move pivot to the end.. 
    _val_type pivot_val = *mid; 
    std::swap(*mid, *(last - 1)); 
    size_t left_index = 0, right_index = N - 1; 

    for(; first != last - 1; first++) { 
     if(!comp(*first, pivot_val)) 
      temp[right_index--] = *first; 
     else 
      temp[left_index++] = *first; 
    } 

    assert(right_index == left_index); 

    temp[left_index] = pivot_val; 

    std::copy(temp, temp+N, savedFirst); 
    free(temp); 
    mid = savedFirst + left_index; 
} 

template<typename T, typename key_compare > 
void _partial_quicksort(T first, T last, key_compare comp) { 
    size_t s = last - first; 
    // sort only if the list is smaller than our limit.. else it's too small for 
    // us to bother.. caller would take care of it using some other stupid algo.. 
    if(16 > s) { 
     // only one call to insertion_sort(), after whole array is partially sorted 
     // using quicksort(). 
     // my_insertion_sort::insertion_sort(first, last, pred); 
     return ; 
    } 

    // select pivot.. use median 3 
    T mid = my_mixed_inplace_quicksort::median3(first, last - 1, s, comp); 
    // partition 
    _partial_quicksort_partition(first, last, mid, comp); 
    // recurse.. 
    _partial_quicksort(first, mid, comp); 
    // tail recurse.. 
    // TODO: tail recurse on longer partition.. 
    _partial_quicksort(mid+1, last, comp); 
} 

template<typename T, typename key_compare > 
void mixed_quicksort(T first, T last, key_compare pred) { 
    _partial_quicksort(first, last, pred); 
    my_insertion_sort::insertion_sort(first, last, pred); 
} 

用於測試
// ---------------- INSERTION SORT used above ---------------- 
namespace my_insertion_sort { 
    template<typename T, typename key_compare> 
    void insertion_sort(T first, T last, key_compare comp) { 
     // for each element in the array [first+1, last[ 
     for(T j = first+1; j < last; j++) { 
      iterator_traits<T>::value_type curr = *j; 
      T hole = j; 
      // keep moving all the elements comp(hole.e. > or <) hole to right 
      while(hole > first && comp(curr, *(hole-1))) { 
       *hole = *(hole-1); 
       --hole; 
      } 
      // insert curr at the correct position. 
      *hole = curr; 
     } 
    } 
} 

代碼:

#include <ctime> 
#ifdef _WIN32 
#include <Windows.h> 
#include <WinBase.h> 
#endif // _WIN32 

template<typename T, typename key_compare = std::less<T>> class MySortAlgoTester; 

template <typename T> 
void print_array(T begin, T end, string prefix = "") { 
    cout << prefix.c_str(); 
    for_each(begin, end, [](typename std::iterator_traits<T>::reference it) { cout << it << ','; }); 
    cout << endl; 
} 

int main() { 
    srand(123456789L); 
    size_t numElements = 4; 
    vector<char*> algoNames; 
    map<double, vector<double>> results; 
    int numTests = 0; 
    while((numElements *= 2) <= 4096*16) { 
     MySortAlgoTester<int> tester(numElements); 
     results[numElements] = vector<double>(); 
     algoNames.push_back("mixedsort_inplace"); 
     results[numElements].push_back(tester.test(my_mixed_inplace_quicksort::mixedsort_inplace, "mixedsort_inplace")); 
     tester.reset(); 
     algoNames.push_back("quick"); 
     results[numElements].push_back(tester.test(my_quicksort::quicksort, "quicksort")); 
     tester.reset(); 
     algoNames.push_back("mixed_quicksort"); 
     results[numElements].push_back(tester.test(my_mixed_quicksort::mixed_quicksort, "mixed_quicksort")); 
    } 
    // --- print the results... 
    cout << std::setprecision(2) << std::fixed << endl << "N"; 
    for_each(algoNames.begin(), algoNames.begin()+(algoNames.size()/numTests), [](char* s){cout << ',' << s ;}); 

    typedef std::pair<double,vector<double>> result_iter; 
    BOOST_FOREACH(result_iter it, results) { 
     cout << endl << it.first << ','; 
    BOOST_FOREACH(double d, it.second) { 
     cout << d << ',' ; 
    } 
} 

template<typename T, typename key_compare = std::less<T>> 
class MySortAlgoTester { 
    key_compare comp; 
    vector<T> vec; 
    typedef typename vector<T>::iterator vecIter; 
    vector<T> vec_copy; 
    size_t m_numElements; 
    bool is_sorted(vecIter first, vecIter last) { 
     vecIter sFirst = first; 
     for(vecIter next = first+1; next != last;) 
      // '>' associativity: left to right 
      // ++ has precedence over > 
      if(!comp(*(first++), *(next++))) { 
       if(*(next-1) == *(first-1)) 
        continue; 
       print_array(sFirst, last, "is_sorted() returning false: "); 
       cout << "comp(" << *(first-1) << ", " << *(next-1) << ") = false && " 
        << *(next-1) << " != " << *(first-1) << endl ; 
       return false; 
      } 

      return true; 
    } 

public: 
    MySortAlgoTester(size_t numElements) : m_numElements(numElements) { 
     srand(123456789L); 
     vec.resize(m_numElements); 
     vec_copy.resize(m_numElements); 
     //  std::generate(vec.begin(), vec.end(), rand); 
     for(size_t i = 0; i < vec.size(); i++) { 
      vec[i] = rand() % (m_numElements * 2); 
      vec_copy[i] = vec[i]; 
     } 
    } 
    ~MySortAlgoTester() { 
    } 

    void reset() { 
     // copy the data back so next algo can be tested with same array. 
     std::copy(vec_copy.begin(), vec_copy.end(), vec.begin()); 
     for(size_t i = 0; i < vec_copy.size(); i++) { 
      vec[i] = vec_copy[i]; 
     } 
     // std::copy(vec_copy.begin(), vec_copy.end(), vec); 
    } 

    double m___start_time_asdfsa = 0; 
    double getTimeInMicroSecs() { 
    #ifdef _WIN32 
     LARGE_INTEGER li; 
     if(!QueryPerformanceFrequency(&li)) 
      cout << "getTimeInMicroSecs(): QueryPerformanceFrequency() failed!" << endl; 

     QueryPerformanceCounter(&li); 
     return double(li.QuadPart)/1000.0; 

    #else // _WIN32 
     struct timeval tv; 
     gettimeofday(&tv, NULL); 
     return tv.tv_usec + 10e6 * tv.tv_sec; 
    } 
    #endif // _WIN32 

    inline void printClock(const char* msg) { 
     cout << msg << (long)(getTimeInMicroSecs() - m___start_time_asdfsa) << " micro seconds" << endl; 
    } 
    inline double getClock() { 
     return (getTimeInMicroSecs() - m___start_time_asdfsa); 
    } 
    inline void startClock() { 
     m___start_time_asdfsa = getTimeInMicroSecs(); 
    } 

    double test(void (*sort_func)(typename vector<T>::iterator first, typename vector<T>::iterator last, typename key_compare pred), const char* name) { 
     cout << "START Testing: " << name << ". With --- " << m_numElements << " elements." << endl; 
     startClock(); 

     sort_func(vec.begin(), vec.end(), comp); 
     double ms = getClock(); 
     if(!MySortAlgoTester::is_sorted(vec.begin(), vec.end())) { 
      cout << name << " did not sort the array." << endl; 
      // throw string(name) + " did not sort the array."; 
     } 
     cout << "DONE Testing: " << name << ". Time taken (ms): " << ms << endl; 
     return ms; 
    } 

    double test(void (*sort_func)(typename vector<T>::iterator first, typename vector<T>::iterator last), const char* name) { 
     cout << "START Testing: " << name << ". With --- " << m_numElements << " elements." << endl; 
     startClock(); 

     sort_func(vec.begin(), vec.end()); 
     double ms = getClock(); 
     if(!MySortAlgoTester::is_sorted(vec.begin(), vec.end())) { 
      cout << name << " did not sort the array." << endl; 
      // throw string(name) + " did not sort the array."; 
     } 
     cout << "DONE Testing: " << name << ". Time taken (ms): " << ms << endl; 
     return ms; 
    } 
}; 
+0

好的,等待實際的基準測試者 – sehe

+0

我不知道這是否會有幫助,但請觀看:https://www.youtube.com/watch?v=aMnn0Jq0J-E該視頻名爲'Three Beautiful Quicksorts'。你會喜歡它的(這是一個技術性的討論)。 – Paul

+0

*排序算法* - >即時「我甚至不打算讀你的問題。」嚴重的是,算法*,五個額外的字符,太難以鍵入? – Drise

回答

0

因此,最後我想我至少弄清楚了錯誤的一部分。

感謝sehe的提示。

  • 當與-O3(上GCC或/Ox上MSVC)編譯mixed_inplace是最快和相當接近std::sort()
    • 我想這意味着至少一些預期的優化的(尾遞歸)在較低優化級別編譯時未由編譯器應用。
  • 構建應該是發佈版本(不包括在GCC上的-g)。
  • @sehe:插入排序性能無關緊要。
  • std::sort()在GCC和MSVC上的實現是不同的,所以比較兩者並不是真的。

以下是Windows上的結果,並與Linux和W/O優化選項:

窗戶用MSVC: Windows with MSVC

窗戶用GCC: enter image description here

RedHat Linux上用GCC : RHEL with GCC

+0

儘管實現可能有所不同,但我認爲比較MSVC/GCC上的std :: sort是完全有效的(這好像比較不同CPU之間的性能以比較實現CPUs _for具體任務_) – sehe

+0

@sehe哲學上:是的。 :-) ..但我們正在討論abt實際的實現和內部優化.. ps:我的'mixed_inplace_quicksort'是基於MSVC的實現..現在已經添加了帶有GCC結果的Windows .. – Kashyap

5

你必須在算法本身的錯誤。例如。有未定義的行爲這裏:

插入排序潛在讀出的越界在標記的下面的行:

*(hole--) = *(hole-1); 

它的第一個元素之前讀取。我建議你的意思

*hole = *(hole-1); 
--hole; 

更新迅速在64位的Linux和GNU G ++ 4.6.1基準。我重新安排時間總計,所以我不必重新實現時鐘功能(我很懶)。

改編代碼:http://ideone.com/LgAgs

建有

g++ -std=c++0x -g -O3 test.cpp -o test 

這裏的結果是:插入排序似乎是大致〜60-100x慢比其他人。

START Testing: insertion_sort. With --- 8 elements. 
START Testing: insertion_sort. With --- 16 elements. 
START Testing: insertion_sort. With --- 32 elements. 
START Testing: insertion_sort. With --- 64 elements. 
START Testing: insertion_sort. With --- 128 elements. 
START Testing: insertion_sort. With --- 256 elements. 
START Testing: insertion_sort. With --- 512 elements. 
START Testing: insertion_sort. With --- 1024 elements. 
START Testing: insertion_sort. With --- 2048 elements. 
START Testing: insertion_sort. With --- 4096 elements. 
START Testing: insertion_sort. With --- 8192 elements. 
START Testing: insertion_sort. With --- 16384 elements. 
START Testing: insertion_sort. With --- 32768 elements. 
START Testing: insertion_sort. With --- 65536 elements. 

real 0m1.532s 
user 0m1.524s 
sys 0m0.004s 
START Testing: quicksort. With --- 8 elements. 
START Testing: quicksort. With --- 16 elements. 
START Testing: quicksort. With --- 32 elements. 
START Testing: quicksort. With --- 64 elements. 
START Testing: quicksort. With --- 128 elements. 
START Testing: quicksort. With --- 256 elements. 
START Testing: quicksort. With --- 512 elements. 
START Testing: quicksort. With --- 1024 elements. 
START Testing: quicksort. With --- 2048 elements. 
START Testing: quicksort. With --- 4096 elements. 
START Testing: quicksort. With --- 8192 elements. 
START Testing: quicksort. With --- 16384 elements. 
START Testing: quicksort. With --- 32768 elements. 
START Testing: quicksort. With --- 65536 elements. 

real 0m0.025s 
user 0m0.016s 
sys 0m0.008s 
START Testing: mixed_quicksort. With --- 8 elements. 
START Testing: mixed_quicksort. With --- 16 elements. 
START Testing: mixed_quicksort. With --- 32 elements. 
START Testing: mixed_quicksort. With --- 64 elements. 
START Testing: mixed_quicksort. With --- 128 elements. 
START Testing: mixed_quicksort. With --- 256 elements. 
START Testing: mixed_quicksort. With --- 512 elements. 
START Testing: mixed_quicksort. With --- 1024 elements. 
START Testing: mixed_quicksort. With --- 2048 elements. 
START Testing: mixed_quicksort. With --- 4096 elements. 
START Testing: mixed_quicksort. With --- 8192 elements. 
START Testing: mixed_quicksort. With --- 16384 elements. 
START Testing: mixed_quicksort. With --- 32768 elements. 
START Testing: mixed_quicksort. With --- 65536 elements. 

real 0m0.016s 
user 0m0.004s 
sys 0m0.008s 
+0

排序:矢量,是的,數據「完全相同」。我將用測試代碼更新這篇文章。 – Kashyap

+0

在我的系統中添加了最簡單的時間。希望幫助 – sehe

+0

operator ='的結合性是左對齊的,所以在執行'*(hole - 1)'之後調用'operator --'。並且'*(hole--)= *(hole-1);'只在'hole> first'時執行'。所以它應該工作。我錯過了什麼? – Kashyap