2013-05-06 32 views
2

我有一個帶有 title 字段的 PostgreSQL 表,但這些標題的開頭通常帶有「The」或「An」。我需要一種按字母順序排序這些記錄的方法,就像圖書館在排序時會忽略這些冠詞一樣。如何按照標題的字母順序排序(忽略 The、An 等)並使用索引?

兩個問題

  1. 用 SQL 編寫這個 ORDER BY 表達式的最佳方式是什麼?

  2. 如何在標題字段上構建並使用適當的索引,而不必把標題值的子字符串複製到另一個「alphabetical_title」字段、再對該字段建立索引?

我正在尋找適合PostgreSQL的解決方案。謝謝。

回答

3

你可以在表達式上建立索引:

-- Expression index on the computed sort key natural_sort(title).
CREATE INDEX ON yourtable (natural_sort(title));

Postgres 會在適當的時候使用該索引,而且在使用索引時不會實際計算 natural_sort(title)——除非你在 SELECT 中選取了它。

話雖如此(與 tsvector 字段的情況非常相似),如果你把預先計算好的結果實際存儲下來,性能會更好。在上面的例子中,如果 Postgres 決定不使用該索引,就必須爲所考慮的每一行實際計算該表達式,這會大幅拖慢查詢。

無論如何,別忘了處理數字:

http://www.codinghorror.com/blog/2007/12/sorting-for-humans-natural-sort-order.html


這裏有兩個函數,可以幫助你開始實現自然排序:

/**
 * Derives a sort key from a string so that plain text ordering of the keys
 * approximates natural ("human") ordering of the original strings.
 *
 * @param text $1 The raw input string.
 * @return text The key: unaccented, lower-cased, with runs of unsupported
 *              characters collapsed to a single space and every number
 *              expanded and zero-padded.
 */
CREATE OR REPLACE FUNCTION natsort(text)
    RETURNS text
    LANGUAGE plpgsql
    IMMUTABLE
    STRICT
    COST 1
AS $$
DECLARE
    -- Target length handed to lpad()/rpad() for the integer and fractional
    -- parts of a number (15 = maximum precision for PostgreSQL floats).
    _width CONSTANT int := 15;
    _key text;
BEGIN
    -- Empty or whitespace-only input has an empty key
    IF trim($1) = '' THEN
        RETURN '';
    END IF;

    -- Normalise in one pass: strip accents, lower the case, turn each run of
    -- unsupported characters into a single space, then trim the ends
    _key := trim(regexp_replace(
        lower(unaccent($1)),
        E'[^a-z0-9$¢£¥₤€@&%\\(\\)\\[\\]\\{\\}_:;,\\.\\?!\\+\\-]+',
        ' ',
        'g'
    ));

    -- @todo leading articles/prepositions ('a', 'the') would ideally be
    --  stripped here, but to_tsvector()'s default dictionary also strips
    --  stop words (e.g. 'all').

    -- Without digits the normalised string already is the key
    IF NOT (_key ~ '[0-9]') THEN
        RETURN _key;
    END IF;

    -- Isolate every number (integer or decimal, optional exponent) between
    -- spaces so that splitting on whitespace yields it as its own piece
    _key := regexp_replace(
        _key,
        E'((?:[0-9]+|[0-9]*\\.[0-9]+)(?:e[+-]?[0-9]+\\M)?)',
        E' \\1 ',
        'g'
    );

    -- Rebuild the key piece by piece, zero-padding the numeric pieces
    RETURN array_to_string(ARRAY(
        SELECT CASE
            WHEN piece ~ E'^\\.?[0-9]'
            -- A number: expand it through numeric, left-pad the integer
            -- part and right-pad the fractional part (either may be absent)
            THEN COALESCE(lpad(substring(piece::numeric::text from '^[0-9]+'), _width, '0'), '') ||
                 COALESCE(rpad(substring(piece::numeric::text from E'\\.[0-9]+'), _width, '0'), '')
            -- Anything else is kept verbatim
            ELSE piece
            END
        FROM regexp_split_to_table(_key, E'\\s+') AS piece
        WHERE piece <> ''
    ), ' ');
END;
$$;

COMMENT ON FUNCTION natsort(text) IS 
'Rewrites a string so it can be used in natural sorting. 

It''s by no means bullet proof, but it works properly for positive integers, 
reasonably well for positive floats, and it''s fast enough to be used in a 
trigger that populates an indexed column, or in an index directly.'; 

/**
 * Returns the natural-sort key of the first usable entry in a list.
 *
 * @param text[] $1 The candidate values, examined in array order.
 * @return text natsort() of the first candidate whose key is non-empty,
 *              or '' when no candidate qualifies.
 */
CREATE OR REPLACE FUNCTION sort(text[])
    RETURNS text
    LANGUAGE plpgsql
    IMMUTABLE
    STRICT
    COST 1
AS $$
DECLARE
    _candidate text;
    _key text;
BEGIN
    FOREACH _candidate IN ARRAY $1
    LOOP
        _key := natsort(_candidate);

        -- NULL candidates give a NULL key (natsort() is STRICT) and empty
        -- ones give ''; neither passes this test, so both are skipped
        IF _key <> '' THEN
            RETURN _key;
        END IF;
    END LOOP;

    -- No significant candidate found
    RETURN '';
END;
$$;

COMMENT ON FUNCTION sort(text[]) IS 
'Returns natsort() of the first significant input argument.'; 

以下是第一個函數的單元測試中的樣本輸入與輸出:

public function testNatsort() 
{ 
    $this->checkInOut('natsort', array(
     '<NULL>'    => null, 
     ''      => '', 
     'ABCde'     => 'abcde', 
     '12345 12345'   => '000000000', 
     '12345.12345'   => '000000000.123450000000000', 
     '12345e5'    => '0000', 
     '.12345e5'    => '000000000', 
     '1e10'     => '000010000000000', 
     '1.2e20'    => '120000000000000', 
     '-12345e5'    => '- 0000', 
     '-.12345e5'    => '- 000000000', 
     '-1e10'     => '- 000010000000000', 
     '-1.2e20'    => '- 120000000000000', 
     '+-$¢£¥₤€@&%'   => '+-$¢£¥₤€@&%', 
     'ÀÁÂÃÄÅĀĄĂÆ'   => 'aaaaaeaaaaaae', 
     'ÈÉÊËĒĘĚĔĖÐ'   => 'eeeeeeeeee', 
     'ÌÍÎÏĪĨĬĮİIJ'   => 'iiiiiiiiiij', 
     'ÒÓÔÕÖØŌŐŎŒ'   => 'oooooeoooooe', 
     'ÙÚÛÜŪŮŰŬŨŲ'   => 'uuuueuuuuuu', 
     'ÝŶŸ'     => 'yyy', 
     'àáâãäåāąăæ'   => 'aaaaaeaaaaaae', 
     'èéêëēęěĕėð'   => 'eeeeeeeeee', 
     'ìíîïīĩĭįıij'   => 'iiiiiiiiiij', 
     'òóôõöøōőŏœ'   => 'oooooeoooooe', 
     'ùúûüūůűŭũų'   => 'uuuueuuuuuu', 
     'ýÿŷ'     => 'yyy', 
     'ÇĆČĈĊ'     => 'ccccc', 
     'ĎĐ'     => 'dd', 
     'Ƒ'      => 'f', 
     'ĜĞĠĢ'     => 'gggg', 
     'ĤĦ'     => 'hh', 
     'Ĵ'      => 'j', 
     'Ķ'      => 'k', 
     'ŁĽĹĻĿ'     => 'lllll', 
     'ÑŃŇŅŊ'     => 'nnnnn', 
     'ŔŘŖ'     => 'rrr', 
     'ŚŠŞŜȘſ'    => 'sssssss', 
     'ŤŢŦȚÞ'     => 'ttttt', 
     'Ŵ'      => 'w', 
     'ŹŽŻ'     => 'zzz', 
     'çćčĉċ'     => 'ccccc', 
     'ďđ'     => 'dd', 
     'ƒ'      => 'f', 
     'ĝğġģ'     => 'gggg', 
     'ĥħ'     => 'hh', 
     'ĵ'      => 'j', 
     'ĸķ'     => 'kk', 
     'łľĺļŀ'     => 'lllll', 
     'ñńňņʼnŋ'    => 'nnnnnn', 
     'ŕřŗ'     => 'rrr', 
     'śšşŝșß'    => 'sssssss', 
     'ťţŧțþ'     => 'ttttt', 
     'ŵ'      => 'w', 
     'žżź'     => 'zzz', 
     '-_aaa--zzz--'   => '-_aaa--zzz--', 
     '-:àáâ;-žżź--'   => '-:aaa;-zzz--', 
     '-.à$â,-ž%ź--'   => '-.a$a,-z%z--', 
     '--à$â--ž%ź--'   => '--a$a--z%z--', 
     '-$à(â--ž)ź%-'   => '-$a(a--z)z%-', 
     '#-à$â--ž?!ź-'   => '-a$a--z?!z-', 
    )); 
相關問題