
I need to autocomplete phrases. For example, when I search for "dementia in alz", I want to get "dementia in alzheimer", i.e. a phrase match using Edge NGram.

For this I configured an Edge NGram tokenizer. I tried both edge_ngram_analyzer and standard as the analyzer in the query body. Nevertheless, when I try to match a phrase, I can't get any results.

What am I doing wrong?

My query:

{ 
    "query":{ 
    "multi_match":{ 
     "query":"dementia in alz", 
     "type":"phrase", 
     "analyzer":"edge_ngram_analyzer", 
     "fields":["_all"] 
    } 
    } 
} 

My mapping:

... 
"type" : { 
    "_all" : { 
    "analyzer" : "edge_ngram_analyzer", 
    "search_analyzer" : "standard" 
    }, 
    "properties" : { 
    "field" : { 
     "type" : "string", 
     "analyzer" : "edge_ngram_analyzer", 
     "search_analyzer" : "standard" 
    }, 
... 
"settings" : { 
    ... 
    "analysis" : { 
    "filter" : { 
     "stem_possessive_filter" : { 
     "name" : "possessive_english", 
     "type" : "stemmer" 
     } 
    }, 
    "analyzer" : { 
     "edge_ngram_analyzer" : { 
     "filter" : [ "lowercase" ], 
     "tokenizer" : "edge_ngram_tokenizer" 
     } 
    }, 
    "tokenizer" : { 
     "edge_ngram_tokenizer" : { 
     "token_chars" : [ "letter", "digit", "whitespace" ], 
     "min_gram" : "2", 
     "type" : "edgeNGram", 
     "max_gram" : "25" 
     } 
    } 
    } 
    ... 

My documents:

{ 
    "_score": 1.1152233, 
    "_type": "Diagnosis", 
    "_id": "AVZLfHfBE5CzEm8aJ3Xp", 
    "_source": { 
    "@timestamp": "2016-08-02T13:40:48.665Z", 
    "type": "Diagnosis", 
    "Document_ID": "Diagnosis_1400541", 
    "Diagnosis": "F00.0 - Dementia in Alzheimer's disease with early onset", 
    "@version": "1" 
    }, 
    "_index": "carenotes" 
}, 
{ 
    "_score": 1.1152233, 
    "_type": "Diagnosis", 
    "_id": "AVZLfICrE5CzEm8aJ4Dc", 
    "_source": { 
    "@timestamp": "2016-08-02T13:40:51.240Z", 
    "type": "Diagnosis", 
    "Document_ID": "Diagnosis_1424351", 
    "Diagnosis": "F00.1 - Dementia in Alzheimer's disease with late onset", 
    "@version": "1" 
    }, 
    "_index": "carenotes" 
} 

Analysis of the phrase "dementia in alzheimer":

{ 
    "tokens": [ 
    { 
     "end_offset": 2, 
     "token": "de", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 3, 
     "token": "dem", 
     "type": "word", 
     "start_offset": 0, 
     "position": 1 
    }, 
    { 
     "end_offset": 4, 
     "token": "deme", 
     "type": "word", 
     "start_offset": 0, 
     "position": 2 
    }, 
    { 
     "end_offset": 5, 
     "token": "demen", 
     "type": "word", 
     "start_offset": 0, 
     "position": 3 
    }, 
    { 
     "end_offset": 6, 
     "token": "dement", 
     "type": "word", 
     "start_offset": 0, 
     "position": 4 
    }, 
    { 
     "end_offset": 7, 
     "token": "dementi", 
     "type": "word", 
     "start_offset": 0, 
     "position": 5 
    }, 
    { 
     "end_offset": 8, 
     "token": "dementia", 
     "type": "word", 
     "start_offset": 0, 
     "position": 6 
    }, 
    { 
     "end_offset": 9, 
     "token": "dementia ", 
     "type": "word", 
     "start_offset": 0, 
     "position": 7 
    }, 
    { 
     "end_offset": 10, 
     "token": "dementia i", 
     "type": "word", 
     "start_offset": 0, 
     "position": 8 
    }, 
    { 
     "end_offset": 11, 
     "token": "dementia in", 
     "type": "word", 
     "start_offset": 0, 
     "position": 9 
    }, 
    { 
     "end_offset": 12, 
     "token": "dementia in ", 
     "type": "word", 
     "start_offset": 0, 
     "position": 10 
    }, 
    { 
     "end_offset": 13, 
     "token": "dementia in a", 
     "type": "word", 
     "start_offset": 0, 
     "position": 11 
    }, 
    { 
     "end_offset": 14, 
     "token": "dementia in al", 
     "type": "word", 
     "start_offset": 0, 
     "position": 12 
    }, 
    { 
     "end_offset": 15, 
     "token": "dementia in alz", 
     "type": "word", 
     "start_offset": 0, 
     "position": 13 
    }, 
    { 
     "end_offset": 16, 
     "token": "dementia in alzh", 
     "type": "word", 
     "start_offset": 0, 
     "position": 14 
    }, 
    { 
     "end_offset": 17, 
     "token": "dementia in alzhe", 
     "type": "word", 
     "start_offset": 0, 
     "position": 15 
    }, 
    { 
     "end_offset": 18, 
     "token": "dementia in alzhei", 
     "type": "word", 
     "start_offset": 0, 
     "position": 16 
    }, 
    { 
     "end_offset": 19, 
     "token": "dementia in alzheim", 
     "type": "word", 
     "start_offset": 0, 
     "position": 17 
    }, 
    { 
     "end_offset": 20, 
     "token": "dementia in alzheime", 
     "type": "word", 
     "start_offset": 0, 
     "position": 18 
    }, 
    { 
     "end_offset": 21, 
     "token": "dementia in alzheimer", 
     "type": "word", 
     "start_offset": 0, 
     "position": 19 
    } 
    ] 
} 

Have you tried using query_string instead of multi_match? Please let me know if it solves your problem. –


'query_string' searches the '_all' field by default. So it is the same as what I do with 'multi_match' and '"fields": ["_all"]'. Nevertheless, I tried it, without success. I used the following query: `{'query': {'query_string': {'query': 'dementia in alz', 'phrase_slop': 0}}}` – trex

Answers


Many thanks to rendel who helped me find the right solution!

The solution of Andrei Stefan is not optimal.

Why? First of all, the absence of the lowercase filter in the search analyzer makes search inconvenient; the case has to match exactly. A custom analyzer with the lowercase filter is needed instead of "analyzer": "keyword".
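The case-sensitivity problem can be sketched outside Elasticsearch. This is a minimal illustration (the helper names are hypothetical, not ES API): index-side tokens are lowercased by edge_ngram_analyzer, so a search analyzer that keeps the original case can never match them.

```python
# Minimal sketch of why the search analyzer must lowercase.
# index_analyze mimics the index side (lowercase filter; n-grams
# omitted for brevity); the two search analyzers differ only in
# whether they apply a lowercase filter.

def index_analyze(text):
    # index side: lowercase, then split into words
    return text.lower().split()

def search_analyze_keyword(text):
    # "keyword"-style search analyzer: no lowercasing
    return text.split()

def search_analyze_custom(text):
    # custom search analyzer with a lowercase filter
    return text.lower().split()

indexed = index_analyze("Dementia in Alzheimer's disease")
print(all(t in indexed for t in search_analyze_keyword("Dementia in")))  # False
print(all(t in indexed for t in search_analyze_custom("Dementia in")))   # True
```

With the lowercase filter on both sides, "Dementia", "dementia" and "DEMENTIA" all resolve to the same token.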

Second, the analysis part is wrong! During index time, the string "F00.0 - Dementia in Alzheimer's disease with early onset" is analyzed by edge_ngram_analyzer. With that analyzer, we get the following list of tokens for the analyzed string:

{ 
    "tokens": [ 
    { 
     "end_offset": 2, 
     "token": "f0", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 3, 
     "token": "f00", 
     "type": "word", 
     "start_offset": 0, 
     "position": 1 
    }, 
    { 
     "end_offset": 6, 
     "token": "0 ", 
     "type": "word", 
     "start_offset": 4, 
     "position": 2 
    }, 
    { 
     "end_offset": 9, 
     "token": " ", 
     "type": "word", 
     "start_offset": 7, 
     "position": 3 
    }, 
    { 
     "end_offset": 10, 
     "token": " d", 
     "type": "word", 
     "start_offset": 7, 
     "position": 4 
    }, 
    { 
     "end_offset": 11, 
     "token": " de", 
     "type": "word", 
     "start_offset": 7, 
     "position": 5 
    }, 
    { 
     "end_offset": 12, 
     "token": " dem", 
     "type": "word", 
     "start_offset": 7, 
     "position": 6 
    }, 
    { 
     "end_offset": 13, 
     "token": " deme", 
     "type": "word", 
     "start_offset": 7, 
     "position": 7 
    }, 
    { 
     "end_offset": 14, 
     "token": " demen", 
     "type": "word", 
     "start_offset": 7, 
     "position": 8 
    }, 
    { 
     "end_offset": 15, 
     "token": " dement", 
     "type": "word", 
     "start_offset": 7, 
     "position": 9 
    }, 
    { 
     "end_offset": 16, 
     "token": " dementi", 
     "type": "word", 
     "start_offset": 7, 
     "position": 10 
    }, 
    { 
     "end_offset": 17, 
     "token": " dementia", 
     "type": "word", 
     "start_offset": 7, 
     "position": 11 
    }, 
    { 
     "end_offset": 18, 
     "token": " dementia ", 
     "type": "word", 
     "start_offset": 7, 
     "position": 12 
    }, 
    { 
     "end_offset": 19, 
     "token": " dementia i", 
     "type": "word", 
     "start_offset": 7, 
     "position": 13 
    }, 
    { 
     "end_offset": 20, 
     "token": " dementia in", 
     "type": "word", 
     "start_offset": 7, 
     "position": 14 
    }, 
    { 
     "end_offset": 21, 
     "token": " dementia in ", 
     "type": "word", 
     "start_offset": 7, 
     "position": 15 
    }, 
    { 
     "end_offset": 22, 
     "token": " dementia in a", 
     "type": "word", 
     "start_offset": 7, 
     "position": 16 
    }, 
    { 
     "end_offset": 23, 
     "token": " dementia in al", 
     "type": "word", 
     "start_offset": 7, 
     "position": 17 
    }, 
    { 
     "end_offset": 24, 
     "token": " dementia in alz", 
     "type": "word", 
     "start_offset": 7, 
     "position": 18 
    }, 
    { 
     "end_offset": 25, 
     "token": " dementia in alzh", 
     "type": "word", 
     "start_offset": 7, 
     "position": 19 
    }, 
    { 
     "end_offset": 26, 
     "token": " dementia in alzhe", 
     "type": "word", 
     "start_offset": 7, 
     "position": 20 
    }, 
    { 
     "end_offset": 27, 
     "token": " dementia in alzhei", 
     "type": "word", 
     "start_offset": 7, 
     "position": 21 
    }, 
    { 
     "end_offset": 28, 
     "token": " dementia in alzheim", 
     "type": "word", 
     "start_offset": 7, 
     "position": 22 
    }, 
    { 
     "end_offset": 29, 
     "token": " dementia in alzheime", 
     "type": "word", 
     "start_offset": 7, 
     "position": 23 
    }, 
    { 
     "end_offset": 30, 
     "token": " dementia in alzheimer", 
     "type": "word", 
     "start_offset": 7, 
     "position": 24 
    }, 
    { 
     "end_offset": 33, 
     "token": "s ", 
     "type": "word", 
     "start_offset": 31, 
     "position": 25 
    }, 
    { 
     "end_offset": 34, 
     "token": "s d", 
     "type": "word", 
     "start_offset": 31, 
     "position": 26 
    }, 
    { 
     "end_offset": 35, 
     "token": "s di", 
     "type": "word", 
     "start_offset": 31, 
     "position": 27 
    }, 
    { 
     "end_offset": 36, 
     "token": "s dis", 
     "type": "word", 
     "start_offset": 31, 
     "position": 28 
    }, 
    { 
     "end_offset": 37, 
     "token": "s dise", 
     "type": "word", 
     "start_offset": 31, 
     "position": 29 
    }, 
    { 
     "end_offset": 38, 
     "token": "s disea", 
     "type": "word", 
     "start_offset": 31, 
     "position": 30 
    }, 
    { 
     "end_offset": 39, 
     "token": "s diseas", 
     "type": "word", 
     "start_offset": 31, 
     "position": 31 
    }, 
    { 
     "end_offset": 40, 
     "token": "s disease", 
     "type": "word", 
     "start_offset": 31, 
     "position": 32 
    }, 
    { 
     "end_offset": 41, 
     "token": "s disease ", 
     "type": "word", 
     "start_offset": 31, 
     "position": 33 
    }, 
    { 
     "end_offset": 42, 
     "token": "s disease w", 
     "type": "word", 
     "start_offset": 31, 
     "position": 34 
    }, 
    { 
     "end_offset": 43, 
     "token": "s disease wi", 
     "type": "word", 
     "start_offset": 31, 
     "position": 35 
    }, 
    { 
     "end_offset": 44, 
     "token": "s disease wit", 
     "type": "word", 
     "start_offset": 31, 
     "position": 36 
    }, 
    { 
     "end_offset": 45, 
     "token": "s disease with", 
     "type": "word", 
     "start_offset": 31, 
     "position": 37 
    }, 
    { 
     "end_offset": 46, 
     "token": "s disease with ", 
     "type": "word", 
     "start_offset": 31, 
     "position": 38 
    }, 
    { 
     "end_offset": 47, 
     "token": "s disease with e", 
     "type": "word", 
     "start_offset": 31, 
     "position": 39 
    }, 
    { 
     "end_offset": 48, 
     "token": "s disease with ea", 
     "type": "word", 
     "start_offset": 31, 
     "position": 40 
    }, 
    { 
     "end_offset": 49, 
     "token": "s disease with ear", 
     "type": "word", 
     "start_offset": 31, 
     "position": 41 
    }, 
    { 
     "end_offset": 50, 
     "token": "s disease with earl", 
     "type": "word", 
     "start_offset": 31, 
     "position": 42 
    }, 
    { 
     "end_offset": 51, 
     "token": "s disease with early", 
     "type": "word", 
     "start_offset": 31, 
     "position": 43 
    }, 
    { 
     "end_offset": 52, 
     "token": "s disease with early ", 
     "type": "word", 
     "start_offset": 31, 
     "position": 44 
    }, 
    { 
     "end_offset": 53, 
     "token": "s disease with early o", 
     "type": "word", 
     "start_offset": 31, 
     "position": 45 
    }, 
    { 
     "end_offset": 54, 
     "token": "s disease with early on", 
     "type": "word", 
     "start_offset": 31, 
     "position": 46 
    }, 
    { 
     "end_offset": 55, 
     "token": "s disease with early ons", 
     "type": "word", 
     "start_offset": 31, 
     "position": 47 
    }, 
    { 
     "end_offset": 56, 
     "token": "s disease with early onse", 
     "type": "word", 
     "start_offset": 31, 
     "position": 48 
    } 
    ] 
} 

As you can see, the whole string is tokenized into tokens from 2 to 25 characters in size. The string is tokenized in a linear way, together with all the spaces, and the position is incremented by one for every new token.

There are several problems with it:

  1. The edge_ngram_analyzer produced useless tokens which will never be searched for, e.g.: "0 ", " ", " d", "s d", "s disease w" etc.
  2. Moreover, it did not produce many useful tokens that could be used, e.g.: "disease", "early onset" etc. If you try to search for any of those words, you get 0 results.
  3. Notice that the last token is "s disease with early onse". Where is the final "t"? Because of "max_gram" : "25" we "lost" some text in all fields. You can't search for that text any more, since there are no tokens for it.
  4. The trim filter only obfuscates the problem of filtering the extra spaces, which could be done by the tokenizer.
  5. The edge_ngram_analyzer increments the position of each token, which is problematic for positional queries such as phrase queries. One should use the edge_ngram_filter instead, which preserves the position of the token when generating the n-grams.
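The problems above can be reproduced with a small sketch (assumptions: min_gram=2, max_gram=25, the possessive "'s" handling is simplified; this imitates the behavior, it is not Elasticsearch code):

```python
# Edge n-grams taken over the WHOLE string emit space-laden fragments,
# never emit inner words, and truncate everything past max_gram.

def edge_ngrams(token, min_gram=2, max_gram=25):
    # all prefixes of `token` between min_gram and max_gram chars
    return [token[:n] for n in range(min_gram, min(len(token), max_gram) + 1)]

text = "dementia in alzheimer's disease with early onset"
whole = edge_ngrams(text)  # tokenizer runs on the full string
per_word = [g for w in text.replace("'s", "").split() for g in edge_ngrams(w)]

print("dementia in a" in whole)    # True  - useless space-laden fragment
print("disease" in whole)          # False - a useful word is missing
print(max(len(t) for t in whole))  # 25    - text past max_gram is lost
print("disease" in per_word)       # True  - per-word n-grams keep it
```

This is exactly why the fix below moves the n-gram step from the tokenizer to a token filter applied per word.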

The optimal solution.

Mappings and settings to use:

... 
"mappings": { 
    "Type": { 
     "_all":{ 
      "analyzer": "edge_ngram_analyzer", 
      "search_analyzer": "keyword_analyzer" 
     }, 
     "properties": { 
      "Field": { 
      "search_analyzer": "keyword_analyzer", 
      "type": "string", 
      "analyzer": "edge_ngram_analyzer" 
      }, 
... 
... 
"settings": { 
    "analysis": { 
     "filter": { 
     "english_poss_stemmer": { 
      "type": "stemmer", 
      "name": "possessive_english" 
     }, 
     "edge_ngram": { 
      "type": "edgeNGram", 
      "min_gram": "2", 
      "max_gram": "25", 
      "token_chars": ["letter", "digit"] 
     } 
     }, 
     "analyzer": { 
     "edge_ngram_analyzer": { 
      "filter": ["lowercase", "english_poss_stemmer", "edge_ngram"], 
      "tokenizer": "standard" 
     }, 
     "keyword_analyzer": { 
      "filter": ["lowercase", "english_poss_stemmer"], 
      "tokenizer": "standard" 
     } 
     } 
    } 
} 
... 

Look at the analysis:

{ 
    "tokens": [ 
    { 
     "end_offset": 5, 
     "token": "f0", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 5, 
     "token": "f00", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 5, 
     "token": "f00.", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 5, 
     "token": "f00.0", 
     "type": "word", 
     "start_offset": 0, 
     "position": 0 
    }, 
    { 
     "end_offset": 17, 
     "token": "de", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "dem", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "deme", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "demen", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "dement", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "dementi", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 17, 
     "token": "dementia", 
     "type": "word", 
     "start_offset": 9, 
     "position": 2 
    }, 
    { 
     "end_offset": 20, 
     "token": "in", 
     "type": "word", 
     "start_offset": 18, 
     "position": 3 
    }, 
    { 
     "end_offset": 32, 
     "token": "al", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alz", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzh", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzhe", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzhei", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzheim", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzheime", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 32, 
     "token": "alzheimer", 
     "type": "word", 
     "start_offset": 21, 
     "position": 4 
    }, 
    { 
     "end_offset": 40, 
     "token": "di", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 40, 
     "token": "dis", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 40, 
     "token": "dise", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 40, 
     "token": "disea", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 40, 
     "token": "diseas", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 40, 
     "token": "disease", 
     "type": "word", 
     "start_offset": 33, 
     "position": 5 
    }, 
    { 
     "end_offset": 45, 
     "token": "wi", 
     "type": "word", 
     "start_offset": 41, 
     "position": 6 
    }, 
    { 
     "end_offset": 45, 
     "token": "wit", 
     "type": "word", 
     "start_offset": 41, 
     "position": 6 
    }, 
    { 
     "end_offset": 45, 
     "token": "with", 
     "type": "word", 
     "start_offset": 41, 
     "position": 6 
    }, 
    { 
     "end_offset": 51, 
     "token": "ea", 
     "type": "word", 
     "start_offset": 46, 
     "position": 7 
    }, 
    { 
     "end_offset": 51, 
     "token": "ear", 
     "type": "word", 
     "start_offset": 46, 
     "position": 7 
    }, 
    { 
     "end_offset": 51, 
     "token": "earl", 
     "type": "word", 
     "start_offset": 46, 
     "position": 7 
    }, 
    { 
     "end_offset": 51, 
     "token": "early", 
     "type": "word", 
     "start_offset": 46, 
     "position": 7 
    }, 
    { 
     "end_offset": 57, 
     "token": "on", 
     "type": "word", 
     "start_offset": 52, 
     "position": 8 
    }, 
    { 
     "end_offset": 57, 
     "token": "ons", 
     "type": "word", 
     "start_offset": 52, 
     "position": 8 
    }, 
    { 
     "end_offset": 57, 
     "token": "onse", 
     "type": "word", 
     "start_offset": 52, 
     "position": 8 
    }, 
    { 
     "end_offset": 57, 
     "token": "onset", 
     "type": "word", 
     "start_offset": 52, 
     "position": 8 
    } 
    ] 
} 

During index time, the text is tokenized by the standard tokenizer, then the separate words are filtered by the lowercase, possessive_english and edge_ngram filters. Tokens are produced only from whole words. During search time, the text is tokenized by the standard tokenizer, then the separate words are filtered by lowercase and possessive_english. The searched words are matched against the tokens created during index time.
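The per-word scheme can be sketched outside Elasticsearch like this (a simplified model, assuming min_gram=2, max_gram=25, with naive possessive stripping; note that every n-gram keeps its word's position, which is what edge_ngram_filter does):

```python
# Index side: tokenize per word, lowercase, strip "'s", then edge
# n-gram each word WITHOUT incrementing the position per gram.

def analyze_index(text, min_gram=2, max_gram=25):
    tokens = []
    for pos, word in enumerate(text.lower().replace("'s", "").split()):
        for n in range(min_gram, min(len(word), max_gram) + 1):
            tokens.append((pos, word[:n]))  # same position for all grams
    return tokens

def phrase_match(indexed, query_words):
    # a phrase matches if the query tokens occur at consecutive positions
    positions = {w: {p for p, t in indexed if t == w} for w in query_words}
    return any(all(p + i in positions[w] for i, w in enumerate(query_words))
               for p in positions[query_words[0]])

idx = analyze_index("Dementia in Alzheimer's disease with early onset")
print(phrase_match(idx, ["dem", "in", "alzh"]))  # True
```

Because the grams of a word share its position, the prefixes "dem", "in" and "alzh" occupy consecutive positions 0, 1, 2 and satisfy the phrase query.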

Thus we make incremental search possible!

Now, because we do n-grams on separate words, we can even execute queries like

{ 
    'query': { 
    'multi_match': { 
     'query': 'dem in alzh', 
     'type': 'phrase', 
     'fields': ['_all'] 
    } 
    } 
} 

and get the correct results.

No text is "lost", everything is searchable, and there is no need to handle spaces with the trim filter any more.


I wouldn't have had the time to provide such an elaborate solution, but I appreciate you taking the time to report your findings. At least I was able to help you discover the initial problem. Cheers! –


Thanks a lot @trex, I had the same requirement and just set this up. –


There is a syntax problem with the braces in the mapping; could that be why the solution doesn't work for us? – tina


I believe your query is wrong: while you need nGrams at indexing time, you don't need them at search time. At search time you need the text to be as "fixed" as possible. Try this query instead:

{ 
    "query": { 
    "multi_match": { 
     "query": "  dementia in alz", 
     "analyzer": "keyword", 
     "fields": [ 
     "_all" 
     ] 
    } 
    } 
} 

Notice the two whitespaces before dementia. Those are there from the text and are accounted for by your analyzer. To get rid of them you need the trim token filter:

"edge_ngram_analyzer": { 
     "filter": [ 
     "lowercase","trim" 
     ], 
     "tokenizer": "edge_ngram_tokenizer" 
    } 
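What the trim token filter does can be sketched as follows (a simplified model, not Elasticsearch code): it strips leading and trailing whitespace from each emitted token, leaving positions and offsets unchanged.

```python
# Tokens emitted by the whole-string edge n-gram tokenizer carry
# leading/trailing spaces; trim strips them from each token.
tokens = [" dementia", " dementia in", "0 "]
trimmed = [t.strip() for t in tokens]
print(trimmed)  # ['dementia', 'dementia in', '0']
```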

Then this query would work (no spaces before dementia):

{ 
    "query": { 
    "multi_match": { 
     "query": "dementia in alz", 
     "analyzer": "keyword", 
     "fields": [ 
     "_all" 
     ] 
    } 
    } 
} 

I just tried it, 0 results. – trex


Also, I repeated the test with the 'lowercase' filter, adding the following analyzer to the existing mapping: '"keyword_analyzer": {"filter": ["lowercase"], "tokenizer": "keyword"}'. The query was '{'query': {'multi_match': {'query': 'dementia in alz', 'analyzer': 'keyword_analyzer', 'fields': ['_all']}}}'. No success :-( – trex


I need to see a complete document in order to test the **complete** mapping of the index. Please use github –