2016-08-03 141 views
0

我無法弄清楚如何查找帶有特殊字符的單詞。ElasticSearch和特殊字符

例如,我有兩個文件:

1)我們正在尋找C++和C#開發人員
2)我們正在尋找C語言開發

我只想找到包含C++的那個文件。

創建索引、添加文檔和搜索的代碼:

from elasticsearch import Elasticsearch 
from elasticsearch.helpers import scan 

ELASTIC_SEARCH_NODES = ['http://localhost:9200'] 

INDEX = 'my_index' 
DOC_TYPE = 'material' 


def create_index():
    """Create the ``INDEX`` index and register the ``my_analyzer`` analyzer.

    ``my_analyzer`` is a custom analyzer built from the ``whitespace``
    tokenizer and the ``lowercase`` filter. The response returned by the
    create call is printed.
    """
    # NOTE: only the analyzer is registered here; this body contains no
    # "mappings" section that assigns the analyzer to a field.
    whitespace_lowercase = {
        "type": "custom",
        "filter": ["lowercase"],
        "tokenizer": "whitespace",
    }
    index_body = {
        "settings": {
            "analysis": {
                "analyzer": {"my_analyzer": whitespace_lowercase},
            },
        },
    }

    response = es_client.indices.create(index=INDEX, body=index_body)
    print(response)


def create_doc(body):
    """(Re)create the document ``body`` in ``INDEX`` under ``DOC_TYPE``.

    ``body['docid']`` is used as the document id. If a document with that
    id already exists it is deleted first, then ``body`` is created and the
    response of the create call is printed.
    """
    already_indexed = es_client.exists(INDEX, DOC_TYPE, body['docid'])
    if already_indexed:
        es_client.delete(INDEX, DOC_TYPE, body['docid'])

    response = es_client.create(
        index=INDEX, doc_type=DOC_TYPE, body=body, id=body['docid'])
    print(response)


def find_doc(value):
    """Return a generator of hits from a ``match_phrase`` query on ``text``.

    ``value`` is the phrase to look for; the search is run against
    ``INDEX`` through the ``scan`` helper.
    """
    phrase_query = {"query": {"match_phrase": {"text": value}}}
    return scan(es_client, query=phrase_query, index=INDEX)


if __name__ == '__main__':
    # Module-level client used by create_index / create_doc / find_doc.
    es_client = Elasticsearch(ELASTIC_SEARCH_NODES, verify_certs=True)

    # Setup step, left disabled: create the index.
    # create_index()
    doc1 = {"docid": 1, 'text': u"We are looking for C developers"}
    doc2 = {"docid": 2, 'text': u"We are looking for C++ and C# developers"}

    # Setup step, left disabled: index the two sample documents.
    # create_doc(doc1)
    # create_doc(doc2)

    matches = find_doc("C++")
    for hit in matches:
        print(hit)

搜索結果(即使我把加號轉義,寫成"C\+\+",結果也是一樣的):

{u'_score': 0.0, u'_type': u'material', u'_id': u'2', u'_source': {u'text': u'We are looking for C++ and C# developers', u'docid': 2}, u'_index': u'my_index'} 
{u'_score': 0.0, u'_type': u'material', u'_id': u'1', u'_source': {u'text': u'We are looking for C developers', u'docid': 1}, u'_index': u'my_index'} 

看來,得到這樣的結果是因爲在切分詞元(token)時,+和#這類符號沒有被索引,所以實際上它查找的是包含符號C的文件:

# Match every document of type "material" and return, as a script field named "terms", the values of doc["text"].
curl 'http://localhost:9200/my_index/material/_search?pretty=true' -d '{ 
    "query" : { 
     "match_all" : { } 
    }, 
    "script_fields": { 
     "terms" : { 
      "script": "doc[field].values", 
      "params": { 
       "field": "text" 
      } 
     } 
    } 
}' 

結果:

{ 
    "took" : 3, 
    "timed_out" : false, 
    "_shards" : { 
    "total" : 5, 
    "successful" : 5, 
    "failed" : 0 
    }, 
    "hits" : { 
    "total" : 2, 
    "max_score" : 1.0, 
    "hits" : [ { 
     "_index" : "my_index", 
     "_type" : "material", 
     "_id" : "2", 
     "_score" : 1.0, 
     "fields" : { 
     "terms" : [ "and", "are", "c", "developers", "for", "looking", "we" ] 
     } 
    }, { 
     "_index" : "my_index", 
     "_type" : "material", 
     "_id" : "1", 
     "_score" : 1.0, 
     "fields" : { 
     "terms" : [ "are", "c", "developers", "for", "looking", "we" ] 
     } 
    }] 
    } 
} 

這該如何解決?與此相關的第二個問題:是否可以只搜索非字母數字字符,例如%或+?

P.S. 我使用的是Elasticsearch 2.3.2,Python客戶端爲elasticsearch==2.3.0。

+0

我的意思是,這其實有點顯而易見:是的,索引時++和#被省略掉了,所以之後搜索時當然兩個文檔都會被找到。我不確定自己是否完全看懂了這段Python代碼,但看起來你沒有把分析器設置到你的字段上 – Mysterion

回答

0

謝謝Andrew,我解決了這個問題。問題在於索引時使用的是標準分析器,而不是my_analyzer,原因是我忘了設置映射(mapping)。正確的版本:

# Index body: registers the custom analyzer under "settings" and, under
# "mappings", assigns it to the "text" field of the "material" type.
data = { 
    "settings": { 
     "analysis": { 
     "analyzer": { 
      # Custom analyzer: "whitespace" tokenizer plus "lowercase" filter.
      "my_analyzer": { 
       "type": "custom", 
       "filter": [ 
        "lowercase" 
       ], 
       "tokenizer": "whitespace", 
      } 
     } 
     } 
    }, 
    "mappings": { 
     # Field definitions for documents of type "material".
     "material": { 
      "properties": { 
       "docid": { 
        "type": "integer" 
       }, 
       "text": { 
        "type": "string", 
        # Analyze this field with the custom analyzer defined above.
        "analyzer": "my_analyzer" 
       } 
      } 
     } 
    } 
} 

此外,有必要重新創建索引並重新添加文檔。要搜索特殊字符,我在find_doc函數中改用了query_string查詢(該函數的代碼見下文)。查詢示例:

# Example searches, run in order; every hit of each search is printed.
for pattern in ("*#", u"%", "looking fo*"):
    for hit in find_doc(pattern):
        print(hit)

使用query_string查詢的find_doc函數:

def find_doc(value):
    """Return a generator of hits from a ``query_string`` search on ``text``.

    ``value`` is passed through as the query string. The query is limited
    to the ``text`` field, uses ``AND`` as the default operator and is
    wrapped in a ``filtered`` query; the search is run against ``INDEX``
    through the ``scan`` helper.
    """
    # NOTE(review): ANALYZER is not defined in the visible snippet --
    # presumably it holds the custom analyzer's name ("my_analyzer");
    # confirm before running.
    text_query = {
        "query_string": {
            "query": value,
            "fields": ["text"],
            "analyzer": ANALYZER,
            "default_operator": "AND",
        },
    }
    search_body = {"query": {"filtered": {"query": text_query}}}
    return scan(es_client, query=search_body, index=INDEX)

驗證分析器的請求示例(查看字符串被切分成哪些詞元;另外,現在查詢中可以使用通配符):

# Send the sample sentence to the _analyze endpoint of my_index with analyzer=my_analyzer.
curl -XPOST "http://localhost:9200/my_index/_analyze?analyzer=my_analyzer&pretty=true" -d 'We are looking for C++ and C# developers'