Lucene Highlighter with stemmed analyzer

I'm using Lucene's Highlighter class to highlight fragments that match the search results, and it works well. I'd like to switch from searching with the StandardAnalyzer to the EnglishAnalyzer, which performs stemming of terms.
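
To show what I mean by stemming, here's a small sketch (not part of my actual code; the class and field names are just for illustration) that prints the tokens the EnglishAnalyzer produces for the first sentence below, where "goats" comes out as the stemmed token "goat":

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class StemmingDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_35);
        // Analyze a sample sentence and print each token after stemming.
        TokenStream stream = analyzer.tokenStream("normal", new StringReader("Everyone likes goats."));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // prints roughly: everyon, like, goat
        }
        stream.close();
    }
}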

The search results are fine, but now the highlighter doesn't always find matches. Here's an example of what I'm searching:

document field text 1: Everyone likes goats. 

document field text 2: I have a goat that eats everything. 

Using the EnglishAnalyzer and searching for "goat", both documents match, but the highlighter is only able to find a matching fragment in document 2. Is there a way to have the highlighter return fragments for both documents?

I realize the characters of the tokens differ, but the same (stemmed) token is still present in both, so it seems reasonable for the highlighter to mark wherever that token occurs.

In case it helps, this is with Lucene 3.5.
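
For reference, my highlighting code is shaped roughly like the sketch below; the field name, tags, query string, and wrapper class are illustrative, not my exact code:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;

public class HighlightSketch {
    // Highlights one document's stored field text against the query "goat",
    // re-analyzing the stored text with the same EnglishAnalyzer used for searching.
    static String bestFragment(String storedText)
            throws IOException, ParseException, InvalidTokenOffsetsException {
        Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_35);
        Query query = new QueryParser(Version.LUCENE_35, "text", analyzer).parse("goat");

        Highlighter highlighter =
                new Highlighter(new SimpleHTMLFormatter("[", "]"), new QueryScorer(query, "text"));
        TokenStream tokens = analyzer.tokenStream("text", new StringReader(storedText));
        // In my setup this sometimes comes back empty for a document
        // that did match the query (e.g. "Everyone likes goats.").
        return highlighter.getBestFragment(tokens, storedText);
    }
}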

Answer

I found a way to solve this. I switched from the Highlighter class to the FastVectorHighlighter. It looks like I'll also pick up some speed improvements (at the cost of storing term vector data). For the benefit of anyone who comes across this question later, here's a unit test that shows how it all works together:

package com.sample.index; 

import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.en.EnglishAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.index.IndexReader; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriterConfig; 
import org.apache.lucene.queryParser.ParseException; 
import org.apache.lucene.queryParser.QueryParser; 
import org.apache.lucene.search.IndexSearcher; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.ScoreDoc; 
import org.apache.lucene.search.TopDocs; 
import org.apache.lucene.search.vectorhighlight.*; 
import org.apache.lucene.store.RAMDirectory; 
import org.apache.lucene.util.Version; 
import org.junit.Before; 
import org.junit.Test; 

import java.io.IOException; 
import java.util.ArrayList; 
import java.util.List; 

import static junit.framework.Assert.assertEquals; 

public class TestIndexStuff { 
    public static final String FIELD_NORMAL = "normal"; 
    public static final String[] PRE_TAGS = new String[]{"["}; 
    public static final String[] POST_TAGS = new String[]{"]"}; 
    private IndexSearcher searcher; 
    private Analyzer analyzer = new EnglishAnalyzer(Version.LUCENE_35); 

    @Before 
    public void init() throws IOException { 
     RAMDirectory idx = new RAMDirectory(); 
     IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, analyzer); 

     IndexWriter writer = new IndexWriter(idx, config); 
     addDocs(writer); 
     writer.close(); 

     searcher = new IndexSearcher(IndexReader.open(idx)); 
    } 

    // Index three small documents; FastVectorHighlighter needs term vectors
    // stored with positions and offsets, hence TermVector.WITH_POSITIONS_OFFSETS.
    private void addDocs(IndexWriter writer) throws IOException {
     for (String text : new String[] { 
       "Pretty much everyone likes goats.", 
       "I have a goat that eats everything.", 
       "goats goats goats goats goats"}) { 
      Document doc = new Document(); 
      doc.add(new Field(FIELD_NORMAL, text, Field.Store.YES, 
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); 
      writer.addDocument(doc); 
     } 
    } 

    // phraseHighlight = true, fieldMatch = true; matched terms are wrapped in [ ].
    private FastVectorHighlighter makeHighlighter() {
     FragListBuilder fragListBuilder = new SimpleFragListBuilder(200); 
     FragmentsBuilder fragmentBuilder = new SimpleFragmentsBuilder(PRE_TAGS, POST_TAGS); 
     return new FastVectorHighlighter(true, true, fragListBuilder, fragmentBuilder); 
    } 

    // Searching for "goat" should produce a highlighted fragment for every matching
    // document, including those that only contain the plural form "goats".
    @Test
    public void highlight() throws ParseException, IOException {
     Query query = new QueryParser(Version.LUCENE_35, FIELD_NORMAL, analyzer) 
        .parse("goat"); 
     FastVectorHighlighter highlighter = makeHighlighter(); 
     FieldQuery fieldQuery = highlighter.getFieldQuery(query); 

     TopDocs topDocs = searcher.search(query, 10); 
     List<String> fragments = new ArrayList<String>(); 
     for (ScoreDoc scoreDoc : topDocs.scoreDocs) { 
      fragments.add(highlighter.getBestFragment(fieldQuery, searcher.getIndexReader(), 
        scoreDoc.doc, FIELD_NORMAL, 10000)); 
     } 

     assertEquals(3, fragments.size()); 
     assertEquals("[goats] [goats] [goats] [goats] [goats]", fragments.get(0).trim()); 
     assertEquals("Pretty much everyone likes [goats].", fragments.get(1).trim()); 
     assertEquals("I have a [goat] that eats everything.", fragments.get(2).trim()); 
    } 
}
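
Note that FastVectorHighlighter only works on fields indexed with term vectors that include positions and offsets (Field.TermVector.WITH_POSITIONS_OFFSETS in addDocs above); that's the extra storage cost mentioned earlier.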