2014-09-05 56 views
1

我正在嘗試使用 Lucene 中的 BM25Similarity 類(link)。網路上能找到的所有示例都是針對較早的實現(link)。懇請指點:應如何修改下面的標準玩具示例,使其在 Lucene 4.9 中使用 BM25 相似度(包括建立索引以及執行搜尋)?

import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.StringField; 
import org.apache.lucene.document.TextField; 
import org.apache.lucene.index.DirectoryReader; 
import org.apache.lucene.index.IndexReader; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriterConfig; 
import org.apache.lucene.queryparser.classic.ParseException; 
import org.apache.lucene.queryparser.classic.QueryParser; 
import org.apache.lucene.search.IndexSearcher; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.ScoreDoc; 
import org.apache.lucene.search.TopScoreDocCollector; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.RAMDirectory; 
import org.apache.lucene.util.Version; 

import java.io.IOException; 

/**
 * Minimal Lucene 4.9 example: indexes four book titles into an in-memory
 * directory, runs one query against the "title" field and prints the hits.
 */
public class HelloLucene {

    /**
     * Builds the index, parses the query and prints the matching documents.
     *
     * @param args optional; {@code args[0]} is the query string, defaulting to "lucene"
     * @throws IOException    if indexing or searching fails
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        // Specify the analyzer for tokenizing text.
        // The same analyzer should be used for indexing and searching.
        // try-with-resources releases the analyzer and the directory even when
        // indexing, parsing or searching throws.
        try (StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
             Directory index = new RAMDirectory()) {

            // Create the index. The writer is closed before the reader is opened,
            // and is closed on the failure path as well.
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
            try (IndexWriter w = new IndexWriter(index, config)) {
                addDoc(w, "Lucene in Action", "193398817");
                addDoc(w, "Lucene for Dummies", "55320055Z");
                addDoc(w, "Managing Gigabytes", "55063554A");
                addDoc(w, "The Art of Computer Science", "9900333X");
            }

            // Query
            String querystr = args.length > 0 ? args[0] : "lucene";

            // The "title" arg specifies the default field to use
            // when no field is explicitly specified in the query.
            Query q = new QueryParser(Version.LUCENE_4_9, "title", analyzer).parse(querystr);

            // Search
            int hitsPerPage = 10;
            try (IndexReader reader = DirectoryReader.open(index)) {
                IndexSearcher searcher = new IndexSearcher(reader);
                TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
                searcher.search(q, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;

                // Display results
                System.out.println("Found " + hits.length + " hits.");
                for (int i = 0; i < hits.length; ++i) {
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
                }
            }
        }
    }

    /**
     * Adds one document with a "title" and an "isbn" field to the writer.
     *
     * @param w     writer that receives the document
     * @param title value stored in the "title" {@code TextField}
     * @param isbn  value stored in the "isbn" {@code StringField}
     * @throws IOException if the writer fails to add the document
     */
    private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
        Document doc = new Document();
        doc.add(new TextField("title", title, Field.Store.YES));

        // use a string field for isbn because we don't want it tokenized
        doc.add(new StringField("isbn", isbn, Field.Store.YES));
        w.addDocument(doc);
    }
}

回答

6

你只需要在 IndexSearcher 上設置相似度(Similarity):

// BM25Similarity's constructor takes float k1 and float b, so the literals
// need the 'f' suffix; plain double literals do not compile.
searcher.setSimilarity(new BM25Similarity(1.2f, 0.75f));

同時也要在 IndexWriterConfig 上設置相同的相似度(需在建立 IndexWriter 之前):

// BM25Similarity's constructor takes float k1 and float b, so the literals
// need the 'f' suffix; plain double literals do not compile.
config.setSimilarity(new BM25Similarity(1.2f, 0.75f));