2011-05-08 32 views
6

除了在文本內容字段上使用 tf-idf 相似度的標準詞項搜索外,我還希望基於數字字段的「相似性」進行評分。這種相似性取決於查詢中的值與文檔中的值之間的距離(例如,均值 m = [用戶輸入]、標準差 s = 0.5 的高斯函數)。

Lucene 對數字字段的自定義評分

也就是說,假設每個文檔代表一個人,並且每個文檔具有兩個字段:

  • 說明(全文)
  • 年齡(數字)。

我想用類似下面的查詢來查找文檔:

說明:(XYZ)年齡:30

但年齡不是作爲過濾條件,而是作爲得分的一部分(例如,30 歲的人的乘數爲 1.0,25 歲的人的乘數爲 0.8,依此類推)。

這可以通過合理的方式來實現嗎?

編輯:最後我發現這可以通過包裝ValueSourceQuery和TermQuery與CustomScoreQuery來完成。請參閱下面的解決方案

編輯2:隨着Lucene的快速變化版本,我只想補充說它已經在Lucene 3.0(Java)上測試過了。

回答

8

好了,下面是一個(略顯冗長的)概念驗證,以一個完整的 JUnit 測試的形式實現。我還沒有在大型索引上測試過它的效率,但根據我讀到的資料,只要有足夠的 RAM 用於緩存數字字段,預熱之後它應該表現良好。

package tests; 

    import org.apache.lucene.analysis.Analyzer; 
    import org.apache.lucene.analysis.WhitespaceAnalyzer; 
    import org.apache.lucene.document.Document; 
    import org.apache.lucene.document.Field; 
    import org.apache.lucene.document.NumericField; 
    import org.apache.lucene.index.IndexWriter; 
    import org.apache.lucene.queryParser.QueryParser; 
    import org.apache.lucene.search.IndexSearcher; 
    import org.apache.lucene.search.Query; 
    import org.apache.lucene.search.ScoreDoc; 
    import org.apache.lucene.search.TopDocs; 
    import org.apache.lucene.search.function.CustomScoreQuery; 
    import org.apache.lucene.search.function.IntFieldSource; 
    import org.apache.lucene.search.function.ValueSourceQuery; 
    import org.apache.lucene.store.Directory; 
    import org.apache.lucene.store.RAMDirectory; 
    import org.apache.lucene.util.Version; 

    import junit.framework.TestCase; 

    /**
     * Proof-of-concept test showing how a full-text relevance score can be combined
     * with a Gaussian "closeness" score computed from a numeric field.
     *
     * <p>Each indexed document has a text field ({@code content}) and a numeric
     * year-of-birth field ({@code dob}). The final score of a hit is the content
     * score multiplied by a Gaussian weight centred on the requested age.
     */
    public class AgeAndContentScoreQueryTest extends TestCase
    {
        /** Year against which a person's age is derived from the indexed year of birth. */
        private static final int REFERENCE_YEAR = 2011;

        /**
         * Query that multiplies the sub-query score by a Gaussian-shaped weight
         * {@code exp(-(age - peakX)^2 / (2 * sigma^2))}, where {@code age} is derived
         * from the value delivered by the value-source query.
         *
         * <p>Declared {@code static} because it does not use any state of the
         * enclosing test instance.
         */
        public static class AgeAndContentScoreQuery extends CustomScoreQuery
        {
            /** Age at which the age weight reaches its maximum of 1.0. */
            protected float peakX;
            /** Standard deviation of the Gaussian; larger values give a wider, more tolerant curve. */
            protected float sigma;

            /**
             * @param subQuery    query producing the content score
             * @param valSrcQuery query producing the year-of-birth value for each document
             * @param peakX       age for which the age relevance is best
             * @param sigma       standard deviation of the Gaussian; must be positive
             * @throws IllegalArgumentException if {@code sigma} is not a positive number
             */
            public AgeAndContentScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery, float peakX, float sigma) {
                super(subQuery, valSrcQuery);
                // The negated comparison also rejects NaN.
                if (!(sigma > 0f)) {
                    throw new IllegalArgumentException("sigma must be positive, got " + sigma);
                }
                this.setStrict(true); // do not normalize score values from ValueSourceQuery!
                this.peakX = peakX;
                this.sigma = sigma;
            }

            /**
             * Combines the content score with the age weight.
             *
             * @param doc           document id
             * @param subQueryScore score of the content query for this document
             * @param valSrcScore   year of birth of this document, delivered as a float
             * @return {@code subQueryScore} multiplied by the Gaussian age weight
             */
            @Override
            public float customScore(int doc, float subQueryScore, float valSrcScore) {
                // subQueryScore is the tf-idf score from the content query
                float contentScore = subQueryScore;

                // valSrcScore is the year of birth; convert it to an age first
                float age = REFERENCE_YEAR - valSrcScore;
                float distance = age - peakX;

                // Gaussian weight. The denominator must be parenthesised: without the
                // parentheses the exponent is multiplied by sigma^2 instead of divided by it.
                float ageScore = (float) Math.exp(-(distance * distance) / (2.0 * sigma * sigma));

                float finalScore = ageScore * contentScore;

                // Debug output. Note: the "#ageValue" line prints the raw field value
                // (year of birth), not the derived age.
                System.out.println("#contentScore: " + contentScore);
                System.out.println("#ageValue:  " + (int)valSrcScore);
                System.out.println("#ageScore:  " + ageScore);
                System.out.println("#finalScore: " + finalScore);
                System.out.println("+++++++++++++++++");

                return finalScore;
            }
        }

        protected Directory directory;
        protected Analyzer analyzer = new WhitespaceAnalyzer();
        protected String fieldNameContent = "content";
        protected String fieldNameDOB = "dob";

        /** Builds a small in-memory index with three documents (content + year of birth). */
        @Override
        protected void setUp() throws Exception
        {
            directory = new RAMDirectory();

            // indexed documents
            String[] contents = {"foo baz1", "foo baz2 baz3", "baz4"};
            int[] dobs = {1991, 1981, 1987}; // year of birth

            IndexWriter writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                for (int i = 0; i < contents.length; i++)
                {
                    Document doc = new Document();
                    doc.add(new Field(fieldNameContent, contents[i], Field.Store.YES, Field.Index.ANALYZED)); // store & index
                    doc.add(new NumericField(fieldNameDOB, Field.Store.YES, true).setIntValue(dobs[i]));  // store & index
                    writer.addDocument(doc);
                }
            }
            finally
            {
                writer.close();
            }
        }

        /** Runs the combined content/age query and prints every hit with its score. */
        public void testSearch() throws Exception
        {
            String inputTextQuery = "foo bar";
            float peak = 27.0f;
            // Standard deviation in years. With the corrected Gaussian this yields the
            // same age weights that the original run produced (1 / (2 * 10^2) == 0.1^2 / 2).
            float sigma = 10.0f;

            QueryParser parser = new QueryParser(Version.LUCENE_30, fieldNameContent, analyzer);
            Query contentQuery = parser.parse(inputTextQuery);

            ValueSourceQuery dobQuery = new ValueSourceQuery(new IntFieldSource(fieldNameDOB));
            // or: FieldScoreQuery dobQuery = new FieldScoreQuery(fieldNameDOB,Type.INT);

            CustomScoreQuery finalQuery = new AgeAndContentScoreQuery(contentQuery, dobQuery, peak, sigma);

            IndexSearcher searcher = new IndexSearcher(directory);
            try
            {
                TopDocs docs = searcher.search(finalQuery, 10);

                // Only the two documents containing "foo" match the content query.
                assertEquals(2, docs.totalHits);

                System.out.println("\nDocuments found:\n");
                for (ScoreDoc match : docs.scoreDocs)
                {
                    Document d = searcher.doc(match.doc);
                    System.out.println("CONTENT: " + d.get(fieldNameContent));
                    System.out.println("D.O.B.: " + d.get(fieldNameDOB));
                    System.out.println("SCORE: " + match.score);
                    System.out.println("-----------------");
                }
            }
            finally
            {
                searcher.close(); // release the underlying index reader
            }
        }
    }
+0

這可以推廣到任意數量的 `ValueSourceQuery`,因爲 CustomScoreQuery 有一個接受可變參數的構造函數。需要覆蓋的評分方法是 `public float customScore(int doc, float subQueryScore, float[] valSrcScores)`。 – 2011-05-09 15:49:49