I have a Lucene index that I created with Solr (Lucene version 3.6.1), and I am trying to read it from a standalone Java program.

I found a Java program online that reads a Lucene index:

http://www.javacodegeeks.com/2010/05/introduction-to-apache-lucene-for-full.html

I adapted the program to my local environment, but it always reports zero hits for a query that should produce matches from the index. After having no luck with the original program, I modified the code to use StandardAnalyzer instead of SimpleAnalyzer. Still no luck.

The code is below:

package com.javacodegeeks.lucene;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class StandardSearcher {

    public static void main(String[] args) throws Exception {
        File indexDir = new File("/path/to/solr/data/index/");
        String query = "science";
        int hits = 100;

        StandardSearcher searcher = new StandardSearcher();
        searcher.searchIndex(indexDir, query, hits);
    }

    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

        // Open the index directory that Solr wrote
        Directory directory = FSDirectory.open(indexDir);
        IndexSearcher searcher = new IndexSearcher(directory);

        // Parse the query against the "title" field
        Query query = new QueryParser(Version.LUCENE_36, "title", analyzer)
                .parse(queryStr);

        TopDocs topDocs = searcher.search(query, maxHits);

        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            // Note: the schema posted below has no "filename" field, so this
            // prints null; a stored field such as "title" or "metaDataUrl"
            // would be more useful.
            System.out.println(d.get("filename"));
        }

        System.out.println("Found " + hits.length);
    }
}

What am I doing wrong? Looking through solrconfig.xml, I could not determine which analyzer Solr uses by default, which is why I tried both SimpleAnalyzer and StandardAnalyzer.

Suggestions on how to debug this would be greatly appreciated.

Update: here are the fields in my schema:

<field name="metaDataUrl" type="string" indexed="true" stored="true" required="true"/> 
<field name="title" type="text" stored="true" indexed="true"/> 
<field name="snippet" type="text" indexed="true" stored="true"/> 
<field name="rest" type="string" stored="true" indexed="false" multiValued="true"/> 
<field name="date_indexed" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> 
<field name="all" type="text" stored="false" indexed="true" multiValued="true"/> 

And here is the XML for the "text" field type from schema.xml:

<!-- A text field that uses WordDelimiterFilter to enable splitting and matching of                            
    words on case-change, alpha numeric boundaries, and non-alphanumeric chars,                             
    so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".                            
    Synonyms and stopwords are customized by external files, and stemming is enabled.                           
    --> 
<fieldType name="text" class="solr.TextField" positionIncrementGap="100"> 
    <analyzer type="index"> 
    <tokenizer class="solr.WhitespaceTokenizerFactory"/> 
    <!-- in this example, we will only use synonyms at query time                                
    <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>                     
    --> 
    <!-- Case insensitive stop word removal.                                      
     add enablePositionIncrements=true in both the index and query                                
     analyzers to leave a 'gap' for more accurate phrase queries.                                
    --> 
    <filter class="solr.StopFilterFactory" 
      ignoreCase="true" 
      words="stopwords.txt" 
      enablePositionIncrements="true" 
      /> 
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> 
    <filter class="solr.LowerCaseFilterFactory"/> 
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> 
    </analyzer> 
    <analyzer type="query"> 
    <tokenizer class="solr.WhitespaceTokenizerFactory"/> 
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> 
    <filter class="solr.StopFilterFactory" 
      ignoreCase="true" 
      words="stopwords.txt" 
      enablePositionIncrements="true" 
      /> 
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> 
    <filter class="solr.LowerCaseFilterFactory"/> 
    <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> 
    </analyzer> 
</fieldType> 

Do you have a schema.xml? If so, please post it. –


Confirm that you are using the same analyzer for indexing and querying – naresh


@naresh How can I tell which analyzer was used for indexing? – Sol

Answers


You need to build a custom analyzer from the same tokenizer and filters that were used at index time (as defined in the index section of the fieldType XML), pass that analyzer to the query parser, and the search should then work. Does the SnowballPorterFilter stem "science"? Almost certainly: the English Porter stemmer reduces "science" to "scienc", so a query for the literal term "science" finds nothing.

See http://whiteboardjunkie.wordpress.com/tag/custom-analyzer/ for details on building a custom analyzer. You only need to chain the filters one after another in tokenStream().
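As a rough illustration, here is a minimal sketch of such an analyzer for Lucene 3.6. It mirrors the index-time chain above (whitespace tokenizing, stop word removal, lowercasing, English Snowball stemming); the class name and the use of Lucene's built-in English stop word set are my own choices, it omits the WordDelimiterFilter (a Solr class, not available in plain Lucene 3.6), and SnowballFilter requires the lucene-analyzers contrib jar:

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.util.Version;

// Approximates the index-time analyzer of the "text" fieldType above.
public class SolrTextAnalyzer extends Analyzer {

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Tokenize on whitespace, like solr.WhitespaceTokenizerFactory
        TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_36, reader);
        // Stop word removal; Solr reads stopwords.txt, here we use Lucene's
        // default English set instead
        stream = new StopFilter(Version.LUCENE_36, stream,
                StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        // Lowercase, like solr.LowerCaseFilterFactory
        stream = new LowerCaseFilter(Version.LUCENE_36, stream);
        // English stemming, like solr.SnowballPorterFilterFactory
        stream = new SnowballFilter(stream, "English");
        return stream;
    }
}

You would then search with something like new QueryParser(Version.LUCENE_36, "title", new SolrTextAnalyzer()).parse("science").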

Also, you can examine the index with Luke (http://code.google.com/p/luke/) and check whether any documents actually contain the term "science" in the title field.


Thanks for the education on analyzer chains. I think my solution is simpler for now, but eventually I may need a fuller analyzer. – Sol


A colleague changed my code slightly so that it looks like the code below. He also suggested that I search for stems. This approach works, and I now get results from searches against the Solr-built Lucene index. The code still needs work, but I am posting it as a proof of concept in the hope that it is useful to someone else.

import java.io.File;
import java.util.List;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class SimpleSearcher {

    public static void main(String[] args) throws Exception {
        File indexDir = new File("/path/to/solr/data/index/");
        int hits = 100;

        SimpleSearcher searcher = new SimpleSearcher();
        searcher.searchIndex(indexDir, args[0], hits);
    }

    private void searchIndex(File indexDir, String queryStr, int maxHits)
            throws Exception {
        Directory directory = FSDirectory.open(indexDir);
        IndexSearcher searcher = new IndexSearcher(directory);

        // SimpleAnalyzer lowercases and splits on non-letters but does no
        // stemming, so the query term must already be a stem (e.g. "scienc").
        QueryParser parser = new QueryParser(Version.LUCENE_36, "title",
                new SimpleAnalyzer(Version.LUCENE_36));
        Query query = parser.parse(queryStr);

        TopDocs topDocs = searcher.search(query, maxHits);

        ScoreDoc[] hits = topDocs.scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            List<Fieldable> fields = d.getFields();

            System.out.println((i + 1) + ". ==========================================================");
            // Print every stored field of the matching document
            for (Fieldable field : fields) {
                if (field.isStored()) {
                    System.out.println(" >> " + field.name() + " - " + d.get(field.name()));
                }
            }
        }

        System.out.println("Found " + hits.length);
    }
}
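One caveat worth repeating: because SimpleAnalyzer does no stemming, you have to query for the stem that the index-time SnowballPorterFilter produced. Assuming the analysis chain shown above, "science" is indexed as "scienc", so java SimpleSearcher scienc should return hits while java SimpleSearcher science will not.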