2014-03-30 77 views
0

首先說明,我對Tika和Lucene完全是個新手。我正在照著《Tika in Action》一書中的示例進行練習。第5章給出了下面這個例子(問題標題:《Tika in Action》書中的示例——Lucene StandardAnalyzer無法使用):

package tikatest01; 

import java.io.File; 
import org.apache.tika.Tika; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.document.Field.Index; 
import org.apache.lucene.document.Field.Store; 
import org.apache.lucene.index.IndexWriter; 

/**
 * Indexes files into a Lucene index, using Tika to extract their text.
 *
 * <p>This version is written against the Lucene 3.x {@code Field} /
 * {@code Field.Index} API.
 */
public class LuceneIndexer {

    private final Tika tika;
    private final IndexWriter writer;

    /**
     * @param tika   facade used to extract plain text from a file
     * @param writer index writer that receives the generated documents
     */
    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }

    /**
     * Adds one document for {@code file} to the index. The document carries
     * two fields: {@code "filename"} ({@code Store.YES}, {@code Index.ANALYZED})
     * and {@code "fulltext"} ({@code Store.NO}, {@code Index.ANALYZED}).
     *
     * @param file the file whose name and extracted text are indexed
     * @throws Exception if text extraction or adding the document fails
     */
    public void indexDocument(File file) throws Exception {
        Field nameField = new Field(
                "filename", file.getName(), Store.YES, Index.ANALYZED);
        Field bodyField = new Field(
                "fulltext", tika.parseToString(file), Store.NO, Index.ANALYZED);

        Document doc = new Document();
        doc.add(nameField);
        doc.add(bodyField);
        writer.addDocument(doc);
    }
}

這是主方法:

package tikatest01; 

import java.io.File; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.store.SimpleFSDirectory; 
import org.apache.lucene.util.Version; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.tika.Tika; 

/**
 * Entry point that indexes a single hard-coded PDF through {@link LuceneIndexer}.
 *
 * <p>NOTE(review): this snippet is written in the Lucene 3.x style
 * ({@code Version.LUCENE_30}, three-argument {@code IndexWriter} constructor).
 */
public class TikaTest01 { 

    /**
     * Opens an {@code IndexWriter}, indexes the PDF and always closes the writer.
     *
     * @param args command-line arguments; not read by this method
     * @throws Exception if opening the index, parsing or indexing fails
     */
    public static void main(String[] args) throws Exception { 

     String filename = "C:\\testdoc.pdf"; 
     File file = new File(filename); 

     // NOTE(review): the same `file` (the PDF path) is handed to
     // SimpleFSDirectory as the index location and later indexed as the
     // document; presumably a separate index directory was intended - confirm.
     // NOTE(review): MaxFieldLength is referenced without a matching import in
     // this snippet. In Lucene 3.x it is the nested type
     // IndexWriter.MaxFieldLength; it is not available in Lucene 4.x.
     IndexWriter writer = new IndexWriter(
      new SimpleFSDirectory(file), 
      new StandardAnalyzer(Version.LUCENE_30), 
      MaxFieldLength.UNLIMITED); 
     try { 
      LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer); 
      indexer.indexDocument(file); 
      } 
     finally { 
      // Close the writer whether or not indexing succeeded.
      writer.close(); 
      } 
    } 
} 

我已經把 tika-app-1.5.jar、lucene-core-4.7.0.jar 和 lucene-analyzers-common-4.7.0.jar 這幾個庫添加到項目中。

問題:

在當前版本的Lucene中,Field.Index已被棄用(deprecated),我應該改用什麼呢?

找不到MaxFieldLength。是我漏掉了某個import嗎?

+0

要麼使用Lucene 3.6,要麼更全面地瞭解所有這些API。 – bmargulies

+0

更全面地學習API正是我讀這本書的原因。但是,所有資料似乎都是針對Lucene 3.x寫的,而不是4.x :S –

+0

好的,我的第二個問題已經部分解決了:我需要將lucene-analyzers-common-4.7.0.jar添加到項目中,並導入org.apache.lucene.analysis.standard.StandardAnalyzer。但MaxFieldLength的問題仍然存在。我已經更新了問題。 –

回答

3

對於Lucene 4.7,索引器的代碼如下:

package tikatest01; 

import java.io.File; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field.Store; 
import org.apache.lucene.document.TextField; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.tika.Tika; 

/**
 * Indexes files into a Lucene index, using Tika to extract their text.
 *
 * <p>This version uses the Lucene 4.x {@code TextField} API.
 */
public class LuceneIndexer {

    private final Tika tika;
    private final IndexWriter writer;

    /**
     * @param tika   facade used to extract plain text from a file
     * @param writer index writer that receives the generated documents
     */
    public LuceneIndexer(Tika tika, IndexWriter writer) {
        this.tika = tika;
        this.writer = writer;
    }

    /**
     * Builds a Lucene document for {@code file} and adds it to the index.
     *
     * @param file the file whose name and extracted text are indexed
     * @throws Exception if text extraction or adding the document fails
     */
    public void indexDocument(File file) throws Exception {
        writer.addDocument(toDocument(file));
    }

    /**
     * Creates the document for {@code file}: a {@code "filename"} text field
     * with {@code Store.YES} and a {@code "fulltext"} text field with
     * {@code Store.NO}.
     */
    private Document toDocument(File file) throws Exception {
        String name = file.getName();
        String body = tika.parseToString(file);

        Document doc = new Document();
        doc.add(new TextField("filename", name, Store.YES));
        doc.add(new TextField("fulltext", body, Store.NO));
        return doc;
    }
}

主類的代碼如下:

package tikatest01; 

import java.io.File; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.index.IndexWriterConfig; 
import org.apache.lucene.store.SimpleFSDirectory; 
import org.apache.lucene.util.Version; 
import org.apache.tika.Tika; 

/**
 * Entry point that indexes a single document with Lucene 4.7, using Tika
 * (through {@link LuceneIndexer}) to extract the document text.
 *
 * <p>The document to index and the directory holding the index are kept
 * separate: the index directory is only passed to {@code SimpleFSDirectory},
 * and only the document file is passed to the indexer.
 */
public class TikaTest01 {

    /**
     * Opens the index, indexes one document and releases the writer and the
     * directory, even when indexing fails.
     *
     * @param args command-line arguments; not read by this method
     * @throws Exception if opening the index, parsing or indexing fails
     */
    public static void main(String[] args) throws Exception {

        // The file to be parsed by Tika and added to the index.
        File document = new File("C:\\testdoc.pdf");
        // Where Lucene stores its index files; must not be the document itself.
        File indexDir = new File("C:\\MyTestDir\\");

        SimpleFSDirectory directory = new SimpleFSDirectory(indexDir);
        try {
            IndexWriterConfig config = new IndexWriterConfig(
                    Version.LUCENE_47,
                    new StandardAnalyzer(Version.LUCENE_47));
            IndexWriter writer = new IndexWriter(directory, config);
            try {
                LuceneIndexer indexer = new LuceneIndexer(new Tika(), writer);
                // Index the document file, not the index directory.
                indexer.indexDocument(document);
            } finally {
                writer.close();
            }
        } finally {
            // The writer does not own the directory, so close it separately.
            directory.close();
        }
    }
}