
Lucene 5.5.0 StopFilter error

I want to use StopFilter in Lucene 5.5.0. I have tried the following:

package lucenedemo; 

import java.io.StringReader; 
import java.util.ArrayList; 
import java.util.Arrays; 
import java.util.Collections; 
import java.util.HashSet; 
import java.util.List; 
import java.util.Set; 
import java.util.Iterator; 

import org.apache.lucene.*; 
import org.apache.lucene.analysis.*; 
import org.apache.lucene.analysis.standard.*; 
import org.apache.lucene.analysis.core.StopFilter; 
import org.apache.lucene.analysis.en.EnglishAnalyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.analysis.standard.StandardTokenizer; 
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 
import org.apache.lucene.analysis.util.CharArraySet; 
import org.apache.lucene.util.AttributeFactory; 
import org.apache.lucene.util.Version; 

public class lucenedemo { 

    public static void main(String[] args) throws Exception { 
     System.out.println(removeStopWords("hello how are you? I am fine. This is a great day!")); 

    } 

    public static String removeStopWords(String strInput) throws Exception { 
     AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; 
     StandardTokenizer tokenizer = new StandardTokenizer(factory); 
     tokenizer.setReader(new StringReader(strInput)); 
     tokenizer.reset();    
     CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet(); 

     TokenStream streamStop = new StopFilter(tokenizer, stopWords); 
     StringBuilder sb = new StringBuilder(); 
     CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); 
     streamStop.reset(); 
     while (streamStop.incrementToken()) { 
      String term = charTermAttribute.toString(); 
      sb.append(term + " "); 
     } 

     streamStop.end(); 
     streamStop.close(); 

     tokenizer.close(); 


     return sb.toString(); 

    } 

} 

But it gives me the following error:

Exception in thread "main" java.lang.IllegalStateException: TokenStream contract violation: reset()/close() call missing, reset() called multiple times, or subclass does not call super.reset(). Please see Javadocs of TokenStream class for more information about the correct consuming workflow. 
at org.apache.lucene.analysis.Tokenizer$1.read(Tokenizer.java:109) 
at org.apache.lucene.analysis.standard.StandardTokenizerImpl.zzRefill(StandardTokenizerImpl.java:527) 
at org.apache.lucene.analysis.standard.StandardTokenizerImpl.getNextToken(StandardTokenizerImpl.java:738) 
at org.apache.lucene.analysis.standard.StandardTokenizer.incrementToken(StandardTokenizer.java:159) 
at org.apache.lucene.analysis.util.FilteringTokenFilter.incrementToken(FilteringTokenFilter.java:51) 
at lucenedemo.lucenedemo.removeStopWords(lucenedemo.java:42) 
at lucenedemo.lucenedemo.main(lucenedemo.java:27) 

What am I doing wrong? I am already closing both the Tokenizer and the TokenStream. What else am I missing here?

Answer


Calling reset() on the filter will, in turn, reset the underlying stream. Since you reset the tokenizer manually, then create a StopFilter with the tokenizer as its input and reset that as well, the Tokenizer ends up being reset twice.

So just remove this line (a corrected sketch of the full method is shown at the end of this thread):

tokenizer.reset(); 

That line gives the tokenizer its input string. If I remove it, where should I supply the input string? –


@JakeClawson - Oops, I copied the wrong line. Fixed now. – femtoRgon


I tried your suggestion. It works. Thank you! I have accepted the answer. –
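
For reference, below is a minimal sketch of the corrected removeStopWords method, assuming Lucene 5.5.0 and the same default English stop set as in the question; the class name StopWordDemo is only illustrative. The only functional change is that reset() is called exactly once, on the outermost filter:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.AttributeFactory;

public class StopWordDemo {

    public static void main(String[] args) throws Exception {
        System.out.println(removeStopWords("hello how are you? I am fine. This is a great day!"));
    }

    public static String removeStopWords(String strInput) throws Exception {
        AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY;
        StandardTokenizer tokenizer = new StandardTokenizer(factory);
        tokenizer.setReader(new StringReader(strInput));
        // No tokenizer.reset() here: resetting the StopFilter below resets the
        // whole chain, and resetting the tokenizer twice violates the contract.

        CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
        TokenStream streamStop = new StopFilter(tokenizer, stopWords);
        CharTermAttribute charTermAttribute = streamStop.addAttribute(CharTermAttribute.class);

        StringBuilder sb = new StringBuilder();
        streamStop.reset();                      // reset once, on the outermost filter
        while (streamStop.incrementToken()) {    // consume the filtered tokens
            sb.append(charTermAttribute.toString()).append(' ');
        }
        streamStop.end();                        // signal end-of-stream attributes
        streamStop.close();                      // also closes the wrapped tokenizer

        return sb.toString();
    }
}

Closing the StopFilter closes the wrapped Tokenizer as well, so the separate tokenizer.close() call from the original code is no longer needed.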