2012-09-06 20 views
0

我需要只使用 Lucene，找出像「1年」這樣的短語在文檔中出現的次數。（標題：Lucene | 文檔中出現短語的次數）

我知道我們可以使用TermDocs.seek(Term)然後TermDocs.freq()找到術語「1」或「年」的出現次數。

有什麼方法可以找出這類短語的出現次數嗎？

+0

重複http://stackoverflow.com/questions/9211792/lucene-net-phrase-count/9213193#9213193 –

回答

1

做了一點研究之後，whoooo，我找到方法了……

首先，像這樣創建 IndexReader（索引讀取器）對象：

' Open a reader over the index. IndexReader.Open is a Shared (static) factory
' method, so it must be called directly; prefixing it with "New" does not compile.
' NOTE(review): the second argument is presumably the readOnly flag of the
' Lucene.Net IndexReader.Open(Directory, Boolean) overload - confirm against the
' Lucene.Net version in use.
Dim indexReader As Lucene.Net.Index.IndexReader = Lucene.Net.Index.IndexReader.Open(INDEX_DIRECTORY, True)

然後為短語中的每個詞項各創建一個 SpanTermQuery，再把它們組合成一個 SpanNearQuery（鄰近跨度查詢）：

' One SpanTermQuery per word of the phrase being counted ("1 year").
' The first term must be "1" to match the phrase the surrounding text talks about.
Dim spanQuery1 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "1"))
Dim spanQuery2 As SpanTermQuery = New SpanTermQuery(New Term(FIELD, "year"))

' Combine the terms into a proximity query:
'   slop = 0       -> the terms must be directly adjacent
'   inOrder = True -> the terms must appear in the given order, so that the
'                     reversed sequence "year 1" is not counted as the phrase.
' The array initializer has to start on the same line as "New SpanQuery()";
' VB does not allow a line break before the opening brace.
Dim near As SpanNearQuery = New SpanNearQuery(New SpanQuery() {spanQuery1, spanQuery2},
                                              0,
                                              True)

創建一個 Spans 對象，用來遍歷所有匹配的跨度：

 ' Ask the near query for its matching spans in the index behind indexReader.
 ' The returned Spans object is advanced with Next / SkipTo in the code that follows.
 Dim spans As Spans = near.GetSpans(indexReader) 

遍歷這些跨度，統計出現次數：

 ' Tally the matches: every successful advance of the span enumerator
 ' corresponds to one occurrence of the phrase.
 Dim num As Integer = 0
 Do While spans.Next()
     num = num + 1
 Loop

現在 num 就是短語「1年」的出現次數。

目前它統計的是所有文檔中的出現次數。你可以使用以下調用跳到

' Jump ahead in the span enumeration to document i.
' NOTE(review): presumably this positions on the first match whose document
' number is >= i and returns False when there is none - confirm in the Spans API.
spans.SkipTo(i) 

任意文檔，並可以透過下面的調用查出當前匹配位於哪個文檔：

' Document number of the match the enumerator is currently positioned on
' (VB is case-insensitive, so this resolves to the Spans.Doc member).
spans.doc() 

我是從這份 PPT（This PPT）得到這個想法的。它或許也能解答你的許多其他問題……

0

我在 Lucene.Net 中創建了一個類文件，用來實現 shingle 過濾（ShingleFilter，即把相鄰詞元組合成 n-gram）功能。它的行為與 Java 版本完全一樣。這段代碼是用 C# 寫的，但我想即使你把它轉換成別的語言也同樣可用。

using System; 
using System.Collections.Generic; 
using System.Linq; 
using System.Text; 
using Lucene.Net.Analysis.Tokenattributes; 
using Lucene.Net.Util; 
namespace Lucene.Net.Analysis.Shingle 
{ 

/// <summary>
/// A ShingleFilter constructs shingles (token n-grams) from a token stream.
/// In other words, it creates combinations of tokens as a single token.
/// <para>
/// For example, the sentence "please divide this sentence into shingles"
/// might be tokenized into shingles "please divide", "divide this",
/// "this sentence", "sentence into", and "into shingles".
/// </para>
/// <para>
/// This filter handles position increments greater than 1 by inserting filler
/// tokens (tokens with termtext "_"). It does not handle a position increment of 0.
/// </para>
/// </summary>
public sealed class ShingleFilter : TokenFilter
{
    /// <summary>
    /// Filler token for when positionIncrement is more than 1.
    /// </summary>
    public static readonly char[] FILLER_TOKEN = { '_' };

    /// <summary>
    /// Default maximum shingle size is 2.
    /// </summary>
    public const int DEFAULT_MAX_SHINGLE_SIZE = 2;

    /// <summary>
    /// The string to use when joining adjacent tokens to form a shingle.
    /// </summary>
    public const String TOKEN_SEPARATOR = " ";

    // Captured states of the most recently read tokens (real or filler);
    // FillShingleBuffer keeps at most maxShingleSize entries in it.
    private readonly LinkedList<State> shingleBuf = new LinkedList<State>();

    // shingles[k] accumulates the text of the first k + 1 buffered tokens,
    // joined with TOKEN_SEPARATOR. Rebuilt on every FillShingleBuffer call.
    private StringBuilder[] shingles;

    // Value written to the type attribute of every emitted shingle token.
    private String tokenType = "shingle";

    // By default, we output unigrams (individual tokens) as well as shingles
    // (token n-grams).
    private bool outputUnigrams = true;

    // Maximum shingle size (number of tokens).
    private int maxShingleSize;

    // State of the first buffered token while its shingles are being emitted;
    // null means the shingle buffer has to be refilled before emitting again.
    private AttributeSource.State nextToken;

    // Index (into shingles / endOffsets) of the next entry to emit.
    private int shingleBufferPosition;

    // endOffsets[k] is the end offset of the k-th buffered token.
    private int[] endOffsets;

    // Number of filler tokens still to be emitted before the pending input token.
    private int numFillerTokensToInsert;

    // Saved state of the pending input token while fillers are emitted ahead of it.
    private AttributeSource.State currentToken;

    // True when a token has been read from the input but not yet handed out.
    private bool hasCurrentToken;

    private ITermAttribute termAtt;
    private IOffsetAttribute offsetAtt;
    private IPositionIncrementAttribute posIncrAtt;
    private ITypeAttribute typeAtt;

    /// <summary>
    /// Constructs a ShingleFilter with the specified shingle size from the
    /// <see cref="TokenStream"/> <c>input</c>.
    /// </summary>
    /// <param name="input">input stream</param>
    /// <param name="maxShingleSize">maximum shingle size produced by the filter.</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="maxShingleSize"/> is less than 2.
    /// </exception>
    public ShingleFilter(TokenStream input, int maxShingleSize)
        : base(input)
    {
        SetMaxShingleSize(maxShingleSize);
        this.termAtt = AddAttribute<ITermAttribute>();
        this.offsetAtt = AddAttribute<IOffsetAttribute>();
        this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        this.typeAtt = AddAttribute<ITypeAttribute>();
    }

    /// <summary>
    /// Construct a ShingleFilter with default shingle size.
    /// </summary>
    /// <param name="input">input stream</param>
    public ShingleFilter(TokenStream input)
        : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
    }

    /// <summary>
    /// Construct a ShingleFilter with the specified token type for shingle tokens
    /// and the default shingle size.
    /// </summary>
    /// <param name="input">input stream</param>
    /// <param name="tokenType">token type for shingle tokens</param>
    public ShingleFilter(TokenStream input, String tokenType)
        : this(input, DEFAULT_MAX_SHINGLE_SIZE)
    {
        setTokenType(tokenType);
    }

    /// <summary>
    /// Set the type of the shingle tokens produced by this filter.
    /// (default: "shingle")
    /// </summary>
    /// <param name="tokenType">token type</param>
    public void setTokenType(String tokenType)
    {
        this.tokenType = tokenType;
    }

    /// <summary>
    /// Shall the output stream contain the input tokens (unigrams) as well as
    /// shingles? (default: true.)
    /// </summary>
    /// <param name="outputUnigrams">
    /// Whether or not the output stream shall contain the input tokens (unigrams)
    /// </param>
    public void SetOutputUnigrams(bool outputUnigrams)
    {
        this.outputUnigrams = outputUnigrams;
    }

    /// <summary>
    /// Set the max shingle size (default: 2) and allocate one shingle
    /// accumulator per possible shingle length.
    /// </summary>
    /// <param name="maxShingleSize">max size of output shingles</param>
    /// <exception cref="ArgumentException">
    /// <paramref name="maxShingleSize"/> is less than 2.
    /// </exception>
    public void SetMaxShingleSize(int maxShingleSize)
    {
        if (maxShingleSize < 2)
        {
            throw new ArgumentException("Max shingle size must be >= 2", "maxShingleSize");
        }

        shingles = new StringBuilder[maxShingleSize];
        for (int i = 0; i < shingles.Length; i++)
        {
            shingles[i] = new StringBuilder();
        }
        this.maxShingleSize = maxShingleSize;
    }

    /// <summary>
    /// Clear the StringBuilders that are used for storing the output shingles.
    /// </summary>
    private void ClearShingles()
    {
        for (int i = 0; i < shingles.Length; i++)
        {
            shingles[i].Length = 0;
        }
    }

    /// <summary>
    /// Advances the stream to the next output token: either the unigram at the
    /// head of the shingle buffer (when unigrams are enabled) or the next
    /// shingle that starts at that token.
    /// </summary>
    /// <returns>
    /// <c>true</c> if a token was produced; <c>false</c> when the shingle buffer
    /// cannot be refilled because the input is exhausted.
    /// </returns>
    public sealed override bool IncrementToken()
    {
        while (true)
        {
            if (nextToken == null)
            {
                if (!FillShingleBuffer())
                {
                    return false;
                }
            }

            nextToken = shingleBuf.First.Value;

            if (outputUnigrams)
            {
                if (shingleBufferPosition == 0)
                {
                    // Position 0 is the plain unigram: emit the buffered token as-is.
                    RestoreState(nextToken);
                    posIncrAtt.PositionIncrement = 1;
                    shingleBufferPosition++;
                    return true;
                }
            }
            else if (shingleBufferPosition % this.maxShingleSize == 0)
            {
                // Unigrams are suppressed: step over the single-token entry.
                shingleBufferPosition++;
            }

            if (shingleBufferPosition < shingleBuf.Count)
            {
                // Start from the first token's attributes, then overwrite type,
                // end offset and term text with those of the shingle.
                RestoreState(nextToken);
                typeAtt.Type = tokenType;
                offsetAtt.SetOffset(offsetAtt.StartOffset, endOffsets[shingleBufferPosition]);

                StringBuilder buf = shingles[shingleBufferPosition];
                int termLength = buf.Length;
                char[] termBuffer = termAtt.TermBuffer();
                if (termBuffer.Length < termLength)
                {
                    termBuffer = termAtt.ResizeTermBuffer(termLength);
                }
                buf.CopyTo(0, termBuffer, 0, termLength);
                termAtt.SetTermLength(termLength);

                // Without unigrams the first shingle takes the position of the
                // token it starts at; every other shingle is stacked on the
                // previously emitted token (increment 0).
                if ((!outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1)
                {
                    posIncrAtt.PositionIncrement = 1;
                }
                else
                {
                    posIncrAtt.PositionIncrement = 0;
                }

                shingleBufferPosition++;
                if (shingleBufferPosition == shingleBuf.Count)
                {
                    // All shingles for this start token are out; refill next time.
                    nextToken = null;
                    shingleBufferPosition = 0;
                }
                return true;
            }
            else
            {
                nextToken = null;
                shingleBufferPosition = 0;
            }
        }
    }

    /// <summary>
    /// Get the next token from the input stream so that it can be pushed on the
    /// shingle buffer. If a token with position increment greater than 1 is
    /// encountered, filler tokens are produced ahead of it.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the attributes now describe the next token (real or filler);
    /// <c>false</c> if the end of the input stream has been reached.
    /// </returns>
    private bool GetNextToken()
    {
        while (true)
        {
            if (numFillerTokensToInsert > 0)
            {
                if (currentToken == null)
                {
                    // First filler for this gap: remember the real token so it
                    // can be handed out once the fillers are done.
                    currentToken = CaptureState();
                }
                else
                {
                    RestoreState(currentToken);
                }
                numFillerTokensToInsert--;
                // A filler token occupies no space
                offsetAtt.SetOffset(offsetAtt.StartOffset, offsetAtt.StartOffset);
                termAtt.SetTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.Length);
                return true;
            }

            if (hasCurrentToken)
            {
                if (currentToken != null)
                {
                    // Fillers were emitted in between; bring the real token back.
                    RestoreState(currentToken);
                    currentToken = null;
                }
                hasCurrentToken = false;
                return true;
            }

            if (!input.IncrementToken())
            {
                return false;
            }
            hasCurrentToken = true;

            if (posIncrAtt.PositionIncrement > 1)
            {
                numFillerTokensToInsert = posIncrAtt.PositionIncrement - 1;
            }
        }
    }

    /// <summary>
    /// Fill the output buffer with new shingles: reads tokens until the buffer
    /// holds maxShingleSize entries (or the input ends), then rebuilds the
    /// shingle texts and end offsets for the buffered tokens.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the buffer holds at least one token afterwards;
    /// <c>false</c> if nothing is left to emit.
    /// </returns>
    private bool FillShingleBuffer()
    {
        bool addedToken = false;

        // Try to fill the shingle buffer.
        do
        {
            if (!GetNextToken())
            {
                break;
            }

            shingleBuf.AddLast(CaptureState());
            if (shingleBuf.Count > maxShingleSize)
            {
                shingleBuf.RemoveFirst();
            }
            addedToken = true;
        } while (shingleBuf.Count < maxShingleSize);

        if (shingleBuf.Count == 0)
        {
            return false;
        }

        // If no new token could be added to the shingle buffer, we have reached
        // the end of the input stream and have to discard the least recent token.
        if (!addedToken)
        {
            shingleBuf.RemoveFirst();
        }

        if (shingleBuf.Count == 0)
        {
            return false;
        }

        ClearShingles();

        // A freshly allocated int[] is already zero-filled; no explicit reset needed.
        endOffsets = new int[shingleBuf.Count];

        int i = 0;
        foreach (State bufferedState in shingleBuf)
        {
            RestoreState(bufferedState);

            // Append straight from the term buffer instead of copying it into a
            // temporary array for every shingle.
            char[] termBuffer = termAtt.TermBuffer();
            int termLength = termAtt.TermLength();

            // Token i takes part in every shingle of length i + 1 or more.
            for (int j = i; j < shingles.Length; j++)
            {
                if (shingles[j].Length != 0)
                {
                    shingles[j].Append(TOKEN_SEPARATOR);
                }
                shingles[j].Append(termBuffer, 0, termLength);
            }

            endOffsets[i] = offsetAtt.EndOffset;
            i++;
        }

        return true;
    }

    /// <summary>
    /// Resets the wrapped stream (via the base class) and discards all buffered
    /// shingle and filler state so the filter can be reused.
    /// </summary>
    public override void Reset()
    {
        base.Reset();
        nextToken = null;
        shingleBufferPosition = 0;
        shingleBuf.Clear();
        numFillerTokensToInsert = 0;
        currentToken = null;
        hasCurrentToken = false;
    }
}

}