2015-12-16 18 views
0

我想突出顯示一組PDF文件中的幾個關鍵字。首先,我們必須識別單個單詞並將其與我的關鍵字進行匹配。我找到一個例子:itextsharp:在將文字拼接拆分爲單詞時文字被破壞

/// <summary>
/// Text extraction strategy that, in addition to what the base
/// LocationTextExtractionStrategy does, records a rectangle and the text
/// for every chunk passed to <see cref="RenderText"/>.
/// </summary>
class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy 
{ 
    // Collected (rectangle, text) pairs, one per rendered text chunk.
    public List<RectAndText> myPoints = new List<RectAndText>(); 

    // Keywords supplied by the caller.
    // NOTE(review): stored here but not read anywhere in the visible part of this snippet.
    List<string> topicTerms; 
    /// <summary>Creates the strategy and remembers the keyword list.</summary>
    /// <param name="topicTerms">Keywords supplied by the caller.</param>
    public MyLocationTextExtractionStrategy(List<string> topicTerms) 
    { 
     this.topicTerms = topicTerms; 
    } 

    /// <summary>
    /// Automatically called for each chunk of text in the PDF. Forwards the
    /// chunk to the base implementation, then stores its bounding rectangle
    /// and text in <see cref="myPoints"/>.
    /// </summary>
    /// <param name="renderInfo">Render information for the current text chunk.</param>
    public override void RenderText(TextRenderInfo renderInfo) 
    { 
     // Forward to the base implementation first.
     base.RenderText(renderInfo); 


     // Bounding box corners of the chunk: start of the descent line is used
     // as bottom-left, end of the ascent line as top-right.
     var bottomLeft = renderInfo.GetDescentLine().GetStartPoint(); 
     var topRight = renderInfo.GetAscentLine().GetEndPoint(); 

     // Build a rectangle from the two corners (I1 / I2 components of each vector).
     var rect = new iTextSharp.text.Rectangle(
               bottomLeft[Vector.I1], 
               bottomLeft[Vector.I2], 
               topRight[Vector.I1], 
               topRight[Vector.I2] 
               ); 

     // Add this chunk to the main collection.
     // The original intent was to filter out meaningless words here, but no
     // filtering is applied in the visible code: every chunk is stored.
     // NOTE(review): `text` is assigned but not used in the visible part of this snippet.
     string text = renderInfo.GetText(); 
     this.myPoints.Add(new RectAndText(rect, renderInfo.GetText())); 
     // NOTE(review): the snippet ends here in the original post; the closing
     // braces of the method and class are not shown.

但是,我發現很多單詞都被拆開了。例如,「stop」會被拆成「st」和「op」兩個文字塊。有沒有其他方法可以識別完整的單詞及其位置?

+1

我總是可以發現我的舊[代碼](http://stackoverflow.com/a/23915452/231316)!無論如何,請參閱[mkl的答案](http://stackoverflow.com/a/20049810/231316)關於使用IsChunkAtWordBoundary()來判斷兩個「塊」是否應該是一個「單詞」。 –

+0

感謝您的舊代碼。這確實有很大的幫助。我會在稍後嘗試你的建議。再次感謝。 –

+0

無論如何,我發現收集單個單詞的更好方法是在GetResultantText()中,而不是RenderText()。 –

回答

0

當你想收集單個單詞及其座標時,更好的方法是修改(覆寫)現有 LocationTextExtractionStrategy 的 GetResultantText()。這裏是我的代碼:

/// <summary>
/// Builds the extracted text from the filtered, sorted text chunks and, as a
/// side effect, groups consecutive chunks into words: each completed word is
/// handed to <c>mergeAndStoreChunk</c>, which stores it in <c>myPoints</c>.
/// </summary>
/// <param name="chunkFilter">Filter passed through to <c>filterTextChunks</c>.</param>
/// <returns>
/// The chunk texts concatenated in sorted order, with a single space inserted
/// at detected word boundaries and a newline between chunks on different lines.
/// </returns>
/// <remarks>
/// A word is considered finished (a) at a detected word boundary on the same
/// line, (b) at every line break, and (c) after the last chunk. Cases (b) and
/// (c) were previously missing, so the last word was never stored and the
/// first chunk of every new line was lost.
/// NOTE(review): chunks separated only by an explicit space character (the
/// previous chunk ends with, or the current chunk starts with, a space) are
/// still merged into one entry, because the boundary test below requires that
/// neither side carries a space.
/// NOTE(review): every call appends to <c>myPoints</c>; calling this method
/// more than once on the same instance would presumably store duplicates.
/// </remarks>
public virtual String GetResultantText(ITextChunkFilter chunkFilter)
{
    if (DUMP_STATE)
    {
        DumpState();
    }

    List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
    filteredTextChunks.Sort();

    // Chunks that belong to the word currently being assembled.
    List<RectAndText> tmpList = new List<RectAndText>();

    StringBuilder sb = new StringBuilder();
    TextChunk lastChunk = null;
    foreach (TextChunk chunk in filteredTextChunks)
    {
        if (lastChunk != null)
        {
            if (chunk.SameLine(lastChunk))
            {
                // Only insert a blank if the previous chunk does not already end
                // with a space and the current chunk does not start with one.
                if (IsChunkAtWordBoundary(chunk, lastChunk) && !StartsWithSpace(chunk.Text) && !EndsWithSpace(lastChunk.Text))
                {
                    sb.Append(' ');
                    FlushPendingWord(tmpList);
                }
            }
            else
            {
                // A line break always ends the current word.
                sb.Append('\n');
                FlushPendingWord(tmpList);
            }
        }

        sb.Append(chunk.Text);

        // Every chunk (including the first chunk of a line) contributes to the
        // word being assembled.
        // NOTE(review): the rectangle is built from the chunk's start and end
        // locations only; if both lie on the text baseline it has no height — confirm.
        var startLocation = chunk.StartLocation;
        var endLocation = chunk.EndLocation;
        var rect = new iTextSharp.text.Rectangle(startLocation[0], startLocation[1], endLocation[0], endLocation[1]);
        tmpList.Add(new RectAndText(rect, chunk.Text));

        lastChunk = chunk;
    }

    // Store the final word, which has no following boundary to trigger a flush.
    FlushPendingWord(tmpList);

    return sb.ToString();
}

/// <summary>
/// Merges and stores the chunks collected for the current word, then empties
/// the list so the next word starts clean. Does nothing for an empty list.
/// </summary>
/// <param name="pendingChunks">Chunks collected for the word being assembled.</param>
private void FlushPendingWord(List<RectAndText> pendingChunks)
{
    if (pendingChunks.Count > 0)
    {
        mergeAndStoreChunk(pendingChunks);
        pendingChunks.Clear();
    }
}

    /// <summary>
    /// Merges the given chunks into a single entry and appends it to
    /// <c>myPoints</c>. The first element is used as the merged entry: its
    /// text is extended with the text of every following chunk and its
    /// rectangle's right edge is moved to the right edge of the last chunk.
    /// </summary>
    /// <param name="tmpList">
    /// Chunks that make up one word, in reading order. The first element is
    /// modified in place. A null or empty list is ignored.
    /// </param>
    private void mergeAndStoreChunk(List<RectAndText> tmpList)
    {
        // Nothing to merge; avoids an out-of-range access on tmpList[0].
        if (tmpList == null || tmpList.Count == 0)
        {
            return;
        }

        RectAndText mergedChunk = tmpList[0];
        for (int i = 1; i < tmpList.Count; i++)
        {
            RectAndText nowChunk = tmpList[i];
            // Only the right edge is extended; left, top and bottom stay those
            // of the first chunk.
            mergedChunk.Rect.Right = nowChunk.Rect.Right;
            mergedChunk.Text += nowChunk.Text;
        }
        this.myPoints.Add(mergedChunk);
    }

myPoints是一個列表,它將返回所有我們想要的。