2017-10-04 134 views
0

延續 this actual solution 中的做法,我想獲得 TextChunk 裏面的每一個詞以及它的各個座標(actual page、top、bottom、left、right),也就是從 PDF 文件中提取每個詞的座標。

由於 TextChunk 可能是一個短語、一個詞或其他任何內容,我嘗試手動完成這件事:以前一個詞的矩形爲基準,每次從中切割出下一個詞的矩形。我注意到這種手動方法可能很麻煩(我需要手動計算特殊字符等),所以我想問 iTextSharp 是否提供了更簡單的方法來執行此操作。

我的 Chunk 類以及繼承自 LocationTextExtractionStrategy 的類如下:

/// <summary>
/// A piece of text taken from a PDF page, paired with the rectangle it occupies,
/// the render info it came from, its font and an estimated font size.
/// </summary>
public class Chunk
{
    /// <summary>Unique identifier generated when the chunk is constructed.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle associated with the text (supplied by the caller).</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render info the chunk was created from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font reported by <see cref="Render"/>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>Font size estimated from the rendered width of a space (see ObtainFontSize).</summary>
    public int FontSize { get; set; }

    /// <summary>
    /// Creates a chunk whose text is whatever <paramref name="renderInfo"/> reports.
    /// </summary>
    /// <param name="rect">Rectangle associated with the text.</param>
    /// <param name="renderInfo">Render info of the text; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
    {
        if (renderInfo == null)
            throw new ArgumentNullException("renderInfo");

        Initialize(rect, renderInfo, renderInfo.GetText());
    }

    /// <summary>
    /// Creates a chunk with an explicit text, e.g. a single word cut out of a larger chunk.
    /// </summary>
    /// <param name="rect">Rectangle associated with the text.</param>
    /// <param name="renderInfo">Render info the text originates from; must not be null.</param>
    /// <param name="text">Text to store instead of the render info's own text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        if (renderInfo == null)
            throw new ArgumentNullException("renderInfo");

        Initialize(rect, renderInfo, text);
    }

    // Shared constructor logic: stores the arguments and derives Id, BF and FontSize.
    private void Initialize(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        this.Id = Guid.NewGuid();
        this.BF = renderInfo.GetFont();
        this.FontSize = ObtainFontSize();
    }

    // Estimates the font size by comparing the rendered single-space width with the
    // width the font reports for a space at size 12.
    private int ObtainFontSize()
    {
        var referenceSpaceWidth = this.BF.GetWidthPoint(" ", 12);

        // A zero reference width would make the quotient Infinity/NaN, which
        // Convert.ToInt32 rejects with an OverflowException. Report 0 instead.
        if (referenceSpaceWidth <= 0)
            return 0;

        return Convert.ToInt32(this.Render.GetSingleSpaceWidth() * 12 / referenceSpaceWidth);
    }
}

/// <summary>
/// Location text extraction strategy that, in addition to the base behaviour,
/// records every non-blank rendered chunk together with its rectangle.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    // Chunks recorded so far, in the order RenderText received them.
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the render info to the base strategy and, when it carries
    /// non-blank text, stores it in <see cref="ChunksInPage"/> with its rectangle.
    /// </summary>
    /// <param name="renderInfo">Render info of the chunk; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null check must come before any use of renderInfo to be effective.
        if (renderInfo == null)
            return;

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
            return;

        // Corner vectors: start of the descent line and end of the ascent line.
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        // Rectangle spanned by the two corner vectors.
        var rect = new Rectangle(
            bottomLeft[Vector.I1],
            bottomLeft[Vector.I2],
            topRight[Vector.I1],
            topRight[Vector.I2]);

        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}

所以,一旦取得文件等等,我就這樣開始處理:

// Walks every page of pdfReader (1-based), runs the custom extraction strategy on it
// and converts the collected raw chunks into word chunks.
private void ProcessContent()
{
    int pageNumber = 1;
    while (pageNumber <= pdfReader.NumberOfPages)
    {
        // A fresh strategy per page, so ChunksInPage only holds this page's chunks.
        var extractor = new LocationTextExtractionPersonalizada();

        var pageText = PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, extractor);

        // Here is where each word should be obtained with its coordinates.
        var wordChunks = ChunkRawToWord(extractor.ChunksInPage);

        pageNumber++;
    }
}

/// <summary>
/// Splits each raw chunk into word chunks and gives every word its own rectangle,
/// derived from the chunk's rectangle and the font's text widths.
/// </summary>
/// <param name="chunks">Raw chunks to split; null entries and chunks without text are skipped.</param>
/// <returns>
/// The word chunks in reading order of the input, an empty list when there is nothing
/// to split, or null when <paramref name="chunks"/> itself is null.
/// </returns>
private List<Chunk> ChunkRawToWord(IList<Chunk> chunks)
{
    if (chunks == null)
        return null;

    var words = new List<Chunk>();
    //Poor RegEx pattern to get the word and its trailing separators
    string pattern = @"[@&\w+]*(-*\/*\s*\:*\;*\,*\.*\(*\)*\%*\>*\<*)?";

    foreach (var chunk in chunks)
    {
        if (chunk == null || string.IsNullOrEmpty(chunk.Text))
            continue;

        var wordsInChunk = Regex.Matches(
            chunk.Text,
            pattern,
            RegexOptions.IgnoreCase);

        foreach (Match match in wordsInChunk)
        {
            if (string.IsNullOrWhiteSpace(match.Value))
                continue;

            // Every word needs its own Rectangle copy; sharing one instance would
            // make each later word overwrite the coordinates of the earlier ones.
            var word = new Chunk(
                new Rectangle(chunk.Rect),
                chunk.Render,
                match.Value);

            // Horizontal offset = width of all text preceding the match inside this
            // chunk. Using match.Index keeps the offset right even when blank matches
            // are skipped, and never mixes in words from other chunks.
            // NOTE(review): widths come from the font metrics only; character/word
            // spacing and rotated text are not accounted for.
            var precedingText = chunk.Text.Substring(0, match.Index);
            word.Rect.Left = chunk.Rect.Left
                + word.BF.GetWidthPoint(precedingText, word.FontSize);
            word.Rect.Right = word.Rect.Left
                + word.BF.GetWidthPoint(word.Text, word.FontSize);

            words.Add(word);
        }
    }

    return words;
}

之後,我在 Mkl 的解決方案下寫了一條評論,得到的回覆是「使用 getCharacterRenderInfos()」;我照做了,並把每個單獨的字符都放進了一個 TextRenderInfo 的 List 中。

我很抱歉,但我開始把各種概念以及應用該解決方案的方法混在一起,已經被弄得暈頭轉向了。

如果有人能在這裏幫我一把,我將不勝感激。提前致謝。

回答

1

您可以使用方法TextRenderInfo.GetCharacterRenderInfos()爲您的塊中的每個字符獲取TextRenderInfo的集合。然後,您可以將單個字符重新組合爲單詞,並使用該單詞中第一個和最後一個TextRenderInfo的座標來計算包含該單詞的矩形。

在您的自定義文本提取策略中:

// Single-character strings that end a word. Declared with an explicit type because
// `var` is not allowed on a member declaration.
private static readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };

/// <summary>
/// Splits a rendered chunk into words by walking its per-character render infos and
/// hands each word to ProcessWord.
/// </summary>
/// <remarks>
/// A separator character is appended to the current word before the word is flushed,
/// so it ends up as the last character of the word it terminates. Characters left
/// over after the last separator are flushed as a final word.
/// </remarks>
/// <param name="currentInfo">Render info of the chunk to split.</param>
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
    {
        var resultInfo = new List<TextRenderInfo>();
        var chars = currentInfo.GetCharacterRenderInfos();

        foreach (var charRenderInfo in chars)
        {
            resultInfo.Add(charRenderInfo);
            var currentChar = charRenderInfo.GetText();
            if (_separators.Contains(currentChar))
            {
                ProcessWord(currentInfo, resultInfo);
                resultInfo.Clear();
            }
        }

        // Flush whatever follows the last separator (ProcessWord ignores an empty list).
        ProcessWord(currentInfo, resultInfo);
    }
/// <summary>
/// Builds one word from the given character render infos and appends it to _chunks.
/// The word's location runs from the descent-line start of the first character to the
/// ascent-line end of the last one; nothing is added when there are no characters.
/// </summary>
/// <param name="charChunk">Render info of the enclosing chunk; supplies the single-space width.</param>
/// <param name="wordChunks">Per-character render infos that make up the word.</param>
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
    {
        TextRenderInfo head = wordChunks.FirstOrDefault();
        TextRenderInfo tail = wordChunks.LastOrDefault();
        bool hasBothEnds = head != null && tail != null;
        if (!hasBothEnds)
        {
            return;
        }

        var lowerStart = head.GetDescentLine().GetStartPoint();
        var upperEnd = tail.GetAscentLine().GetEndPoint();

        // Concatenate the characters' texts in order to obtain the word.
        string joinedText = string.Concat(wordChunks.Select(piece => piece.GetText()));

        var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
            lowerStart,
            upperEnd,
            charChunk.GetSingleSpaceWidth());
        _chunks.Add(new CustomTextChunk(joinedText, location));
    }