2014-09-10 29 views
0

我寫了一個簡短的庫,用來從 PDF 文檔中提取錨文本的位置,以便稍後將頁面渲染爲 BufferedImage,並把 HTML 表單覆蓋在其上。它可以工作,但我必須把 x、y 以及寬度和高度都乘以 2 才能得到正確的結果。我的測試方法是先將頁面渲染到 BufferedImage,然後用紅色繪製邊界框。基本上……爲什麼這個係數是 2?我能指望這個係數保持不變嗎?當然,我明白如果圖像的大小改變,x、y 座標和寬度、高度也會相應縮放。是轉換成圖像的過程對它進行了縮放嗎?(標題:PDFBox TextPosition 的 x、y 和 width、height 相差 2 倍)

這裏是我的代碼:

AnchorTextRipper.java

import java.awt.Rectangle; 
import java.io.IOException; 
import java.util.HashMap; 

import org.apache.pdfbox.util.PDFTextStripper; 
import org.apache.pdfbox.util.TextPosition; 

/**
 * Text stripper that looks for placeholders of the form <code>${name}</code>
 * in the glyph stream and records, for each one, a bounding rectangle keyed
 * by <code>name</code>.
 *
 * <p>The rectangle is anchored on the <code>$</code> glyph (its x, y and
 * height) and spans the accumulated width of every glyph from <code>$</code>
 * through the closing <code>}</code>. All values are multiplied by
 * {@link #IMAGE_SCALE} before being rounded to integer pixels.
 *
 * <p>Anchors are stored by name only, so a later placeholder with the same
 * name replaces an earlier one.
 */
public class AnchorTextRipper extends PDFTextStripper {
    /** States of the placeholder scanner. */
    protected enum ScanState {
        /** Scan variables must be reset before the next glyph is examined. */
        INIT,
        /** Looking for a <code>$</code> glyph. */
        SEARCHING,
        /** Saw <code>$</code>; the next glyph decides whether a placeholder starts. */
        FOUND_POSSIBLE,
        /** Inside <code>${ ... }</code>, collecting the anchor name. */
        SCANNING_ANCHOR,
        /** Retained for subclasses; handled exactly like {@link #INIT}. */
        DONE
    }

    /**
     * Factor applied to text coordinates so that they line up with the
     * rendered page image.
     *
     * NOTE(review): PDFBox 1.8's no-argument PDPage.convertToImage() is
     * understood to render at twice the 72 DPI user-space resolution, which
     * would make 2 a constant for that overload - confirm against the PDFBox
     * version in use. An image rendered at an explicit resolution would need
     * resolution / 72 instead.
     */
    protected static final double IMAGE_SCALE = 2.0;

    /** Recorded anchors: placeholder name to scaled bounding rectangle. */
    protected HashMap<String, Rectangle> anchors = new HashMap<String, Rectangle>();

    // Scanning variables
    protected ScanState state = ScanState.INIT;
    /** The <code>$</code> glyph of the placeholder currently being scanned. */
    protected TextPosition lastFoundAnchor;
    /** Name collected so far between <code>{</code> and <code>}</code>. */
    protected StringBuilder lastFoundAnchorText;
    /** Unscaled width accumulated from <code>$</code> up to the current glyph. */
    protected Double lastWidth;
    /** Rectangle of the most recently recorded anchor. */
    protected Rectangle lastFoundAnchorRect;

    public AnchorTextRipper() throws IOException {
        super();
        this.setSortByPosition(true);
    }

    /**
     * Feeds one glyph into the placeholder scanner.
     *
     * <p>Unlike a naive state-per-glyph machine, the glyph that triggers a
     * reset is still examined, a <code>$</code> that is not followed by
     * <code>{</code> is abandoned, and an anchor is recorded as soon as its
     * closing <code>}</code> is seen, so a placeholder at the very end of the
     * stream is not lost.
     *
     * @param text
     *   The text to be processed
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        final String glyph = text.getCharacter();

        switch (state) {
        case INIT:
        case DONE:
            resetScan();
            // Fall through: the glyph that triggered the reset may itself be a '$'.
        case SEARCHING:
            if ("$".equals(glyph)) {
                beginCandidate(text);
            }
            break;
        case FOUND_POSSIBLE:
            if ("{".equals(glyph)) {
                state = ScanState.SCANNING_ANCHOR;
                lastWidth += text.getWidth();
            } else if ("$".equals(glyph)) {
                // "$$": the newer '$' becomes the candidate.
                beginCandidate(text);
            } else {
                // A lone '$' is ordinary text; resume searching.
                state = ScanState.SEARCHING;
            }
            break;
        case SCANNING_ANCHOR:
            lastWidth += text.getWidth();
            if ("}".equals(glyph)) {
                recordAnchor();
            } else {
                lastFoundAnchorText.append(glyph);
            }
            break;
        }
    }

    /** Clears all per-placeholder scan variables and starts searching. */
    private void resetScan() {
        state = ScanState.SEARCHING;
        lastFoundAnchor = null;
        lastFoundAnchorText = new StringBuilder();
        lastWidth = 0.0;
        lastFoundAnchorRect = null;
    }

    /** Marks the given '$' glyph as the possible start of a placeholder. */
    private void beginCandidate(TextPosition dollar) {
        state = ScanState.FOUND_POSSIBLE;
        lastFoundAnchor = dollar;
        lastFoundAnchorText.setLength(0);
        lastWidth = (double) dollar.getWidth();
    }

    /** Stores the scaled rectangle of the placeholder just completed. */
    private void recordAnchor() {
        System.out.println(String.format("%f, %f (%f, %f) [%f, %f]", lastFoundAnchor.getX(), lastFoundAnchor.getY(), lastFoundAnchor.getXScale(), lastFoundAnchor.getYScale(), lastFoundAnchor.getWidth(), lastFoundAnchor.getHeight()));

        // Scale first, then round, so every component is rounded the same way.
        // The y value is shifted up by the glyph height, as in the original
        // computation (getY() minus getHeight()).
        int x = (int) Math.round(lastFoundAnchor.getX() * IMAGE_SCALE);
        int y = (int) Math.round((lastFoundAnchor.getY() - lastFoundAnchor.getHeight()) * IMAGE_SCALE);
        int width = (int) Math.round(lastWidth * IMAGE_SCALE);
        int height = (int) Math.round(lastFoundAnchor.getHeight() * IMAGE_SCALE);

        lastFoundAnchorRect = new Rectangle(x, y, width, height);
        anchors.put(lastFoundAnchorText.toString(), lastFoundAnchorRect);
        state = ScanState.INIT;
    }
}

AnchorTextLocatorService.java

import org.apache.pdfbox.exceptions.CryptographyException; 
import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.pdmodel.PDPage; 
import org.apache.pdfbox.pdmodel.common.PDStream; 

/**
 * Loads a PDF file, runs an {@link AnchorTextRipper} over the content stream
 * of every page and exposes the anchors it recorded.
 *
 * <p>A single ripper is used for all pages, so anchors from every page end up
 * in the same map.
 */
public class AnchorTextLocatorService {
    protected AnchorTextRipper ripper = new AnchorTextRipper();

    /**
     * Scans the given PDF file for anchors.
     *
     * @param filename
     *   Path of the PDF file to load
     *
     * @throws IOException
     *    If the file cannot be read or processed, or if it is encrypted and
     *    cannot be decrypted with an empty password
     */
    public AnchorTextLocatorService(String filename) throws IOException {
        PDDocument document = null;

        try {
            document = PDDocument.load(filename);
            if (document.isEncrypted()) {
                document.decrypt("");
            }

            @SuppressWarnings("unchecked")
            List<PDPage> allPages = document.getDocumentCatalog().getAllPages();

            for (PDPage page : allPages) {
                PDStream contents = page.getContents();
                // Pages without a content stream have nothing to scan.
                if (contents != null) {
                    ripper.processStream(page, page.findResources(), contents.getStream());
                }
            }
        } catch (CryptographyException e) {
            // Report the failure instead of handing back a service that
            // silently contains no anchors.
            throw new IOException("Unable to decrypt " + filename, e);
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * @return the ripper's anchor map (anchor text to rectangle); this is the
     *         live map, not a copy
     */
    public HashMap<String, Rectangle> getAnchors() {
        return ripper.anchors;
    }

    /**
     * @param anchorText
     *   The anchor text to look up
     *
     * @return the rectangle recorded for that text, or <code>null</code> if
     *         none was recorded
     */
    public Rectangle getAnchorRect(String anchorText) {
        return ripper.anchors.get(anchorText);
    }
}

Application.java

import java.awt.Color; 
import java.awt.Graphics2D; 
import java.awt.Rectangle; 
import java.awt.image.BufferedImage; 
import java.io.File; 
import java.util.Map.Entry; 

import javax.imageio.ImageIO; 

import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.pdmodel.PDPage; 

public class Application {

    /**
     * Renders the first page of <code>test.pdf</code> to an image, draws a red
     * rectangle for every anchor reported by {@link AnchorTextLocatorService},
     * prints each anchor to standard output and writes the result to
     * <code>test.png</code>.
     *
     * @param args
     *   The command line arguments (unused).
     *
     * @throws Exception
     *    If there is an error loading, rendering or scanning the document,
     *    or writing the image.
     */
    public static void main(String[] args) throws Exception {
        PDDocument document = PDDocument.load("test.pdf");
        try {
            if (document.isEncrypted()) {
                document.decrypt("");
            }

            PDPage page = (PDPage) document.getDocumentCatalog().getAllPages().get(0);
            BufferedImage bi = page.convertToImage();

            AnchorTextLocatorService ats = new AnchorTextLocatorService("test.pdf");

            // One graphics context for all rectangles, released when done.
            Graphics2D g = bi.createGraphics();
            try {
                g.setColor(Color.RED);
                for (Entry<String, Rectangle> anchor : ats.getAnchors().entrySet()) {
                    System.out.println(anchor.getKey() + " => " + anchor.getValue());

                    Rectangle box = anchor.getValue();
                    g.drawRect(box.x, box.y, box.width, box.height);
                }
            } finally {
                g.dispose();
            }

            ImageIO.write(bi, "png", new File("test.png"));
        } finally {
            // The original never closed the document.
            document.close();
        }
    }
}

回答

相關問題