2015-06-04 99 views
4

我正在開發一個程序,用來提取PDF文件中某個特定區域的文本,使用的是Java和iText庫(enter image description here)。現在,我已經可以使用下面的代碼,按照給定的區域座標提取數據:Java:如何使用iText從PDF文件中選擇區域提取文本?

import java.io.IOException; 
import com.itextpdf.text.Rectangle; 
import com.itextpdf.text.pdf.PdfReader; 
import com.itextpdf.text.pdf.parser.FilteredTextRenderListener; 
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy; 
import com.itextpdf.text.pdf.parser.PdfTextExtractor; 
import com.itextpdf.text.pdf.parser.RegionTextRenderFilter; 
import com.itextpdf.text.pdf.parser.RenderFilter; 
import com.itextpdf.text.pdf.parser.TextExtractionStrategy; 

/**
 * Prints the text found inside a rectangular region of every page of a PDF,
 * using iText's region filter.
 *
 * <p>Created by Malek Boubakri on 03/06/2015 at 15:45.
 */

public class ExtractPageContentArea {

    /**
     * Extracts the text inside the given region from each page of the PDF and
     * writes it to standard output, one page after another.
     *
     * @param x      x coordinate of the region's lower-left corner
     * @param y      y coordinate of the region's lower-left corner
     * @param width  width of the region
     * @param height height of the region
     * @param pdf    path of the PDF file, handed to {@code PdfReader}
     * @throws IOException if the reader or the text extractor reports an I/O failure
     */
    public void parsePdf(float x, float y, float width, float height, String pdf) throws IOException {
        PdfReader reader = new PdfReader(pdf);
        try {
            // iText's Rectangle is built from two corners (llx, lly, urx, ury),
            // not from an origin plus a size, so the size must be converted.
            Rectangle rect = new Rectangle(x, y, x + width, y + height);
            RenderFilter filter = new RegionTextRenderFilter(rect);
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                // A fresh strategy per page so text from earlier pages is not carried over.
                TextExtractionStrategy strategy =
                        new FilteredTextRenderListener(new LocationTextExtractionStrategy(), filter);
                System.out.println(PdfTextExtractor.getTextFromPage(reader, i, strategy));
            }
        } finally {
            // Release the reader even when extraction fails part-way through.
            reader.close();
        }
    }
}

而下面這段代碼可以繪製矩形,並保存所需的座標:

import java.awt.BorderLayout; 
import java.awt.Graphics; 
import java.awt.Rectangle; 
import java.awt.event.MouseEvent; 
import java.awt.event.MouseListener; 
import java.awt.event.MouseMotionListener; 
import java.util.ArrayList; 

import javax.swing.JFrame; 
import javax.swing.JLabel; 
import javax.swing.SwingConstants; 

/**
 * Demo frame in which the user drags out rectangles with the mouse.
 *
 * <p>Labels around the border show the latest mouse event, the start and end
 * points of the last drag, and the size of the rectangle being dragged.
 * Finished rectangles are remembered and redrawn on every paint.
 */
public class MouseTracker extends JFrame implements MouseListener, MouseMotionListener {

    private static final long serialVersionUID = 1L;
    private final JLabel mousePosition;
    // Corners of the rectangle currently being dragged (frame coordinates).
    int x1, x2, y1, y2;
    // Size of the rectangle currently being dragged; 0 when no drag is active.
    int w, h;
    private final JLabel recStart;
    private final JLabel recStop;
    private final JLabel cords;
    private final ArrayList<Rectangle> rectangles = new ArrayList<Rectangle>();
    // True until the first drag event after a press/release; suppresses the preview rectangle.
    private boolean isNewRect = true;

    /** Sets up the GUI and registers the mouse event handlers. */
    public MouseTracker() {
        super("Rectangle Drawer");

        this.mousePosition = new JLabel();
        this.mousePosition.setHorizontalAlignment(SwingConstants.CENTER);
        getContentPane().add(this.mousePosition, BorderLayout.CENTER);

        JLabel text1 = new JLabel();
        text1.setText("At the center the mouse pointer's coordinates will be displayed.");
        getContentPane().add(text1, BorderLayout.SOUTH);

        this.recStart = new JLabel();
        getContentPane().add(this.recStart, BorderLayout.WEST);

        this.recStop = new JLabel();
        getContentPane().add(this.recStop, BorderLayout.EAST);

        this.cords = new JLabel();
        getContentPane().add(this.cords, BorderLayout.NORTH);
        updateSizeLabel();

        addMouseListener(this); // listens for own mouse and
        addMouseMotionListener(this); // mouse-motion events

        setSize(800, 600);
        setVisible(true);
    }

    /**
     * Builds the rectangle spanned by two arbitrary corner points.
     *
     * @return a rectangle whose origin is the smaller x/y of the two points
     *         and whose width and height are never negative
     */
    static Rectangle rectangleFromPoints(final int x1, final int y1, final int x2, final int y2) {
        return new Rectangle(Math.min(x1, x2), Math.min(y1, y2), Math.abs(x1 - x2), Math.abs(y1 - y2));
    }

    /** Returns the rectangle spanned by the current drag corners. */
    private Rectangle getRectangleFromPoints() {
        return rectangleFromPoints(this.x1, this.y1, this.x2, this.y2);
    }

    /** Shows the current {@code w}/{@code h} values in the top label. */
    private void updateSizeLabel() {
        this.cords.setText("w = " + this.w + ", h = " + this.h);
    }

    // MouseListener event handlers

    /** Handles the event fired when the mouse is released immediately after a press. */
    @Override
    public void mouseClicked(final MouseEvent event) {
        this.mousePosition.setText("Clicked at [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /** Records the press position as the first corner of a new rectangle. */
    @Override
    public void mousePressed(final MouseEvent event) {
        this.mousePosition.setText("Pressed at [" + (this.x1 = event.getX()) + ", " + (this.y1 = event.getY()) + "]");
        this.recStart.setText("Start: [" + this.x1 + ", " + this.y1 + "]");
        repaint();
    }

    /** Stores the finished rectangle and resets the drag state. */
    @Override
    public void mouseReleased(final MouseEvent event) {
        this.mousePosition.setText("Released at [" + (this.x2 = event.getX()) + ", " + (this.y2 = event.getY()) + "]");
        this.recStop.setText("End: [" + this.x2 + ", " + this.y2 + "]");

        this.rectangles.add(getRectangleFromPoints());

        this.w = this.h = this.x1 = this.y1 = this.x2 = this.y2 = 0;
        this.isNewRect = true;
        updateSizeLabel();

        repaint();
    }

    /** Reports the position at which the mouse entered the frame. */
    @Override
    public void mouseEntered(final MouseEvent event) {
        this.mousePosition.setText("Mouse entered at [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /** Reports that the mouse left the frame. */
    @Override
    public void mouseExited(final MouseEvent event) {
        this.mousePosition.setText("Mouse outside window");
        repaint();
    }

    // MouseMotionListener event handlers

    /** Tracks the second corner while the user drags with a button pressed. */
    @Override
    public void mouseDragged(final MouseEvent event) {
        this.mousePosition.setText("Dragged at [" + (this.x2 = event.getX()) + ", " + (this.y2 = event.getY()) + "]");

        this.isNewRect = false;

        // Keep the displayed size in step with the rectangle being dragged;
        // previously w and h were never assigned, so the label always read 0.
        final Rectangle current = getRectangleFromPoints();
        this.w = current.width;
        this.h = current.height;
        updateSizeLabel();

        repaint();
    }

    /** Reports the pointer position while the mouse moves without a button pressed. */
    @Override
    public void mouseMoved(final MouseEvent event) {
        this.mousePosition.setText("Moved at [" + event.getX() + ", " + event.getY() + "]");
        repaint();
    }

    /**
     * Paints the frame, the corner captions, the rectangle being dragged (if
     * any) and all rectangles finished so far. Only draws; component state is
     * updated in the event handlers.
     */
    @Override
    public void paint(final Graphics g) {
        super.paint(g); // clear the frame surface
        g.drawString("Start Rec Here", this.x1, this.y1);
        g.drawString("End Rec Here", this.x2, this.y2);

        if (!this.isNewRect) {
            final Rectangle newRectangle = getRectangleFromPoints();
            g.drawRect(newRectangle.x, newRectangle.y, newRectangle.width, newRectangle.height);
        }

        for (Rectangle rectangle : this.rectangles) {
            g.drawRect(rectangle.x, rectangle.y, rectangle.width, rectangle.height);
        }
    }

    /** Creates and shows the frame on the event dispatch thread. */
    public static void main(final String args[]) {
        java.awt.EventQueue.invokeLater(new Runnable() {
            @Override
            public void run() {
                MouseTracker application = new MouseTracker();
                application.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
            }
        });
    }

}

我想要使用這些座標來指定PDF文件中的區域,但我真的不知道如何合併這兩個功能、如何將繪圖區域放在文檔上方,以及如何讓矩形座標與文本座標相互對應。

  • 如何在另一個面板的上方進行繪製?

  • 我應該將PDF轉換爲圖像並將其放在後面去做?

  • 如果應該這樣做,有沒有人可以推薦一個好用且免費的OCR庫?

如果有什麼模糊只是評論! 任何人都可以把我放在路上!因爲我真的迷路了。

等待你help..and感謝(對不起我的英文不好

回答

2

你有一個非常有趣的問題和挑戰性的項目。這個「答案」可能會提供一些有用的想法,但它不是一個完整的解決方案。您可以使用所謂的glass pane來繪製其他組件的頂部。

我認爲您需要決定的最重要的是哪些庫最適合您的項目。 iText library非常好,並提供各種pdf功能,如您在問題中顯示的文本提取。

但據我所知,在iText中不支持pdf查看。您可以使用像ICEpdf這樣的庫(請參閱this example)。如果ICEpdf可以支持文本提取,那將是非常好的,所以您可以使用一個庫而不是使用ICEpdf與iText或OCR一起工作(以及處理諸如在ICEpdf中縮放PDF以及在獲取文本時對其進行補償等問題)。

我不知道是否能用ICEpdf提取文字,所以下面的示例代碼目前仍然使用iText來完成文本提取:

// File ExtractSelectionFromPdf.java 

import com.itextpdf.text.Rectangle; 
import com.itextpdf.text.pdf.PdfReader; 
import com.itextpdf.text.pdf.parser.*; 

import java.awt.Container; 
import java.awt.Point; 
import java.io.IOException; 
import javax.swing.*; 

/**
 * Swing entry point that shows a PDF in a {@code PdfViewer}, lets the user
 * drag a selection on a glass pane above it, and prints the text iText
 * extracts for that selection from the currently displayed page.
 */
public class ExtractSelectionFromPdf {
    private static String filePath = "[file path to a pdf file]";

    private PdfViewer pdfViewer;

    /** Starts the application on the event dispatch thread. */
    public static void main(final String[] arguments) {
        SwingUtilities.invokeLater(() -> new ExtractSelectionFromPdf().launchGUI());
    }

    /** Builds the frame, opens the document and installs the selection glass pane. */
    private void launchGUI() {
        final JFrame frame = new JFrame("Extract selected text from a pdf");
        frame.setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        final Container contentPane = frame.getContentPane();

        pdfViewer = new PdfViewer();
        contentPane.add(pdfViewer);

        pdfViewer.openDocument(filePath);

        final CustomGlassPane customGlassPane = new CustomGlassPane(this, contentPane);
        frame.setGlassPane(customGlassPane);
        customGlassPane.setVisible(true);

        frame.setBounds(60, 10, 1800, 1000);
        frame.setVisible(true);
    }

    /**
     * Extracts and prints the text for the selection spanned by two points.
     *
     * <p>The points may be given in any order; the selection is normalised so
     * that a drag towards the top or left works too. Nothing happens when
     * either point is {@code null}.
     *
     * @param topLeft     first corner of the selection
     * @param bottomRight opposite corner of the selection
     */
    public void handleSelection(final Point topLeft, final Point bottomRight) {
        if (topLeft == null || bottomRight == null) {
            return;
        }

        final int x = Math.min(topLeft.x, bottomRight.x);
        final int y = Math.min(topLeft.y, bottomRight.y);
        final int width = Math.abs(bottomRight.x - topLeft.x);
        // The height is a difference of y values (it was computed against topLeft.x before).
        final int height = Math.abs(bottomRight.y - topLeft.y);
        final String text = parsePdf(x, y, width, height, filePath);
        System.out.println("text: " + text);
    }

    /**
     * Extracts the text inside the given region from the page currently shown
     * in the viewer.
     *
     * <p>NOTE(review): the region is passed to iText unchanged. The caller in
     * this class supplies glass-pane pixel coordinates; these presumably still
     * need to be mapped to PDF page coordinates (viewer offset, zoom, and the
     * direction of the y axis) - confirm against the viewer before relying on
     * the result.
     *
     * @param x           x coordinate of the region's origin
     * @param y           y coordinate of the region's origin
     * @param width       width of the region
     * @param height      height of the region
     * @param pdfFilePath path of the PDF file to read
     * @return the extracted text, or {@code null} when reading the file failed
     */
    public String parsePdf(final int x, final int y, final int width, final int height,
                           final String pdfFilePath) {
        String text = null;
        PdfReader pdfReader = null;

        try {
            pdfReader = new PdfReader(pdfFilePath);
            // The viewer's page number is incremented to get the page number passed to iText.
            final int pageNumber = pdfViewer.getCurrentPageNumber() + 1;
            System.out.println("Page number: " + pageNumber);
            // iText's Rectangle takes two corners (llx, lly, urx, ury), not origin + size.
            final Rectangle selection = new Rectangle(x, y, x + width, y + height);
            final RenderFilter renderFilter = new RegionTextRenderFilter(selection);
            final LocationTextExtractionStrategy delegate
                    = new LocationTextExtractionStrategy();
            final TextExtractionStrategy extractionStrategy
                    = new FilteredTextRenderListener(delegate, renderFilter);
            text = PdfTextExtractor.getTextFromPage(pdfReader, pageNumber,
                                                    extractionStrategy);
        } catch (final IOException e) {
            e.printStackTrace();
        } finally {
            // Close the reader on the failure path as well.
            if (pdfReader != null) {
                pdfReader.close();
            }
        }

        return text;
    }
}


// File PdfViewer.java 

import java.util.ResourceBundle; 
import javax.swing.*; 
import org.icepdf.ri.common.*; 
import org.icepdf.ri.common.views.DocumentViewController; 
import org.icepdf.ri.util.PropertiesManager; 

/**
 * Panel that embeds an ICEpdf viewer inside a scroll pane and exposes the two
 * operations the rest of the application needs: opening a document and
 * querying the current page number.
 */
public class PdfViewer extends JPanel {
    private final SwingController controller;

    /** Builds the ICEpdf viewer panel and adds it, wrapped in a scroll pane, to this panel. */
    public PdfViewer() {
        controller = new SwingController();
        controller.setIsEmbeddedComponent(true);

        final String bundleName = PropertiesManager.DEFAULT_MESSAGE_BUNDLE;
        final ResourceBundle messageBundle = ResourceBundle.getBundle(bundleName);
        // Fully qualified: java.util.Properties is not among this file's imports.
        final java.util.Properties systemProperties = System.getProperties();
        final PropertiesManager properties = new PropertiesManager(systemProperties,
                                                                   messageBundle);

        // NOTE(review): presumably fixes the initial zoom at 100% so that screen
        // and page coordinates line up - confirm against the ICEpdf documentation.
        properties.set(PropertiesManager.PROPERTY_DEFAULT_ZOOM_LEVEL, "1");

        final SwingViewBuilder factory = new SwingViewBuilder(controller, properties);

        final DocumentViewController viewController
                = controller.getDocumentViewController();
        viewController.setAnnotationCallback(new MyAnnotationCallback(viewController));

        // Both scroll bars are always shown.
        final JScrollPane scrollPane = new JScrollPane(factory.buildViewerPanel());
        final int horizontalPolicy = ScrollPaneConstants.HORIZONTAL_SCROLLBAR_ALWAYS;
        final int verticalPolicy = ScrollPaneConstants.VERTICAL_SCROLLBAR_ALWAYS;
        scrollPane.setHorizontalScrollBarPolicy(horizontalPolicy);
        scrollPane.setVerticalScrollBarPolicy(verticalPolicy);
        add(scrollPane);
    }

    /**
     * Opens the PDF at the given path in the embedded viewer.
     *
     * @param filePath path of the PDF file to display
     */
    public void openDocument(final String filePath) {
        controller.openDocument(filePath);
    }

    /**
     * Returns the page number reported by the ICEpdf controller.
     *
     * @return the controller's current page number
     */
    public int getCurrentPageNumber() {
        return controller.getCurrentPageNumber();
    }
}


// File CustomGlassPane.java 

import java.awt.*; 
import javax.swing.JComponent; 

/**
 * Glass pane that listens for mouse input and draws the outline of the
 * selection the user is currently dragging.
 */
public class CustomGlassPane extends JComponent {
    // Corners of the selection to outline; both null when nothing is selected.
    private Point topLeftPoint;
    private Point bottomRightPoint;

    /**
     * Creates the glass pane and registers a {@code MouseEventsListener} on it
     * for both mouse and mouse-motion events.
     *
     * @param extractSelectionFromPdf application object handed to the listener
     * @param contentPane             content pane handed to the listener
     */
    public CustomGlassPane(final ExtractSelectionFromPdf extractSelectionFromPdf,
                           final Container contentPane) {
        final MouseEventsListener listener
                = new MouseEventsListener(extractSelectionFromPdf, this, contentPane);
        addMouseListener(listener);
        addMouseMotionListener(listener);
    }

    /**
     * Sets the two corners of the selection to outline on the next paint.
     *
     * @param topLeftPoint     first corner, or {@code null} to clear the selection
     * @param bottomRightPoint opposite corner, or {@code null} to clear the selection
     */
    public void setSelection(final Point topLeftPoint, final Point bottomRightPoint) {
        this.topLeftPoint = topLeftPoint;
        this.bottomRightPoint = bottomRightPoint;
    }

    /** Draws the selection outline in black when both corners are set. */
    @Override
    protected void paintComponent(final Graphics graphics) {
        if (topLeftPoint == null || bottomRightPoint == null) {
            return;
        }

        // Normalise the corners: a drag towards the top or left would otherwise
        // hand drawRect a negative width or height and no outline would appear.
        final int x = Math.min(topLeftPoint.x, bottomRightPoint.x);
        final int y = Math.min(topLeftPoint.y, bottomRightPoint.y);
        final int width = Math.abs(bottomRightPoint.x - topLeftPoint.x);
        final int height = Math.abs(bottomRightPoint.y - topLeftPoint.y);

        graphics.setColor(Color.BLACK);
        graphics.drawRect(x, y, width, height);
    }
}


// File MouseEventsListener.java 

import java.awt.*; 
import java.awt.event.MouseEvent; 
import javax.swing.SwingUtilities; 
import javax.swing.event.MouseInputAdapter; 

/**
 * Mouse listener installed on the glass pane. It tracks the two corners of the
 * user's drag, forwards every mouse event to the component lying underneath
 * the glass pane, and triggers text extraction once the drag is released.
 */
public class MouseEventsListener extends MouseInputAdapter {
    private final ExtractSelectionFromPdf extractSelectionFromPdf;
    private final CustomGlassPane customGlassPane;
    private final Container contentPane;
    // Corners of the drag in progress; both reset to null after an extraction.
    private Point topLeftPoint;
    private Point bottomRightPoint;

    /**
     * @param extractSelectionFromPdf receives the finished selection
     * @param customGlassPane         glass pane the events originate from and that is repainted
     * @param contentPane             container searched for the component to forward events to
     */
    public MouseEventsListener(final ExtractSelectionFromPdf extractSelectionFromPdf,
                               final CustomGlassPane customGlassPane,
                               final Container contentPane) {
        this.extractSelectionFromPdf = extractSelectionFromPdf;
        this.customGlassPane = customGlassPane;
        this.contentPane = contentPane;
    }

    /** Remembers the press position as the first corner, then forwards the event. */
    @Override
    public void mousePressed(final MouseEvent mouseEvent) {
        topLeftPoint = mouseEvent.getPoint();
        redispatchMouseEvent(mouseEvent);
    }

    /** Updates the second corner and repaints the outline if a press was seen. */
    @Override
    public void mouseDragged(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();
        final boolean dragStarted = topLeftPoint != null;
        redispatchMouseEvent(mouseEvent, dragStarted, false);
    }

    /** Fixes the second corner, forwards the event and extracts the selection. */
    @Override
    public void mouseReleased(final MouseEvent mouseEvent) {
        bottomRightPoint = mouseEvent.getPoint();
        redispatchMouseEvent(mouseEvent, true, true);
    }

    @Override
    public void mouseMoved(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseClicked(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseEntered(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    @Override
    public void mouseExited(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent);
    }

    /** Forwards the event without touching the selection or the glass pane. */
    private void redispatchMouseEvent(final MouseEvent mouseEvent) {
        redispatchMouseEvent(mouseEvent, false, false);
    }

    /**
     * Forwards the event to the underlying component, then optionally extracts
     * the selection and refreshes the glass pane.
     *
     * @param repaint whether to push the current selection to the glass pane and repaint it
     * @param extract whether to hand the selection over for extraction first
     *                (only honoured when {@code repaint} is true)
     */
    private void redispatchMouseEvent(final MouseEvent mouseEvent,
                                      final boolean repaint,
                                      final boolean extract) {
        forwardToUnderlyingComponent(mouseEvent);

        if (!repaint) {
            return;
        }

        if (extract) {
            extractSelectionFromPdf.handleSelection(topLeftPoint, bottomRightPoint);

            // The selection is consumed; clearing it also removes the outline below.
            topLeftPoint = null;
            bottomRightPoint = null;
        }

        customGlassPane.setSelection(topLeftPoint, bottomRightPoint);
        customGlassPane.repaint();
    }

    /** Re-creates the event for the deepest content-pane component under the pointer. */
    private void forwardToUnderlyingComponent(final MouseEvent mouseEvent) {
        final Point onGlassPane = mouseEvent.getPoint();
        final Point onContentPane
                = SwingUtilities.convertPoint(customGlassPane, onGlassPane, contentPane);

        // Points above the content pane are not forwarded.
        if (onContentPane.y < 0) {
            return;
        }

        final Component target = SwingUtilities.getDeepestComponentAt(contentPane,
                                                                      onContentPane.x,
                                                                      onContentPane.y);
        if (target == null) {
            return;
        }

        final Point onTarget = SwingUtilities.convertPoint(customGlassPane, onGlassPane, target);

        // Forward events to the component under the glass pane.
        final MouseEvent forwarded = new MouseEvent(target,
                                                    mouseEvent.getID(),
                                                    mouseEvent.getWhen(),
                                                    mouseEvent.getModifiers(),
                                                    onTarget.x,
                                                    onTarget.y,
                                                    mouseEvent.getClickCount(),
                                                    mouseEvent.isPopupTrigger());
        target.dispatchEvent(forwarded);
    }
}

上面代碼中的玻璃面板(glass pane)部分是受GlassPaneDemo example啓發而寫的。

上面的代碼中已知的遺留問題:

  • 由於某種原因,必須先點擊一次PDF閱讀器的向下滾動按鈕,向上/向下翻頁鍵和向上/向下箭頭鍵才會起作用。
  • 目前實際被提取出來的文本似乎位於選擇矩形的下方。
+0

感謝名單,@Freek德Bruijin我的工作就可以了,現在..我希望它的作品畢竟:(否則我有另一個想法!plzz圍繞:) –

+0

經過多次嘗試,我確認「玻璃面板」並不是適合我的應用程序的解決方案,因爲它不能只覆蓋在特定的區域或組件上方,而只能用於根窗格。我現在正在嘗試另一個解決方案,而且一切進展順利。再次感謝 @Freek de Bruijn –

2

經過多次嘗試,我可以毫無疑問地確認,「玻璃面板」並不適合作爲我的應用程序或任何類似應用程序的解決方案。這是因爲:

  • 不能高於特定區域或部件。

  • 它可用只有根窗格 ..

  • 讀取PDF文件的最好辦法是先將其轉換爲圖像,然後在ImagePane中顯示。

  • 在這種情況下不需要OCR ..

我現在正在開發另一個解決方案,一切都進展順利。 如果有人也在做類似的項目,請留言、標記或爲這個問題投票,我隨時樂意提供任何細節。

+2

我也有同樣的情況,所以這將是有益的,如果你可以分享其他工作完美的解決方案 –

+1

很高興爲您服務,請註明您有問題的部分,再或者甚至更好,你評我可以發佈問題並將鏈接傳遞給我。 شكرا.. –

相關問題