2015-06-07 43 views
0

我只想提取PDF文檔中被突出顯示(高亮)的文本。這段代碼在PC上可以正常運行,但在Android上使用時會失敗。PDFBox不能直接在Android上工作,所以我使用了Android移植版Birdbrain2/PdfBox-Android。問題是:PDFBox無法在Android上提取區域文本。

下面是可以在PC上正常運行的代碼:

import java.awt.geom.Rectangle2D; 
import java.io.File; 
import java.util.List; 

import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.pdmodel.PDPage; 
import org.apache.pdfbox.pdmodel.common.PDRectangle; 
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; 
import org.apache.pdfbox.util.PDFTextStripperByArea; 

/**
 * Extracts the text that lies under the annotations of a PDF document,
 * using the PDFBox 1.8 desktop API.
 */
public class ExtractHighlights {
    public static void main(String args[]) {
        System.out.println(extractHighlights("sample.pdf"));
    }

    /**
     * Collects the text covered by every annotation rectangle in the given PDF.
     *
     * <p>Extraction is best effort: if anything fails, the stack trace is
     * printed and whatever text was gathered up to that point is returned.
     * The document is always closed, including on failure.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted snippets, each followed by a single space; an
     *         empty string when nothing could be extracted
     */
    public static String extractHighlights(String fileName) {
        StringBuilder extractedText = new StringBuilder();
        PDDocument pddDocument = null;
        try {
            pddDocument = PDDocument.load(new File(fileName));
            List<?> allPages = pddDocument.getDocumentCatalog().getAllPages();
            for (Object pageObject : allPages) {
                appendPageHighlights((PDPage) pageObject, extractedText);
            }
        } catch (Exception ex) {
            // Best effort: keep the text gathered before the failure.
            ex.printStackTrace();
        } finally {
            // Close in finally so a failure mid-extraction does not leak the document.
            if (pddDocument != null) {
                try {
                    pddDocument.close();
                } catch (Exception closeFailure) {
                    closeFailure.printStackTrace();
                }
            }
        }
        return extractedText.toString();
    }

    /**
     * Appends the text under each annotation rectangle of one page to {@code out}.
     *
     * <p>All rectangles of the page are registered on a single stripper so the
     * page is processed once, instead of once per annotation.
     *
     * NOTE(review): every annotation on the page is processed, not only
     * highlight annotations - consider filtering on the annotation subtype.
     */
    private static void appendPageHighlights(PDPage page, StringBuilder out)
            throws java.io.IOException {
        List<PDAnnotation> annotations = page.getAnnotations();
        if (annotations == null || annotations.isEmpty()) {
            return;
        }

        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        stripper.setSortByPosition(true);

        // Annotation rectangles are measured from the bottom of the page while
        // the stripper region is measured from the top, so Y is flipped.
        // As in the original code, the flip is applied to unrotated pages only.
        boolean unrotated = page.findRotation() == 0;
        float pageHeight = unrotated ? page.findMediaBox().getHeight() : 0f;

        // registered[i] is true when annotation i contributed a region named "i".
        boolean[] registered = new boolean[annotations.size()];
        boolean anyRegistered = false;
        for (int i = 0; i < registered.length; i++) {
            PDRectangle rect = annotations.get(i).getRectangle();
            if (rect == null) {
                continue; // annotation without a rectangle: nothing to extract
            }
            float x = rect.getLowerLeftX();
            float y = rect.getUpperRightY();
            if (unrotated) {
                y = pageHeight - y;
            }
            stripper.addRegion(String.valueOf(i),
                    new Rectangle2D.Float(x, y, rect.getWidth(), rect.getHeight()));
            registered[i] = true;
            anyRegistered = true;
        }
        if (!anyRegistered) {
            return;
        }

        stripper.extractRegions(page);

        for (int i = 0; i < registered.length; i++) {
            if (!registered[i]) {
                continue;
            }
            String highlight = stripper.getTextForRegion(String.valueOf(i)).trim();
            if (highlight.length() == 0) {
                continue;
            }
            // The original drops the last two characters of every snippet; that is
            // kept, but clamped so a one-character snippet no longer throws.
            // NOTE(review): trim() has already removed any trailing line break, so
            // this cuts two real characters - confirm that is intended.
            int end = Math.max(0, highlight.length() - 2);
            out.append(highlight, 0, end).append(' ');
        }
    }
}

下面是在Android上行不通的代碼:

/**
 * Extracts, on the background thread, the text lying under every annotation
 * rectangle of the PDF whose path is {@code strings[0]}.
 *
 * <p>Extraction is best effort: on failure the error is logged and whatever
 * text was gathered up to that point is returned. The document is always
 * closed, including on failure. Progress is published once per page as
 * (page index, total pages).
 *
 * @param strings task arguments; only the first one, the PDF path, is used
 * @return the extracted snippets, each followed by a single space
 */
@Override
     protected String doInBackground(String... strings) {
         StringBuilder extractedText = new StringBuilder();
         PDDocument pddDocument = null;
         try {
             Log.i("ExtractHighlights", "Started");
             pddDocument = PDDocument.load(new File(strings[0]));
             PDPageTree allPages = pddDocument.getDocumentCatalog().getPages();
             int totalPages = allPages.getCount();
             int pageNumber = 0;
             for (PDPage page : allPages) {
                 publishProgress(pageNumber++, totalPages);
                 Log.i("ExtractHighlights", "Reading page");
                 appendPageHighlights(page, extractedText);
                 Log.i("ExtractHighlights", "Page done");
             }
         } catch (Exception ex) {
             // Best effort: keep the text gathered before the failure.
             Log.e("ExtractHighlights", "Highlight extraction failed", ex);
         } finally {
             // Close in finally so a failure mid-extraction does not leak the document.
             if (pddDocument != null) {
                 try {
                     pddDocument.close();
                     Log.i("ExtractHighlights", "Document closed");
                 } catch (Exception closeFailure) {
                     Log.e("ExtractHighlights", "Closing the document failed", closeFailure);
                 }
             }
         }

         return extractedText.toString();
     }

     /**
      * Appends the text under each annotation rectangle of one page to {@code out}.
      *
      * <p>All rectangles of the page are registered on a single stripper so the
      * page is processed once, instead of once per annotation.
      *
      * NOTE(review): every annotation on the page is processed, not only
      * highlight annotations - consider filtering on the annotation subtype.
      */
     private void appendPageHighlights(PDPage page, StringBuilder out)
             throws java.io.IOException {
         List<PDAnnotation> annotations = page.getAnnotations();
         if (annotations == null || annotations.isEmpty()) {
             return;
         }

         PDFTextStripperByArea stripper = new PDFTextStripperByArea();
         stripper.setSortByPosition(true);

         // Annotation rectangles are measured from the bottom of the page while
         // the stripper region is measured from the top, so Y is flipped. This
         // mirrors the desktop version, which flips on unrotated pages only.
         // NOTE(review): assumes this port exposes the PDFBox 2.0 accessors
         // getRotation()/getMediaBox() on PDPage - confirm against the library.
         boolean unrotated = page.getRotation() == 0;
         float pageHeight = unrotated ? page.getMediaBox().getHeight() : 0f;

         // registered[i] is true when annotation i contributed a region named "i".
         boolean[] registered = new boolean[annotations.size()];
         boolean anyRegistered = false;
         for (int i = 0; i < registered.length; i++) {
             Log.i("ExtractHighlights", "Annotation found");
             PDRectangle rect = annotations.get(i).getRectangle();
             if (rect == null) {
                 continue; // annotation without a rectangle: nothing to extract
             }
             float left = rect.getLowerLeftX();
             float top = rect.getUpperRightY();
             if (unrotated) {
                 top = pageHeight - top;
             }
             // RectF takes (left, top, right, bottom), not (x, y, width, height).
             RectF region = new RectF(left, top,
                     left + rect.getWidth(), top + rect.getHeight());
             stripper.addRegion(String.valueOf(i), region);
             registered[i] = true;
             anyRegistered = true;
         }
         if (!anyRegistered) {
             return;
         }

         Log.i("ExtractHighlights", "Extracting regions");
         stripper.extractRegions(page);

         for (int i = 0; i < registered.length; i++) {
             if (!registered[i]) {
                 continue;
             }
             String highlight = stripper.getTextForRegion(String.valueOf(i)).trim();
             Log.i("ExtractHighlights", highlight);
             if (highlight.length() == 0) {
                 continue;
             }
             // The original drops the last two characters of every snippet; that is
             // kept, but clamped so a one-character snippet no longer throws.
             // NOTE(review): trim() has already removed any trailing line break, so
             // this cuts two real characters - confirm that is intended.
             int end = Math.max(0, highlight.length() - 2);
             out.append(highlight, 0, end).append(' ');
         }
     }

此外,這段Android代碼的執行時間很長,以致Android認為程序已經崩潰。

我可以嘗試將整個PDF轉換爲文本,但是如何知道哪些文本突出顯示?

+0

Birdbrain2/PdfBox上有一個由RainHeart257在7個月前提出的問題:「#5 PDFTextStripperByArea無法提取文本(字體問題?)」。你的問題可能與它有關? – mkl

+0

是的,這是問題。你能推薦一些除PDFbox以外的其他API嗎? –

回答

1

正如@mkl指出的那樣,這是一個已知的錯誤。 所以我用this guy的PDFBox端口。有效!

+0

我很高興它爲你工作。但是這個項目一年沒有活動。因此,請關注Birdbrain2,並可能在幾個月後重試:-) –