2017-07-18 161 views
0

我有一個名爲「example1.pdf」的pdf。我想逐行閱讀它,第一行是「Hello my name is jhon」。所以我想要一個名爲 line 的字符串來存放讀到的每一行。我嘗試使用 PDFTextStripper 和 PDFBox,但沒有找到辦法做到這一點。任何幫助都將不勝感激。如何逐行閱讀pdf?

+2

請把相關的代碼添加到你的問題中。你也應該知道PDF沒有固定的結構模式,所以創建一個通用的解決方案並不容易。 –

+0

您可能需要問如何從pdf文檔中提取所有字符串 – simar

+0

simar我能夠將pdf文檔中的所有字符串作爲單個字符串提取。我想逐行讀取它。 – jagga

回答

1
import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.text.PDFTextStripper; 
import org.apache.pdfbox.text.TextPosition; 

import java.io.ByteArrayOutputStream; 
import java.io.File; 
import java.io.IOException; 
import java.io.OutputStreamWriter; 
import java.io.Writer; 
import java.util.ArrayList; 
import java.util.List; 

/** 
* This is an example on how to extract text line by line from pdf document 
*/ 
/**
 * Example that collects the text of a PDF document as a list of strings
 * using Apache PDFBox.
 *
 * <p>The class extends {@link PDFTextStripper} and overrides
 * {@code writeString(String, List)} so that every string the stripper emits
 * is appended to {@link #lines} instead of being written to the output
 * writer.
 *
 * <p>The collected strings live in a static list shared by all instances, so
 * this example is intended for single-threaded, one-document-at-a-time use.
 */
public class GetLinesFromPDF extends PDFTextStripper {

    /** File that is processed when no path is given on the command line. */
    private static final String DEFAULT_FILE_NAME = "example1.pdf";

    /** Strings received by {@code writeString}, in the order they were emitted. */
    static List<String> lines = new ArrayList<String>();

    /**
     * Creates the stripper.
     *
     * @throws IOException declared so that a failure of the superclass
     *                     constructor can propagate to the caller.
     */
    public GetLinesFromPDF() throws IOException {
    }

    /**
     * Extracts the text of a PDF file and prints each collected string on its
     * own line of standard output.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to read.
     *             When absent, {@code example1.pdf} in the working directory
     *             is used.
     * @throws IOException If there is an error loading or parsing the document.
     */
    public static void main(String[] args) throws IOException {
        String fileName = (args != null && args.length > 0) ? args[0] : DEFAULT_FILE_NAME;

        // The list is static: drop anything left over from an earlier run in
        // the same JVM so that only this document's text is printed.
        lines.clear();

        PDDocument document = null;
        try {
            document = PDDocument.load(new File(fileName));

            PDFTextStripper stripper = new GetLinesFromPDF();
            stripper.setSortByPosition(true);
            // Page numbers in PDFTextStripper are 1-based, so the first page is 1.
            stripper.setStartPage(1);
            stripper.setEndPage(document.getNumberOfPages());

            // writeText() requires a Writer, but the text itself is captured
            // by the writeString() override below, so this sink is discarded.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            stripper.writeText(document, dummy);

            // print lines
            for (String line : lines) {
                System.out.println(line);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Overrides the default functionality of PDFTextStripper.writeString():
     * the string is stored in {@link #lines} rather than written out.
     *
     * <p>NOTE(review): depending on how the text is laid out in the PDF, the
     * stripper may call this method more than once per visual line (for
     * example once per detected word) - confirm against the target documents
     * before relying on a strict one-entry-per-line result.
     *
     * @param str           the text handed over by the stripper
     * @param textPositions the positions of the characters in {@code str};
     *                      not used by this example
     * @throws IOException never thrown here; declared by the overridden method
     */
    @Override
    protected void writeString(String str, List<TextPosition> textPositions) throws IOException {
        lines.add(str);
        // you may process the line here itself, as and when it is obtained
    }
}

參考 - extract text line by line from pdf