2010-10-26 197 views
4

我需要從pdf文件中提取文本(逐字)。從pdf文件中提取文本

import java.io.*; 

import com.itextpdf.text.*; 

import com.itextpdf.text.pdf.*; 

import com.itextpdf.text.pdf.parser.*; 

/**
 * Question code: copies every page of a source PDF into a new PDF file and
 * then tries to print the text of each page with iText's PdfTextExtractor.
 *
 * NOTE(review): this is the code that fails to compile; see the notes on the
 * extraction step at the end of main. The closing brace of the class itself
 * is also missing from the snippet as posted.
 */
public class pdf { 

    // Location handed to PdfReader as the input document (an HTTP URL here).
    private static String INPUTFILE = "http://ontology.buffalo.edu/ontology%28PIC%29.pdf" ; 

    // Path of the PDF that PdfWriter creates and that is re-opened afterwards.
    private static String OUTPUTFILE = "c:/new3.pdf"; 

    /**
     * Copies the pages of INPUTFILE into OUTPUTFILE, printing each page
     * number, and then attempts to print the text of every page.
     *
     * @param args command-line arguments (not used)
     * @throws DocumentException declared for the iText document calls
     * @throws IOException declared for the file and reader calls
     */
    public static void main(String[] args) throws DocumentException, 
      IOException { 

     Document document = new Document(); 

     // The writer streams everything added to the document into OUTPUTFILE.
     PdfWriter writer = PdfWriter.getInstance(document, 

     new FileOutputStream(OUTPUTFILE)); 

     document.open(); 

     PdfReader reader = new PdfReader(INPUTFILE); 

     int n = reader.getNumberOfPages(); 

     PdfImportedPage page; 

     // Go through all pages 

     for (int i = 1; i <= n; i++) { 

       page = writer.getImportedPage(reader, i); 

       System.out.println(i); 


       // Wrap the imported page as an Image so it can be added to the document.
       Image instance = Image.getInstance(page); 

       document.add(instance); 

     } 

     document.close(); 


     // Re-open the freshly written copy; note that readerN is never used below.
     PdfReader readerN = new PdfReader(OUTPUTFILE); 

     // NOTE(review): this is the statement the compiler rejects with
     // "the constructor PdfTextExtractor is undefined".
     PdfTextExtractor parse = new PdfTextExtractor(); 

// NOTE(review): the variable declared above is named "parse", but the call
// below uses "parser", which is not declared anywhere in this snippet; it
// also passes "reader" (the original input) rather than "readerN".
for (int i = 1; i <= n; i++) 

System.out.println(parser.getTextFromPage(reader,i)); 


} 

當我編譯這段代碼時,出現以下錯誤:

the constructor PdfTextExtractor is undefined

我該如何解決這個問題?

+1

如果您發現某個答案正確,請將其標記爲已採納的答案;如果得不到認可,人們往往會停止提供幫助。 – Woot4Moo 2010-10-26 18:17:24

回答

8

iText 的 PdfTextExtractor 只包含靜態方法,它的構造函數是私有的,因此不能用 new 創建實例。

你可以像這樣直接通過類名調用它:
String myLine = PDFTextExtractor.getTextFromPage(reader, pageNumber)

+0

摘自[這個編輯建議](http://stackoverflow.com/review/suggested-edits/3010522)(也可能不正確):「你的回答幫到了我,我只是想更正一下類名。它是『PdfTextExtractor』而不是『PDFTextExtractor』」 – gunr2171 2013-09-26 15:09:59

-1
// Try Apache PDF Box 
import java.io.FileInputStream;
import java.io.FilterInputStream;
import java.io.InputStream;

import org.apache.pdfbox.cos.COSDocument; 
import org.apache.pdfbox.pdfparser.PDFParser; 
import org.apache.pdfbox.pdmodel.PDDocument; 
import org.apache.pdfbox.util.PDFTextStripper; 

// Extracts the text of a whole PDF with Apache PDFBox (1.x API).
// NOTE: these statements belong inside a method; the close() calls in the
// finally block can throw IOException, so that method must declare or handle it.

// Path of the PDF file to read.
String filePath = "";
InputStream inputStream = null;
// Declared outside the try block so it can be closed in finally.
PDDocument pdDoc = null;
// Extracted text; stays null when extraction fails.
String statementPDF = null;

try
{
    inputStream = new FileInputStream(filePath);
    PDFParser parser = new PDFParser(inputStream);

    // Parse the stream and populate the COSDocument object.
    parser.parse();

    // Get the document that was parsed.
    COSDocument cosDoc = parser.getDocument();

    // Strips all of the text out of a PDF document, ignoring the formatting.
    PDFTextStripper pdfStripper = new PDFTextStripper();

    // In-memory representation of the PDF document.
    pdDoc = new PDDocument(cosDoc);
    pdfStripper.setStartPage(1);
    pdfStripper.setEndPage(pdDoc.getNumberOfPages());

    // Return the text of the whole document.
    statementPDF = pdfStripper.getText(pdDoc);
}
catch (Exception e)
{
    // Build a readable report: exception type, message, then the stack trace.
    StringBuilder errorMessage = new StringBuilder();
    errorMessage.append("\nUnexpected Exception: ").append(e.getClass())
                .append("\n").append(e.getMessage());
    for (StackTraceElement trace : e.getStackTrace())
    {
        errorMessage.append("\n\t").append(trace);
    }
    // Report the failure instead of silently discarding the message.
    System.err.println(errorMessage);
}
finally
{
    try
    {
        // Release the resources held by the parsed document.
        if (pdDoc != null)
        {
            pdDoc.close();
        }
    }
    finally
    {
        // Always close the input stream, even if closing the document failed.
        if (inputStream != null)
        {
            inputStream.close();
        }
    }
}
0

如果你想提取PDF文件中的所有文本,並將其保存到一個文本文件,可以使用下面的代碼。

使用pdfutil.jar庫。

import java.io.IOException; 
import java.io.PrintWriter; 

import com.testautomationguru.utility.PDFUtil; 

/**
 * Extracts all text from a PDF file with PDFUtil and saves it to a text file.
 */
public class PDFToText{

    /**
     * Reads the text of {@code C:\abc.pdf} and writes it to {@code C:\abc.txt}.
     * Any I/O failure is reported by printing the stack trace.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {

     String pdfFilePath = "C:\\abc.pdf";
     String textFilePath = "C:\\abc.txt";

     try {
      PDFUtil pdfUtil = new PDFUtil();
      String content = pdfUtil.getText(pdfFilePath);

      // try-with-resources closes the writer even if writing fails part-way.
      try (PrintWriter out = new PrintWriter(textFilePath)) {
       out.println(content);

       // PrintWriter never throws on write errors; checkError() flushes the
       // stream and reports whether one occurred, so surface it explicitly.
       if (out.checkError()) {
        throw new IOException("Failed to write text to " + textFilePath);
       }
      }

     } catch (IOException e) {

      e.printStackTrace();
     }
    }

}