2010-10-21 128 views
25

我正在嘗試使用Python來處理一些使用Adobe Acrobat Reader填寫並簽名的PDF表單。如何從Python中填寫的表單中提取PDF字段?

我已經試過:

  • pdfminer 演示:它沒有輸出任何已填寫的數據。
  • pyPdf:當我嘗試使用 PdfFileReader(f) 加載文件時,它把一個 CPU 核心佔滿了 2 分鐘,我只好放棄並終止了進程。
  • Jython 和 PDFBox:運作得很好,但啓動時間過長;如果這是我唯一的選擇,我寧願直接用 Java 編寫外部實用程序。

我可以繼續尋找函式庫並逐一嘗試,但我希望有人已經有一個有效的解決方案。


更新:根據史蒂芬的答案,我研究了 pdfminer,它很好地解決了問題。

from argparse import ArgumentParser 
import pickle 
import pprint 
from pdfminer.pdfparser import PDFParser, PDFDocument 
from pdfminer.pdftypes import resolve1, PDFObjRef 

def load_form(filename):
    """Read the AcroForm of a PDF and return its field contents.

    Opens ``filename`` in binary mode, links a ``PDFParser`` and a
    ``PDFDocument`` to each other, initializes the document and returns a
    list holding one ``load_fields`` result for every entry of the
    catalog's ``AcroForm`` ``Fields`` array.
    """
    with open(filename, 'rb') as pdf_stream:
        pdf_parser = PDFParser(pdf_stream)
        document = PDFDocument()
        # Link parser and document both ways, then initialize the document.
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize()
        acro_form = resolve1(document.catalog['AcroForm'])
        return [load_fields(resolve1(ref)) for ref in acro_form['Fields']]

def load_fields(field):
    """Return the content of a form field, descending into its kids.

    A field with a non-empty ``Kids`` entry yields a list with the result
    of this function for every resolved kid. Any other field yields a
    ``(name, value)`` tuple: ``T`` decoded as UTF-16 and ``V`` passed
    through ``resolve1``.
    """
    kids = field.get('Kids', None)
    if not kids:
        # Some field types, like signatures, need extra resolving
        name = field.get('T').decode('utf-16')
        value = resolve1(field.get('V'))
        return (name, value)
    return [load_fields(resolve1(kid)) for kid in kids]

def parse_cli():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``file`` (positional PDF path), ``out``
    (optional output path, ``None`` by default) and ``pickle``
    (flag, ``False`` by default).
    """
    cli = ArgumentParser(description='Dump the form contents of a PDF.')
    # Each entry: (positional names/flags, keyword settings).
    option_specs = (
        (('file',),
         dict(metavar='pdf_form', help='PDF Form to dump the contents of')),
        (('-o', '--out'),
         dict(help='Write output to file', default=None, metavar='FILE')),
        (('-p', '--pickle'),
         dict(action='store_true', default=False,
              help='Format output for python consumption')),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()

def main():
    """Dump the form contents of the PDF named on the command line.

    The parsed form is emitted pretty-printed by default or pickled with
    ``--pickle``; it goes to the ``--out`` file when one is given and to
    stdout otherwise.
    """
    args = parse_cli()
    form = load_form(args.file)
    if args.out:
        # Pickle data is binary on Python 3, so choose the mode to match.
        mode = 'wb' if args.pickle else 'w'
        with open(args.out, mode) as outfile:
            if args.pickle:
                pickle.dump(form, outfile)
            else:
                # Write to the handle opened above (the original wrote to
                # ``file``, which is not that handle).
                outfile.write(pprint.pformat(form, indent=2))
    elif args.pickle:
        # NOTE: on Python 3 ``pickle.dumps`` returns bytes, so this prints
        # their repr; on Python 2 it prints the pickle text as before.
        print(pickle.dumps(form))
    else:
        pprint.pprint(form, indent=2)


if __name__ == '__main__':
    main()
+0

作爲一個說明,我也嘗試使用pdftk作爲外部工具,它沒有超過所有者密碼。 – Olson 2010-10-21 03:09:47

回答

25

你應該能夠用 pdfminer 做到這一點,但這需要深入了解 pdfminer 的內部實現,並具備一些 PDF 格式的知識(當然是關於表單的,但也包括 PDF 格式的內部結構,如「字典」和「間接對象」)。

這個例子可以幫助你入門(我認爲它只適用於簡單的情況,不支持嵌套字段等):

import sys 
from pdfminer.pdfparser import PDFParser 
from pdfminer.pdfdocument import PDFDocument 
from pdfminer.pdftypes import resolve1 

# Path of the PDF form to inspect, taken from the first CLI argument.
filename = sys.argv[1]

# Keep the file open only while it is being read; the handle is closed
# when the with block ends (it was previously never closed).
with open(filename, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    # The catalog's AcroForm entry may be an indirect reference.
    fields = resolve1(doc.catalog['AcroForm'])['Fields']
    for i in fields:
        field = resolve1(i)
        # 'T' holds the field name and 'V' its value.
        name, value = field.get('T'), field.get('V')
        # Parenthesized single-argument print works on Python 2 and 3.
        print('{0}: {1}'.format(name, value))

編輯:忘了提:如果您需要提供密碼,請將其傳遞給 doc.initialize()(在已移除 initialize 的新版 pdfminer 中,則改爲作爲 PDFDocument 構造函數的第二個參數傳入)。

+0

這樣做,謝謝。我看到了網絡演示,並發現我可以看到我想要的內容,如果沒有,我可以跳過它。不僅可以按照我想要的方式完成,它甚至可以處理PdfBox無法處理的簽名字段。 – Olson 2010-10-22 02:25:14

+1

我有一個編碼問題。使用field.get('V')不會正確地編碼特殊字符,如'ü'或'ä'。有沒有人有解決這個問題?將字符串轉換爲unicode會引發解碼錯誤。 – Basil 2012-08-20 09:20:52

+2

在當前版本的pdfminer中,PDFDocument.initialize方法已被刪除。如果你只是刪除該行,這段代碼就可以工作。 – joshua 2014-11-05 22:07:24

3

一個快速而粗糙的 2 分鐘方案:只需用 pdfminer 將 PDF 轉換爲 XML,然後抓取所有字段。

from xml.etree import ElementTree 
from pprint import pprint 
import os 

def main():
    """Dump a PDF with ``dumppdf.py`` and print its form fields.

    Steps: run ``dumppdf.py -a FILE.pdf`` into ``out.xml``; copy it to
    ``output.xml`` with every ``&#`` replaced so the result can be parsed;
    walk the XML tree, remembering the text that follows each ``T`` key as
    an ID and storing the ``<string>`` text that follows a ``V`` key under
    that ID; finally pretty-print the collected mapping.
    """
    print("Calling PDFDUMP.py")
    os.system("dumppdf.py -a FILE.pdf > out.xml")

    # Preprocess the file to eliminate bad XML.
    print("Screening the file")
    # "w" truncates output.xml; both handles are closed by the with block.
    with open("output.xml", "w") as cleaned, open("out.xml") as raw:
        for line in raw:
            # Some bad data in the XML for formatting info.
            cleaned.write(line.replace("&#", "Invalid_XML"))

    print("Opening XML output")
    tree = ElementTree.parse('output.xml')
    lastnode = ""
    lastnode2 = ""
    fields = {}
    entry = {}

    for node in tree.iter():  # Run through the tree.
        # Check if new node
        if node.tag == "key" and node.text == "T":
            lastnode = node.tag + node.text
        elif lastnode == "keyT":
            # The text of the last element yielded by node.iter() wins.
            for child in node.iter():
                entry["ID"] = child.text
            lastnode = ""

        if node.tag == "key" and node.text == "V":
            lastnode2 = node.tag + node.text
        elif lastnode2 == "keyV":
            for child in node.iter():
                # Only store a value once an ID has been seen for it.
                if child.tag == "string" and "ID" in entry:
                    fields[entry["ID"]] = child.text
                    entry = {}
            lastnode2 = ""

    pprint(fields)


if __name__ == '__main__':
    main()

這並不美觀,只是一個簡單的概念證明。我需要爲我正在處理的系統實施它,所以我會將其清理乾淨,但是我認爲我會發布它以防萬一任何人發現它有用。

3

已針對最新版本的 pdfminer 更新(修改了 import,以及第一個函數中的解析器/文檔設置):

from argparse import ArgumentParser 
import pickle 
import pprint 
from pdfminer.pdfparser import PDFParser 
from pdfminer.pdfdocument import PDFDocument 
from pdfminer.pdftypes import resolve1 
from pdfminer.pdftypes import PDFObjRef 

def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Opens ``filename`` in binary mode, builds a ``PDFDocument`` from a
    ``PDFParser`` and returns one ``load_fields`` result for every entry
    of the catalog's ``AcroForm`` ``Fields`` array.
    """
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        # With the pdfminer.pdfdocument API the parser is handed to the
        # constructor; PDFDocument.initialize() has been removed there, so
        # it must not be called (nor is a separate set_document needed).
        doc = PDFDocument(parser)
        acro_form = resolve1(doc.catalog['AcroForm'])
        return [load_fields(resolve1(f)) for f in acro_form['Fields']]

def load_fields(field):
    """Collect a form field's content, recursing through ``Kids``.

    Returns a list of recursive results when the field has a non-empty
    ``Kids`` entry; otherwise a ``(name, value)`` tuple built from ``T``
    (decoded as UTF-8) and the resolved ``V``.
    """
    children = field.get('Kids', None)
    if children:
        return [load_fields(resolve1(child)) for child in children]
    # Some field types, like signatures, need extra resolving
    field_name = field.get('T').decode('utf-8')
    return (field_name, resolve1(field.get('V')))

def parse_cli():
    """Build the argument parser and return the parsed command line.

    Recognized arguments: the positional ``file``, the optional
    ``-o/--out`` path (default ``None``) and the ``-p/--pickle`` switch
    (default ``False``).
    """
    arg_parser = ArgumentParser(
        description='Dump the form contents of a PDF.')
    add = arg_parser.add_argument
    add('file',
        help='PDF Form to dump the contents of',
        metavar='pdf_form')
    add('-o', '--out',
        metavar='FILE',
        default=None,
        help='Write output to file')
    add('-p', '--pickle',
        help='Format output for python consumption',
        default=False,
        action='store_true')
    return arg_parser.parse_args()

def main():
    """Entry point: load the PDF form and emit it as requested.

    Output format is a pretty-printed structure, or a pickle when
    ``--pickle`` is set. The destination is the ``--out`` file if given,
    stdout otherwise.
    """
    args = parse_cli()
    form = load_form(args.file)

    if not args.out:
        if args.pickle:
            # NOTE: ``pickle.dumps`` returns bytes on Python 3, so their
            # repr is printed there; Python 2 prints the pickle text.
            print(pickle.dumps(form))
        else:
            pprint.PrettyPrinter(indent=2).pprint(form)
        return

    # Binary mode is required for pickle output on Python 3.
    with open(args.out, 'wb' if args.pickle else 'w') as outfile:
        if args.pickle:
            pickle.dump(form, outfile)
        else:
            # Must target ``outfile``; ``file`` is not the opened handle.
            outfile.write(pprint.PrettyPrinter(indent=2).pformat(form))


if __name__ == '__main__':
    main()
+0

您在哪裏放置文件名以便腳本可以運行? – user2067030 2016-12-22 15:31:16

0

這幾行中有一個錯字:

file.write(pp.pformat(form)) 

應該是:

outfile.write(pp.pformat(form)) 
3

Python 3.6+:

pip install PyPDF2

# -*- coding: utf-8 -*- 

from collections import OrderedDict 
from PyPDF2 import PdfFileWriter, PdfFileReader 


def _getFields(obj, tree=None, retval=None, fileobj=None): 
    """ 
    Extracts field data if this PDF contains interactive form fields. 
    The *tree* and *retval* parameters are for recursive use. 

    :param fileobj: A file object (usually a text file) to write 
     a report to on all interactive form fields found. 
    :return: A dictionary where each key is a field name, and each 
     value is a :class:`Field<PyPDF2.generic.Field>` object. By 
     default, the mapping name is used for keys. 
    :rtype: dict, or ``None`` if form data could not be located. 
    """ 
    fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name', 
         '/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'} 
    if retval is None: 
     retval = OrderedDict() 
     catalog = obj.trailer["/Root"] 
     # get the AcroForm tree 
     if "/AcroForm" in catalog: 
      tree = catalog["/AcroForm"] 
     else: 
      return None 
    if tree is None: 
     return retval 

    obj._checkKids(tree, retval, fileobj) 
    for attr in fieldAttributes: 
     if attr in tree: 
      # Tree is a field 
      obj._buildField(tree, retval, fileobj, fieldAttributes) 
      break 

    if "/Fields" in tree: 
     fields = tree["/Fields"] 
     for f in fields: 
      field = f.getObject() 
      obj._buildField(field, retval, fileobj, fieldAttributes) 

    return retval 


def get_form_fields(infile):
    """Return the filled-in values of a PDF form.

    :param infile: path of the PDF file to read.
    :return: ``OrderedDict`` mapping every key produced by ``_getFields``
        to that field's ``/V`` entry (``''`` when the field has none).
        The mapping is empty when the document has no ``/AcroForm``.
    """
    # NOTE(review): the handle is deliberately left open, as before --
    # PyPDF2 appears to resolve indirect objects lazily, so nested values
    # may still need the stream later; confirm before closing it here.
    reader = PdfFileReader(open(infile, 'rb'))
    fields = _getFields(reader)
    if fields is None:
        # _getFields returns None when the catalog has no /AcroForm;
        # report "no fields" instead of failing on ``None.items()``.
        return OrderedDict()
    return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())



if __name__ == '__main__':
    from pprint import pprint

    # Sample form whose field values are dumped when run as a script.
    sample_pdf = 'FormExample.pdf'
    form_values = get_form_fields(sample_pdf)
    pprint(form_values)
0

Python 的 PyPDF2 包(pyPdf 的繼任者)非常方便:

import PyPDF2 
# Create a reader for the form; the path string is passed straight to PdfFileReader.
f = PyPDF2.PdfFileReader('form.pdf') 
# Ask the reader for the form's fields; the result is kept in ``ff``.
ff = f.getFields() 

然後 ff 這個字典就包含了所有相關的表單信息。

相關問題