2014-04-02 55 views
0

我試圖在不把文件從網站下載到本地的情況下解析這個 PDF。文件放在硬盤上時,我可以毫無問題地解析它,但改爲通過 URL 讀取後,腳本運行到下面這一段就會中止。(標題:使用 pdfminer 通過 URL 解析 PDF)

# Abort when the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

我認爲是我把網址整合進腳本的方式有誤。

import sys 
import getopt 
import urllib2 
import datetime 
import re 
from pdfminer.pdfparser import PDFParser 
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, PDFConverter, LTContainer, LTText, LTTextBox, LTImage 
from pdfminer.layout import LAParams 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf 
from urllib2 import Request 


# Define a PDF parser function 
def parsePDF(url):
    """Download the PDF at *url* and return the text extracted from it.

    The document is fetched into memory and parsed from there, so nothing
    is written to disk.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The contents of the in-memory buffer that ``TextConverter`` wrote
        to, encoded with the ``utf-8`` codec passed to it.

    Raises:
        PDFTextExtractionNotAllowed: If ``document.is_extractable`` is
            false for the downloaded document.
    """
    # Imported locally because the file's top-level imports do not provide
    # these names.
    from contextlib import closing
    from io import BytesIO
    from pdfminer.pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
    from pdfminer.pdfpage import PDFPage

    # Open the url provided as an argument to the function and read the
    # content; closing() releases the connection as soon as it is read.
    with closing(urllib2.urlopen(Request(url))) as response:
        pdf_bytes = response.read()

    # A PDF is binary data, so both buffers are byte buffers.
    with BytesIO(pdf_bytes) as memory_file, BytesIO() as retstr:
        # Create a PDF parser object associated with the in-memory file
        parser = PDFParser(memory_file)

        # Create a PDF document object that stores the document structure
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Define parameters to the PDF device object
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        codec = 'utf-8'

        # Create a PDF device object that writes the extracted text
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            # Create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page contained in the document
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            device.close()

        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()


# Construct the url
url = 'http://www.city.pittsburgh.pa.us/police/blotter/blotter_monday.pdf'

回答

0

以你自己的答案和這裏(here)提供的函數爲基礎,下面這個函數應該可以直接從 URL 指向的 PDF 返回一個字符串,而無需先把文件下載到硬盤:

import urllib2 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import TextConverter 
from pdfminer.layout import LAParams 
from pdfminer.pdfpage import PDFPage 
from cStringIO import StringIO 


def pdf_from_url_to_txt(url):
    """Fetch the PDF at *url* and return its text without saving it to disk.

    Args:
        url: Address of the PDF document to fetch.

    Returns:
        The contents of the in-memory buffer that ``TextConverter`` wrote
        to for all pages, encoded with the ``utf-8`` codec passed to it.

    Note:
        ``check_extractable=True`` is passed to ``PDFPage.get_pages``;
        presumably pdfminer then rejects documents that forbid text
        extraction -- confirm against the installed pdfminer version.
    """
    # Local imports keep this function self-contained with respect to the
    # file's existing import block.
    from contextlib import closing
    from io import BytesIO

    # Open the url provided as an argument to the function and read the
    # content; closing() releases the connection as soon as it is read.
    with closing(urllib2.urlopen(urllib2.Request(url))) as response:
        pdf_bytes = response.read()

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'

    # The downloaded PDF is bytes, so wrap it in a byte buffer; a text
    # buffer rejects bytes as its initial value.
    with BytesIO(pdf_bytes) as fp, BytesIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0 are the original call's values
            # (presumably "no restriction" -- confirm in pdfminer docs).
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        finally:
            # Close the device even when parsing fails part-way through.
            device.close()

        # Read the buffer before the ``with`` block closes it.
        return retstr.getvalue()
+0

錯誤:fp = StringIO(f) 這一行拋出 TypeError: initial_value must be str or None, not bytes(initial_value 必須是 str 或 None,而不是 bytes)–