0
我解析PDF文件使用我PdfTable類提取表數據的不同實例。當我創建一個類實例,然後創建另一個類實例時,似乎第一個類實例file_1.cells被預先添加到第二個類實例file_2.cells。我無法弄清楚爲什麼會發生這種情況,因爲我不認爲我創建了類變量而只是實例變量。出於某種原因,來自set_cells的數據在另一個類實例被實例化時被持久化。發生什麼事?被添加到同一類
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextBoxHorizontal, LTTextLineHorizontal
from tabulate import tabulate
from utils import clean_string
from collections import namedtuple
class PdfTable(object):
def __init__(self, file_name):
self.file_name = file_name
self.table_headers = None
self.cells = None
self.rows = None
def process_file(self, file_name):
pages = []
with open(file_name, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(doc):
interpreter.process_page(page)
pages.append(device.get_result())
return pages
def set_table_headers(self, page_obj, table_headers={}):
values = ('NAME', 'VALUE', 'REFERENCE RANGE')
Header = namedtuple('Header', 'name, x0, y0')
for obj in page_obj:
if isinstance(obj, LTTextLineHorizontal):
text = clean_string(obj.get_text())
if text in values:
table_headers[text] = Header(text, obj.bbox[0], obj.bbox[1])
elif isinstance(obj, LTTextBoxHorizontal):
self.set_table_headers(obj, table_headers)
return table_headers
def set_cells(self, page, headers, cells=[]):
Cell = namedtuple("Cell", "col, text, x0, y0")
col = None
text = None
for obj in page:
if isinstance(obj, LTTextLineHorizontal):
obj_x0 = obj.bbox[0]
obj_y0 = obj.bbox[1]
if obj_y0 < headers['NAME'].y0 and (obj_x0 == headers['NAME'].x0 or
obj_x0 == headers['VALUE'].x0 or obj_x0 == headers['REFERENCE RANGE'].x0):
if obj_x0 == headers['NAME'].x0:
col = 'NAME'
elif obj_x0 == headers['VALUE'].x0:
col = 'VALUE'
elif obj_x0 == headers['REFERENCE RANGE'].x0:
col = 'REFERENCE RANGE'
text = clean_string(obj.get_text())
if text:
cells.append(Cell(col, text, obj_x0, obj_y0))
elif isinstance(obj, LTTextBoxHorizontal):
self.set_cells(obj, headers, cells)
return cells
def set_rows(self, cells):
Cell = namedtuple("Cell", "col, text, x0, y0")
Row = namedtuple('Row', 'test, value, ref_range, y0')
name_col = [cell for cell in cells if cell.col == 'NAME']
value_col = [cell for cell in cells if cell.col == 'VALUE']
ref_col = [cell for cell in cells if cell.col == 'REFERENCE RANGE']
# normalize val col with blank cells to match name col length
values_y0 = [cell.y0 for cell in value_col]
missing_val_cells = [cell.y0 for cell in name_col if cell.y0 not in values_y0]
value_col += [Cell('VALUE', None, None, y) for y in missing_val_cells]
rows = [Row(name_cell.text, value_cell.text, ref_cell.text, name_cell.y0)
for name_cell in name_col for value_cell in value_col
for ref_cell in ref_col
if name_cell.y0 == value_cell.y0 == ref_cell.y0]
return rows
def parse_pages(self):
pages = self.process_file(self.file_name)
self.set_metadata(pages[0])
for page in pages:
self.table_headers = self.set_table_headers(page)
self.cells = self.set_cells(page, self.table_headers)
self.rows = self.set_rows(self.cells)
if __name__ == "__main__":
file_1 = PdfTable("RawData/pdfs/3768958-2.pdf")
file_1.parse_pages()
print("file_1 cells")
print tabulate(file_1.cells, headers="keys", showindex="always")
file_2 = PdfTable("RawData/pdfs/3768959.pdf")
file_2.parse_pages()
print("\nfile_2 cells")
print tabulate(file_2.cells, headers="keys", showindex="always")
file_1.cells
col text x0 y0
-- --------------- --------------- -------- -------
0 NAME TP 42.8571 570.887
1 NAME RIN 42.8571 554.172
2 VALUE 13.5 221.716 570.887
3 VALUE 1.0 221.716 554.172
4 REFERENCE RANGE 11.8-14.2 (SEC) 412.555 570.887
5 REFERENCE RANGE 0.8-1.2 412.555 554.172
file_2.cells
col text x0 y0
-- --------------- -------------------- -------- -------
0 NAME TP 42.8571 570.887
1 NAME RIN 42.8571 554.172
2 VALUE 13.5 221.716 570.887
3 VALUE 1.0 221.716 554.172
4 REFERENCE RANGE 11.8-14.2 (SEC) 412.555 570.887
5 REFERENCE RANGE 0.8-1.2 412.555 554.172
6 NAME RSW 42.8571 570.887
7 NAME BCW 42.8571 554.172
8 VALUE 8.7 221.716 570.887
9 VALUE 25.6 221.716 554.172
10 REFERENCE RANGE 4.5-12.5 412.555 570.887
11 REFERENCE RANGE 14.0-30.0 412.555 554.172
預計file_2.cells
col text x0 y0
-- --------------- -------------------- -------- -------
0 NAME RSW 42.8571 570.887
1 NAME BCW 42.8571 554.172
2 VALUE 8.7 221.716 570.887
3 VALUE 25.6 221.716 554.172
4 REFERENCE RANGE 4.5.-12.5 412.555 570.887
5 REFERENCE RANGE 14.0-30.0 412.555 554.172
不僅是預先考慮file_2.cells file_1.cells但處理後file_2,file_1.cells從兩個情況下細胞的結合。
@Daniel羅斯曼,謝謝你找到答案。 – Mox