2016-11-12 28 views
0

我解析PDF文件使用我PdfTable類提取表數據的不同實例。當我創建一個類實例,然後創建另一個類實例時,似乎第一個類實例file_1.cells被預先添加到第二個類實例file_2.cells。我無法弄清楚爲什麼會發生這種情況,因爲我不認爲我創建了類變量而只是實例變量。出於某種原因,來自set_cells的數據在另一個類實例被實例化時被持久化。發生什麼事?被添加到同一類

from pdfminer.pdfdocument import PDFDocument 
from pdfminer.pdfpage import PDFPage 
from pdfminer.pdfparser import PDFParser 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 
from pdfminer.converter import PDFPageAggregator 
from pdfminer.layout import LAParams, LTTextBox, LTTextBoxHorizontal, LTTextLineHorizontal 
from tabulate import tabulate 
from utils import clean_string 
from collections import namedtuple 


class PdfTable(object): 

    def __init__(self, file_name): 
     self.file_name = file_name 
     self.table_headers = None 
     self.cells = None 
     self.rows = None 

    def process_file(self, file_name): 
     pages = [] 
     with open(file_name, 'rb') as fp: 
      parser = PDFParser(fp) 
      doc = PDFDocument(parser) 
      rsrcmgr = PDFResourceManager() 
      laparams = LAParams() 
      device = PDFPageAggregator(rsrcmgr, laparams=laparams) 
      interpreter = PDFPageInterpreter(rsrcmgr, device) 

      for page in PDFPage.create_pages(doc): 
       interpreter.process_page(page) 
       pages.append(device.get_result()) 

     return pages 

    def set_table_headers(self, page_obj, table_headers={}): 
     values = ('NAME', 'VALUE', 'REFERENCE RANGE') 
     Header = namedtuple('Header', 'name, x0, y0') 

     for obj in page_obj: 
      if isinstance(obj, LTTextLineHorizontal): 
       text = clean_string(obj.get_text()) 
       if text in values: 
        table_headers[text] = Header(text, obj.bbox[0], obj.bbox[1]) 
      elif isinstance(obj, LTTextBoxHorizontal): 
       self.set_table_headers(obj, table_headers) 

     return table_headers 

    def set_cells(self, page, headers, cells=[]): 
     Cell = namedtuple("Cell", "col, text, x0, y0") 
     col = None 
     text = None 

     for obj in page: 
      if isinstance(obj, LTTextLineHorizontal): 
       obj_x0 = obj.bbox[0] 
       obj_y0 = obj.bbox[1] 

       if obj_y0 < headers['NAME'].y0 and (obj_x0 == headers['NAME'].x0 or 
        obj_x0 == headers['VALUE'].x0 or obj_x0 == headers['REFERENCE RANGE'].x0): 

        if obj_x0 == headers['NAME'].x0: 
         col = 'NAME' 
        elif obj_x0 == headers['VALUE'].x0: 
         col = 'VALUE' 
        elif obj_x0 == headers['REFERENCE RANGE'].x0: 
         col = 'REFERENCE RANGE' 

        text = clean_string(obj.get_text()) 
        if text: 
         cells.append(Cell(col, text, obj_x0, obj_y0)) 

      elif isinstance(obj, LTTextBoxHorizontal): 
       self.set_cells(obj, headers, cells) 

     return cells 

    def set_rows(self, cells): 
     Cell = namedtuple("Cell", "col, text, x0, y0") 
     Row = namedtuple('Row', 'test, value, ref_range, y0') 

     name_col = [cell for cell in cells if cell.col == 'NAME'] 
     value_col = [cell for cell in cells if cell.col == 'VALUE'] 
     ref_col = [cell for cell in cells if cell.col == 'REFERENCE RANGE'] 

     # normalize val col with blank cells to match name col length 
     values_y0 = [cell.y0 for cell in value_col] 
     missing_val_cells = [cell.y0 for cell in name_col if cell.y0 not in values_y0] 
     value_col += [Cell('VALUE', None, None, y) for y in missing_val_cells] 

     rows = [Row(name_cell.text, value_cell.text, ref_cell.text, name_cell.y0) 
       for name_cell in name_col for value_cell in value_col 
       for ref_cell in ref_col 
       if name_cell.y0 == value_cell.y0 == ref_cell.y0] 

     return rows 

    def parse_pages(self): 
     pages = self.process_file(self.file_name) 
     self.set_metadata(pages[0]) 

     for page in pages: 
      self.table_headers = self.set_table_headers(page) 
      self.cells = self.set_cells(page, self.table_headers) 

     self.rows = self.set_rows(self.cells) 


if __name__ == "__main__": 
    file_1 = PdfTable("RawData/pdfs/3768958-2.pdf") 
    file_1.parse_pages() 

    print("file_1 cells") 
    print tabulate(file_1.cells, headers="keys", showindex="always") 

    file_2 = PdfTable("RawData/pdfs/3768959.pdf") 
    file_2.parse_pages() 

    print("\nfile_2 cells") 
    print tabulate(file_2.cells, headers="keys", showindex="always") 

file_1.cells

col    text     x0  y0 
-- --------------- --------------- -------- ------- 
0 NAME    TP    42.8571 570.887 
1 NAME    RIN    42.8571 554.172 
2 VALUE   13.5    221.716 570.887 
3 VALUE   1.0    221.716 554.172 
4 REFERENCE RANGE 11.8-14.2 (SEC) 412.555 570.887 
5 REFERENCE RANGE 0.8-1.2   412.555 554.172 

file_2.cells

col    text      x0  y0 
-- --------------- -------------------- -------- ------- 
0 NAME    TP      42.8571 570.887 
1 NAME    RIN     42.8571 554.172 
2 VALUE   13.5     221.716 570.887 
3 VALUE   1.0     221.716 554.172 
4 REFERENCE RANGE 11.8-14.2 (SEC)  412.555 570.887 
5 REFERENCE RANGE 0.8-1.2    412.555 554.172 
6 NAME    RSW     42.8571 570.887 
7 NAME    BCW     42.8571 554.172 
8 VALUE   8.7     221.716 570.887 
9 VALUE   25.6     221.716 554.172 
10 REFERENCE RANGE 4.5-12.5    412.555 570.887 
11 REFERENCE RANGE 14.0-30.0    412.555 554.172 

預計file_2.cells

col    text      x0  y0 
-- --------------- -------------------- -------- ------- 
0 NAME    RSW     42.8571 570.887 
1 NAME    BCW     42.8571 554.172 
2 VALUE   8.7     221.716 570.887 
3 VALUE   25.6     221.716 554.172 
4 REFERENCE RANGE 4.5.-12.5    412.555 570.887 
5 REFERENCE RANGE 14.0-30.0    412.555 554.172 

不僅是預先考慮file_2.cells file_1.cells但處理後file_2,file_1.cells從兩個情況下細胞的結合。

+0

@Daniel羅斯曼,謝謝你找到答案。 – Mox

回答

1

您有可變默認參數table_headers={}cells=[]這可能是問題,或者至少會導致其他問題。這些值在對方法的調用中共享,以便在其他位置反映一個調用中的更改。