2013-08-30 72 views
-1

我做了一個腳本,該腳本將映射我的目錄,給我統計一下...... 這裏的腳本:- Python腳本拋出內存錯誤

import os 
import hashlib 
import platform 
import sys 
import argparse 
import HTML 

class Map(object): 

    def __init__(self,param): 
     self.param_list = param 
     self.slash = self.slash_by_os() 
     self.result_list = [] 
     self.os = "" 


    def calc_md5(self,file_path): 
     with open(file_path) as file_to_check: 
      data = file_to_check.read()  
      md5_returned = hashlib.md5(data).hexdigest() 

     return md5_returned 

    def slash_by_os(self): 
     general_id = platform.system() 
     actual_os = "" 

     if general_id == "Darwin" or general_id == "darwin": 
      actual_os = "UNIX" 
     elif general_id == "Linux" or general_id == "linux": 
      actual_os = "UNIX" 
     elif general_id == "SunOS": 
      actual_os = "UNIX" 
     elif general_id == "Windows" or general_id == "windows": 
      actual_os = "WIN" 
     else: 
      actual_os = general_id 

     if actual_os == "UNIX": 
      return '/' 
     elif actual_os == "WIN": 
      return '\\' 
     else: 
      return '/' 

     self.os = actual_os 

    def what_to_do(self,new_dir): 
     act = [] 
     act.append(new_dir[:-1]) 
     for param in self.param_list: 
      if param == "md5": 
       x = self.calc_md5(new_dir[:-1]) 
       act.append(x) 
      elif param == "size": 
       x = os.stat(new_dir[:-1]).st_size 
       act.append(x) 
      elif param == "access": 
       x = os.stat(new_dir[:-1]).st_atime 
       act.append(x) 
      elif param == "modify": 
       x = os.stat(new_dir[:-1]).st_mtime 
       act.append(x) 
      elif param == "creation": 
        x = os.stat(new_dir[:-1]).st_ctime 
        act.append(x) 

     return act 

    def list_of_files(self ,dir_name ,traversed = [], results = []): 

     dirs = os.listdir(dir_name) 
     if dirs: 
      for f in dirs: 
       new_dir = dir_name + f + self.slash 
       if os.path.isdir(new_dir) and new_dir not in traversed: 
        traversed.append(new_dir) 
        self.list_of_files(new_dir, traversed, results) 
       else: 
        act = self.what_to_do(new_dir) 
        results.append(act) 
     self.result_list = results 
     return results 


def parse_args(): 
    desc = "Welcom To dirmap.py 1.0" 
    parser = argparse.ArgumentParser(description=desc) 
    parser.add_argument('-p','--path', help='Path To Original Directory', required=True) 
    parser.add_argument('-md','--md5', action = 'store_true',help='Show md5 hash of file', required=False) 
    parser.add_argument('-s','--size', action = 'store_true', help='Show size of file', required=False) 
    parser.add_argument('-a','--access', action = 'store_true', help='Show access time of file', required=False) 
    parser.add_argument('-m','--modify', action = 'store_true', help='Show modification time of file', required=False) 
    parser.add_argument('-c','--creation', action = 'store_true', help='Show creation of file', required=False) 

    args = vars(parser.parse_args()) 

    params = [] 
    for key,value in args.iteritems(): 
     if value == True: 
      params.append(key) 

    return args,params 



def main(): 
    args , params = parse_args() 
    dir_path = args['path'] 
    map = Map(params) 
    dir_list = map.list_of_files(dir_path) 

    params.insert(0,"path") 


    htmlcode_dir = HTML.table(dir_list,header_row=params) 
    print htmlcode_dir 

main() 

當我嘗試在中型到大型目錄運行它拋出我MemoryError異常...... ,你可以在這裏看到:

python(2374) malloc: *** mmap(size=140514183884800) failed (error code=12) 
*** error: can't allocate region 
*** set a breakpoint in malloc_error_break to debug 
Traceback (most recent call last): 
    File "dirmap.py", line 132, in <module> 
    main() 
    File "dirmap.py", line 124, in main 
    dir_list = map.list_of_files(dir_path) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 86, in list_of_files 
    self.list_of_files(new_dir, traversed, results) 
    File "dirmap.py", line 88, in list_of_files 
    act = self.what_to_do(new_dir) 
    File "dirmap.py", line 60, in what_to_do 
    x = self.calc_md5(new_dir[:-1]) 
    File "dirmap.py", line 25, in calc_md5 
    data = file_to_check.read() 
MemoryError 

什麼想法?

+0

你可以粘貼回溯?我也建議不要[使用列表作爲關鍵字參數的默認值](http://pythonconquerstheuniverse.wordpress.com/category/python-gotchas/)(正如你在'list_of_files'中所做的那樣)。 – beetea

+0

我真的不知道什麼是回溯..但是如果沒有列出我應該使用什麼? –

+0

@beetea:回溯在那裏,只是沒有格式化得很好。我修復了它。 –

回答

4

您正在一次性將大文件讀入內存。不這樣做,在成批讀它,當您去更新哈希:

def calc_md5(self,file_path): 
    hash = hashlib.md5() 
    with open(file_path, 'rb') as file_to_check: 
     for chunk in iter(lambda: file_to_check.read(4096), ''):  
      hash.update(chunk) 

    return hash.hexdigest() 

這將打開二進制模式下的文件,避免瞭解釋不同的行結束約定(這將改變哈希) 。

上述代碼使用iter() function的雙參數形式,其中第二個參數是sentinel值;當可調用的第一個參數返回第二個參數時,迭代停止。到達EOF時,Python文件對象返回一個空字符串。

+0

那麼我應該使用哪種方法?塊或線? –

+0

@FernandoRetimo:在仔細考慮了一下之後,再加上大塊和二進制閱讀。例如,在文本模式下打開文件可以改變行結束的解釋方式。 –

+0

我會研究它嘗試它,並告訴它的結果。 –

1

您可能正在運行一個大文件,無法將其全部讀入到calc_md5()的內存中。使用緩衝方法