2013-05-15 50 views
1

如何從 SGE 獲取(最近)失敗作業的列表(failed = 100 或 exit_status = 137)?以下摘自 qacct 的說明(原題:如何從 SGE 獲取失敗作業的列表):

[-j [job_id|job_name|pattern]] list all [matching] jobs 

如何使用模式?我嘗試了以下,不起作用。

qacct -j failed=100 

回答

4

在這種情況下的「模式」是指與作業名稱匹配的簡單匹配表達式,例如, qacct -j 'myjob*'

遺憾的是,qacct 沒有你要找的過濾能力——它能按複雜(complex)作業屬性進行過濾,但完全無法按 exit_status 或 failed 這類字段過濾。

您可以從 SGE 會計文件(假設您有權訪問它)中檢索該信息,只需多做一點工作即可。每當 SGE 完成一個作業時,它會向 $SGE_ROOT/$SGE_CELL/common/accounting 寫入一條簡單記錄——這就是 qacct 讀取的文件。您應查閱 qmaster 上的 accounting(5) 手冊頁,以了解您所用 GridEngine 版本的具體細節,但會計文件中的作業記錄應該大致如下:

all.q:myexechost:group:user:myjobstep16:1126971:sge:0:1369755166:1369768897:1369769771:0:0:874:796.564903:30.676336:15788.000000:0:0:0:0:17009:2:0:47987400.000000:34033048:0:0:0:9468:27604:NONE:defaultdepartment:NONE:1:0:827.241239:96.445328:39.111400:-q all.q:0.000000:NONE:237133824.000000:0:0 

在這條記錄中,failed 和 exit_status 分別是第 12 和第 13 個字段。要快速粗略地得到「最近的失敗」列表,我們可以把這兩個字段與第 6 個字段(作業 ID)和第 11 個字段(作業結束時間)一起使用,像這樣找出最近 100 個作業中的失敗作業:

$ cut -d':' -f6,11,12,13 $SGE_ROOT/$SGE_CELL/common/accounting|sort -t':' -k2|tail -100|grep ':100:137' 
1

我編寫了一個 Python 腳本,用來解析會計文件並找出失敗的作業。您應該根據自己的需要對其進行修改。

#!/usr/local/bin/python2.7 

import os 
from sys import * 
import sys 
import getopt 
import datetime 


#Variables
program = "parse_acct.py"
ifile = "/local/cluster/sge/default/common/accounting"

# Usage text printed for -h and for an invalid command line.  The getopt
# spec below is "f:", i.e. -f requires a value, so the value is spelled out.
USAGE = program + " -i <input> -u <username> -n <node_name> -f <failed_code>"

SEPARATOR = "=============================================================="

# Output labels, listed in the same order as the colon-separated columns of
# an accounting record (label N describes column N, counting from 0).
FIELD_LABELS = (
    "qname  ",
    "hostname  ",
    "group  ",
    "owner  ",
    "job_name  ",
    "job_number  ",
    "account  ",
    "priority  ",
    "submission_time  ",
    "start_time  ",
    "end_time  ",
    "failed  ",
    "exit_status  ",
    "ru_wallclock  ",
    " ru_utime  ",
    " ru_stime  ",
    " ru_maxrss  ",
    " ru_ixrss  ",
    " ru_ismrss  ",
    " ru_idrss  ",
    " ru_isrss  ",
    " ru_minflt  ",
    " ru_majflt  ",
    " ru_nswap  ",
    " ru_inblock  ",
    " ru_oublock  ",
    " ru_msgsnd  ",
    " ru_msgrcv  ",
    " ru_nsignals  ",
    " ru_nvcsw  ",
    " ru_nivcsw  ",
    "project  ",
    "department  ",
    "granted_pe  ",
    "slots  ",
    "task_number  ",
    "cpu  ",
    "mem  ",
    "io  ",
    "category  ",
    "iow  ",
    "pe_taskid  ",
    "maxvmem  ",
    "arid  ",
    "ar_submission_time  ",
)

# Column positions used by the filters.
HOST_FIELD = 1
OWNER_FIELD = 3
FAILED_FIELD = 11

# Columns printed as local date/time instead of the raw integer.
TIMESTAMP_FIELDS = (8, 9, 10)

# A line must reach at least the exit_status column to count as a record;
# this guarantees every column the filters read is present.
MIN_FIELDS = 13

# Command-line flag -> key in the options dictionary.
_OPTION_KEYS = {
    "-i": "ifile",
    "-f": "failed",
    "-u": "user",
    "-n": "node",
    "-t": "subtime",
    "-b": "begtime",
    "-e": "endtime",
}


def _parse_args(argv):
    """Parse command-line arguments into an options dictionary.

    argv is the argument list without the program name.  Prints the usage
    text and exits with status 2 on an invalid command line, or with
    status 0 when -h is given.  Filters that were not requested are None.
    """
    try:
        myopts, _ = getopt.getopt(argv, "i:f:n:t:u:b:e:h")
    except getopt.GetoptError:
        print(USAGE)
        sys.exit(2)

    options = {
        "ifile": ifile,
        "failed": None,
        "user": None,
        "node": None,
        # -t/-b/-e are still accepted so existing command lines keep
        # working, but no filtering is applied for them.
        "subtime": None,
        "begtime": None,
        "endtime": None,
    }
    for o, a in myopts:
        if o == "-h":
            print(USAGE)
            sys.exit(0)
        options[_OPTION_KEYS[o]] = a
    return options


def _read_records(path):
    """Return the records in the file at path as lists of string fields.

    Blank lines, lines starting with '#', and lines with fewer than
    MIN_FIELDS colon-separated fields are skipped.
    """
    records = []
    # "with" closes the file even if reading raises.
    with open(path, "r") as handle:
        for raw in handle:
            text = raw.rstrip()
            if not text or text.startswith("#"):
                continue
            fields = text.split(":")
            if len(fields) >= MIN_FIELDS:
                records.append(fields)
    return records


def _matches(record, failed, user, node):
    """Return True when record passes every filter that is not None."""
    # Fields are strings, so compare against the option text rather than
    # an integer.
    if failed is not None and record[FAILED_FIELD] != failed:
        return False
    if user is not None and record[OWNER_FIELD] != user:
        return False
    if node is not None and record[HOST_FIELD] != node:
        return False
    return True


def _format_value(index, value):
    """Return the printable text for column index of a record."""
    if index in TIMESTAMP_FIELDS:
        try:
            stamp = datetime.datetime.fromtimestamp(int(value))
        except (ValueError, OverflowError, OSError):
            # Not a usable timestamp: show the raw text instead of failing.
            return value
        return stamp.strftime('%Y-%m-%d %H:%M:%S')
    return value


def main(argv=None):
    """Print every record of the accounting file that passes the filters.

    argv is the argument list without the program name; it defaults to
    sys.argv[1:].  Each matching record is printed as a separator line
    followed by one "label  value" line per column present in the record.
    """
    if argv is None:
        argv = sys.argv[1:]
    options = _parse_args(argv)

    for record in _read_records(options["ifile"]):
        if not _matches(record, options["failed"], options["user"],
                        options["node"]):
            continue
        print(SEPARATOR)
        # zip stops at the shorter sequence, so records with fewer columns
        # than labels print only the columns they actually have.
        for index, (label, value) in enumerate(zip(FIELD_LABELS, record)):
            print(label + _format_value(index, value))


if __name__ == "__main__":
    main()