爲了處理這種新的日誌行,我使用 pyparsing 的 Optional 類將 appname 部分標記爲可選,並把結尾的 ':' 從 appname 中分離出來單獨匹配。在下面的代碼中,我還做了一些其他調整:加入了在解析時進行數據轉換的解析動作(parse action),並添加了一些結果名稱(results name),以簡化在 parse() 方法中創建結果字典的過程。
from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional
from datetime import datetime
class Parser(object):
    """Parse syslog-style log lines into a dict of named fields.

    The grammar built in ``__init__`` expects, in order: a timestamp
    (three-letter month, day, ``HH:MM:SS``), a hostname, an optional
    application name with an optional ``[pid]`` suffix, a ``:`` separator,
    and the free-form message that fills the rest of the line.
    """

    # log lines don't include the year, but if we don't provide one,
    # datetime.strptime will assume 1900
    ASSUMED_YEAR = '2016'

    def __init__(self):
        """Build the pyparsing grammar and store it on ``self._pattern``."""
        ints = Word(nums)

        # priority (disabled)
        # priority = Suppress("<") + ints + Suppress(">")

        # timestamp: one uppercase letter followed by lowercase letters,
        # exactly three characters in total (e.g. "Mar")
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month + day + hour
        # a parse action will convert this timestamp to a datetime
        timestamp.setParseAction(
            lambda t: datetime.strptime(
                Parser.ASSUMED_YEAR + ' ' + ' '.join(t),
                '%Y %b %d %H:%M:%S',
            )
        )

        # hostname
        hostname = Word(alphas + nums + "_-.")

        # appname: digits are accepted alongside letters, because a name
        # such as "apache2" would otherwise be cut at the digit and the
        # mandatory ':' that follows would then fail to match, rejecting
        # the whole line.
        appname_chars = alphas + nums + "/-_."
        appname = (
            # form with a pid: name (may contain parentheses) + "[pid]"
            (
                Word(appname_chars + "()")("appname")
                + (Suppress("[") + ints("pid") + Suppress("]"))
            )
            # form without a pid: bare name only
            | Word(appname_chars)("appname")
        )
        appname.setName("appname")

        # message: everything remaining on the line
        message = Regex(".*")

        # pattern build
        # (add results names to make it easier to access parsed fields)
        # appname is optional so lines like "host: message" still match.
        self._pattern = (
            timestamp("timestamp")
            + hostname("hostname")
            + Optional(appname)
            + Suppress(':')
            + message("message")
        )

    def parse(self, line):
        """Parse one log line and return its named fields as a dict.

        The result always contains ``appname`` and ``pid`` keys; each is an
        empty string when the line did not supply it. Any exception raised
        by ``parseString`` for a non-matching line propagates to the caller.
        """
        parsed = self._pattern.parseString(line)

        # fill in keys that might not have been found in the input string
        # (this could have been done in a parse action too, then this method
        # would have just been a two-liner)
        for key in ('appname', 'pid'):
            if key not in parsed:
                parsed[key] = ''

        return parsed.asDict()
使用 runTests() 針對一組特定的測試輸入來測試你的解析器:
# Take the grammar straight from a Parser instance so it can be exercised
# directly, without going through Parser.parse().
pattern = Parser()._pattern
# Sample log lines, one per line. The backslash after the opening quotes
# keeps the string from starting with an empty line.
tests = """\
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
Mar 7 21:23:22 avas dccifd[6191]: missing message body
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
Mar 8 16:05:26 avas arpwatch: listening on eth0
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
Mar 8 15:18:40 avas: last message repeated 11 times"""
# Run the grammar against every sample line and print each line's results.
pattern.runTests(tests)
給出:
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'
- pid: '11165'
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16)
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND']
- appname: 'clamd'
- hostname: 'avas'
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'
- pid: '11240'
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55)
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK.
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.']
- appname: 'clamd'
- hostname: 'avas'
- message: 'SelfCheck: Database status OK.'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51)
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses)
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)']
- appname: 'clamd'
- hostname: 'avas'
- message: 'Database correctly reloaded (20400 viruses)'
- pid: '27173'
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2)
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246']
- appname: 'dccd'
- hostname: 'avas'
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35)
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'write(MTA socket,4): Broken pipe'
- pid: '9933'
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57)
Mar 7 21:23:22 avas dccifd[6191]: missing message body
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'missing message body'
- pid: '6191'
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22)
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'
- pid: '12045'
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17)
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure']
- appname: 'dccifd'
- hostname: 'avas'
- message: 'continue not asking DCC 17 seconds after failure'
- pid: '23069'
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16)
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11)
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`']
- appname: 'dccd'
- hostname: 'avas'
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'
- pid: '145'
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7)
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2']
- appname: 'kernel'
- hostname: 'avas'
- message: 'i810_audio: Connection 0 with codec id 2'
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18)
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577']
- appname: 'dccd'
- hostname: 'avas'
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'
- pid: '3004'
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13)
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567)
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)']
- appname: 'sshd(pam_unix)'
- hostname: 'avas'
- message: 'session opened for user tom by (uid=35567)'
- pid: '21839'
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7)
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window']
- appname: 'dccd'
- hostname: 'avas'
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'
- pid: '13284'
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4)
Mar 8 16:05:26 avas arpwatch: listening on eth0
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0']
- appname: 'arpwatch'
- hostname: 'avas'
- message: 'listening on eth0'
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26)
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53']
- appname: 'named'
- hostname: 'avas'
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6)
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX']
- appname: 'named'
- hostname: 'avas'
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'
- pid: '6986'
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10)
Mar 8 15:18:40 avas: last message repeated 11 times
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times']
- hostname: 'avas'
- message: 'last message repeated 11 times'
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40)
或者使用Parser類的parse()方法:
from pprint import pprint

# Build the parser once: constructing a Parser rebuilds the whole grammar,
# which is the same for every line, so it does not belong inside the loop.
parser = Parser()
for t in tests.splitlines():
    # Pretty-print the dict of fields parsed from this line.
    pprint(parser.parse(t))
    # Blank line between entries.
    print()
給出:
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: '
'Worm.Mydoom.F FOUND ',
'pid': '11165',
'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)}
{'appname': 'clamd',
'hostname': 'avas',
'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: '
'Worm.SomeFool.Gen-1 FOUND ',
'pid': '11240',
'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'SelfCheck: Database status OK.',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)}
{'appname': 'clamd',
'hostname': 'avas',
'message': 'Database correctly reloaded (20400 viruses) ',
'pid': '27173',
'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'write(MTA socket,4): Broken pipe',
'pid': '9933',
'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'missing message body',
'pid': '6191',
'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'10.0.0.253#53',
'pid': '12045',
'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)}
{'appname': 'dccifd',
'hostname': 'avas',
'message': 'continue not asking DCC 17 seconds after failure',
'pid': '23069',
'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55524: query: '
'23.68.27.142.sa-trusted.bondedsender.org IN TXT',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)}
{'appname': 'dccd',
'hostname': 'avas',
'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L '
'info,local5.notice -L error,local5.err`',
'pid': '145',
'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)}
{'appname': 'kernel',
'hostname': 'avas',
'message': 'i810_audio: Connection 0 with codec id 2',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '"packet length 44 too small for REPORT" sent to client 1 at '
'194.63.250.215,47577',
'pid': '3004',
'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)}
{'appname': 'sshd(pam_unix)',
'hostname': 'avas',
'message': 'session opened for user tom by (uid=35567)',
'pid': '21839',
'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)}
{'appname': 'dccd',
'hostname': 'avas',
'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window',
'pid': '13284',
'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)}
{'appname': 'arpwatch',
'hostname': 'avas',
'message': 'listening on eth0',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)}
{'appname': 'named',
'hostname': 'avas',
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master '
'192.75.26.21#53',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)}
{'appname': 'named',
'hostname': 'avas',
'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX',
'pid': '6986',
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)}
{'appname': '',
'hostname': 'avas',
'message': 'last message repeated 11 times',
'pid': '',
'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)}
你的意思是這樣的嗎? https://gist.github.com/leandrosilva/3651640 – nir0s
是這樣的,但我的日誌文件中的日誌格式並不總是一致。在解析下面這一行時,我收到一個錯誤:list index out of range(列表索引超出範圍)。 「Mar 8 15:18:40 avas: last message repeated 11 times」 – RRK
你總是可以用 try: ... except IndexError: 來捕獲這個錯誤 – nir0s