2016-12-14 76 views
1

如何使用 pyparsing 模塊解析包含多種日誌格式的日誌文件?以下是我正在使用的代碼。(標題:使用 PyParsing 進行系統日誌解析)

# -*- coding: utf-8 -*- 
""" 

""" 

import pandas as pd 

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex 

from time import strftime 

class Parser(object):
    """Parse syslog-style lines into a flat dictionary of fields.

    Three header shapes are accepted, all of which occur in the sample log:
    ``host app[pid]: message``, ``host app: message`` and ``host: message``.
    """

    def __init__(self):
        """Build the pyparsing grammar once so it can be reused for every line."""
        ints = Word(nums)

        # The "<NN>" priority prefix is not present in this log format, so it
        # is not part of the grammar.

        # timestamp, e.g. "Mar 7 04:02:16" (the log line carries no year)
        month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        day = ints
        hour = Combine(ints + ":" + ints + ":" + ints)
        timestamp = month("month") + day("day") + hour("time")

        # hostname
        hostname = Word(alphas + nums + "_-.")("hostname")

        # appname, optionally followed by "[pid]".  The pid form is tried
        # first; the bare name is the fallback.
        name = Word(alphas + "/-_.()")("appname")
        pid = Suppress("[") + ints("pid") + Suppress("]")
        appname = (name + pid) | name

        # Lines such as "avas: last message repeated 11 times" have no
        # appname at all, so fall back to the hostname alone.
        header = (hostname + appname) | hostname

        # message: everything after the ':' that terminates the header
        message = Regex(".*")("message")

        # Named results let parse() look fields up by name instead of relying
        # on token positions, which shift when pid or appname is missing.
        self.__pattern = timestamp + header + Suppress(":") + message

    def parse(self, line):
        """Parse one log line.

        Args:
            line: a single syslog line, with or without a trailing newline.

        Returns:
            dict with the keys ``timestamp`` (string formatted as
            ``%Y-%m-%d %H:%M:%S``), ``hostname``, ``appname``, ``pid`` and
            ``message``.  ``appname`` and ``pid`` are ``''`` when the line
            does not contain them.

        Raises:
            pyparsing.ParseException: if the line does not match the grammar.
            ValueError: if the month/day/time fields do not form a valid date
                in the current year.
        """
        # Local import: the module itself only imports strftime from time.
        from time import strptime

        parsed = self.__pattern.parseString(line)

        # The log line has no year, so the current year is assumed.
        # NOTE(review): entries written in a previous year (e.g. December
        # lines read in January) will be dated one year too late.
        # NOTE(review): '%b' follows the active locale's month abbreviations.
        stamp_text = " ".join(
            (strftime("%Y"), parsed["month"], parsed["day"], parsed["time"])
        )
        stamp = strptime(stamp_text, "%Y %b %d %H:%M:%S")

        return {
            "timestamp": strftime("%Y-%m-%d %H:%M:%S", stamp),
            "hostname": parsed["hostname"],
            "appname": parsed.get("appname", ""),
            "pid": parsed.get("pid", ""),
            "message": parsed["message"],
        }


def main():
    """Parse every line of ``./messages.log``.

    Returns:
        list of the values returned by ``Parser.parse``, one per line, in
        file order.
    """
    syslog_parser = Parser()

    with open('./messages.log') as log_file:
        return [syslog_parser.parse(raw_line) for raw_line in log_file]


# Run the parser only when this file is executed as a script, not when imported.
# NOTE(review): the list returned by main() is discarded here - confirm intent.
if __name__ == "__main__": 

    main() 

以下是需要解析的幾種不同格式的日誌樣本:

Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
Mar 7 21:23:22 avas dccifd[6191]: missing message body 
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
Mar 8 16:05:26 avas arpwatch: listening on eth0 
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
Mar 8 15:18:40 avas: last message repeated 11 times 

請建議我該怎麼辦?

+0

你的意思是這樣的嗎? https://gist.github.com/leandrosilva/3651640 – nir0s

+0

是的,就是這樣,但我的日誌文件中各行的格式並不總是相同。在解析下面這一行時,我收到了錯誤「list index out of range」(列表索引超出範圍):「Mar 8 15:18:40 avas: last message repeated 11 times」 – RRK

+0

你隨時可以使用 try/except 來捕獲 IndexError – nir0s

回答

0

爲了處理這種新格式的行,我使用 pyparsing 的 Optional 類將 appname 部分標記爲可選,並把結尾的 ':' 單獨拆分出來。在下面的代碼中,我還做了一些其他調整:加入解析動作(parse action)以便在解析時完成數據轉換,並添加結果名稱(results name),從而簡化 parse() 方法中結果字典的創建。

from pyparsing import Word, alphas, Suppress, Combine, nums, string, Regex, Optional 

from datetime import datetime 

class Parser(object):
    """Syslog line parser built on pyparsing.

    The grammar accepts ``<timestamp> <hostname> [<appname>[ '[' <pid> ']' ]]
    ':' <message>`` and exposes each field under a results name.
    """

    # Log lines carry no year; without one datetime.strptime falls back to 1900.
    ASSUMED_YEAR = '2016'

    def __init__(self):
        """Assemble the grammar and keep it on ``self._pattern``."""
        number = Word(nums)

        # The "<NN>" priority prefix is not parsed by this grammar.

        def to_datetime(tokens):
            # Join month, day and time, prefix the assumed year, and convert
            # the result into a datetime object.
            stamp_text = Parser.ASSUMED_YEAR + ' ' + ' '.join(tokens)
            return datetime.strptime(stamp_text, '%Y %b %d %H:%M:%S')

        # timestamp: three-letter month, day number, HH:MM:SS
        month_name = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3)
        clock = Combine(number + ":" + number + ":" + number)
        timestamp = month_name + number + clock
        timestamp.setParseAction(to_datetime)

        # hostname
        host = Word(alphas + nums + "_-.")

        # appname: either "name[pid]" or a bare name; the pid form is tried first
        bracketed_pid = Suppress("[") + number("pid") + Suppress("]")
        app_with_pid = Word(alphas + "/-_.()")("appname") + bracketed_pid
        app_only = Word(alphas + "/-_.")("appname")
        app = app_with_pid | app_only
        app.setName("appname")

        # message: the rest of the line
        body = Regex(".*")

        # Results names make the parsed fields accessible by key.
        self._pattern = (
            timestamp("timestamp")
            + host("hostname")
            + Optional(app)
            + Suppress(':')
            + body("message")
        )

    def parse(self, line):
        """Parse one log line and return its fields as a plain dict.

        The keys ``appname`` and ``pid`` are always present; they are set to
        ``''`` when the line does not contain them.
        """
        fields = self._pattern.parseString(line).asDict()
        for optional_key in ('appname', 'pid'):
            fields.setdefault(optional_key, '')
        return fields

使用 runTests() 可以針對特定的測試輸入來測試你的解析器:

# Take the grammar object that Parser.__init__ stored on the instance.
pattern = Parser()._pattern 

# Sample log lines, one test case per line, covering lines with a pid,
# without a pid, and without an appname.
tests = """\ 
Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
Mar 7 21:23:22 avas dccifd[6191]: missing message body 
Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
Mar 8 16:05:26 avas arpwatch: listening on eth0 
Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
Mar 8 15:18:40 avas: last message repeated 11 times""" 

# Hand the sample text to the grammar's runTests() method.
pattern.runTests(tests) 

給出:

Mar 7 04:02:16 avas clamd[11165]: /var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND 
[datetime.datetime(2016, 3, 7, 4, 2, 16), 'avas', 'clamd', '11165', '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: Worm.Mydoom.F FOUND' 
- pid: '11165' 
- timestamp: datetime.datetime(2016, 3, 7, 4, 2, 16) 


Mar 7 04:05:55 avas clamd[11240]: /var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND 
[datetime.datetime(2016, 3, 7, 4, 5, 55), 'avas', 'clamd', '11240', '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: Worm.SomeFool.Gen-1 FOUND' 
- pid: '11240' 
- timestamp: datetime.datetime(2016, 3, 7, 4, 5, 55) 


Mar 7 09:00:51 avas clamd[27173]: SelfCheck: Database status OK. 
[datetime.datetime(2016, 3, 7, 9, 0, 51), 'avas', 'clamd', '27173', 'SelfCheck: Database status OK.'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: 'SelfCheck: Database status OK.' 
- pid: '27173' 
- timestamp: datetime.datetime(2016, 3, 7, 9, 0, 51) 


Mar 7 05:59:02 avas clamd[27173]: Database correctly reloaded (20400 viruses) 
[datetime.datetime(2016, 3, 7, 5, 59, 2), 'avas', 'clamd', '27173', 'Database correctly reloaded (20400 viruses)'] 
- appname: 'clamd' 
- hostname: 'avas' 
- message: 'Database correctly reloaded (20400 viruses)' 
- pid: '27173' 
- timestamp: datetime.datetime(2016, 3, 7, 5, 59, 2) 


Mar 7 11:14:35 avas dccd[13284]: 21 requests/sec are too many from anonymous 205.201.1.56,2246 
[datetime.datetime(2016, 3, 7, 11, 14, 35), 'avas', 'dccd', '13284', '21 requests/sec are too many from anonymous 205.201.1.56,2246'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '21 requests/sec are too many from anonymous 205.201.1.56,2246' 
- pid: '13284' 
- timestamp: datetime.datetime(2016, 3, 7, 11, 14, 35) 


Mar 8 00:22:57 avas dccifd[9933]: write(MTA socket,4): Broken pipe 
[datetime.datetime(2016, 3, 8, 0, 22, 57), 'avas', 'dccifd', '9933', 'write(MTA socket,4): Broken pipe'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'write(MTA socket,4): Broken pipe' 
- pid: '9933' 
- timestamp: datetime.datetime(2016, 3, 8, 0, 22, 57) 


Mar 7 21:23:22 avas dccifd[6191]: missing message body 
[datetime.datetime(2016, 3, 7, 21, 23, 22), 'avas', 'dccifd', '6191', 'missing message body'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'missing message body' 
- pid: '6191' 
- timestamp: datetime.datetime(2016, 3, 7, 21, 23, 22) 


Mar 9 16:05:17 avas named[12045]: zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53 
[datetime.datetime(2016, 3, 9, 16, 5, 17), 'avas', 'named', '12045', 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 10.0.0.253#53' 
- pid: '12045' 
- timestamp: datetime.datetime(2016, 3, 9, 16, 5, 17) 


Mar 10 00:38:16 avas dccifd[23069]: continue not asking DCC 17 seconds after failure 
[datetime.datetime(2016, 3, 10, 0, 38, 16), 'avas', 'dccifd', '23069', 'continue not asking DCC 17 seconds after failure'] 
- appname: 'dccifd' 
- hostname: 'avas' 
- message: 'continue not asking DCC 17 seconds after failure' 
- pid: '23069' 
- timestamp: datetime.datetime(2016, 3, 10, 0, 38, 16) 


Mar 10 09:42:11 avas named: client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT 
[datetime.datetime(2016, 3, 10, 9, 42, 11), 'avas', 'named', 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'client 127.0.0.1#55524: query: 23.68.27.142.sa-trusted.bondedsender.org IN TXT' 
- timestamp: datetime.datetime(2016, 3, 10, 9, 42, 11) 


Mar 9 03:48:07 avas dccd[145]: automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err` 
[datetime.datetime(2016, 3, 9, 3, 48, 7), 'avas', 'dccd', '145', 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: 'automatic dbclean; starting `dbclean -DPq -i 1189 -L info,local5.notice -L error,local5.err`' 
- pid: '145' 
- timestamp: datetime.datetime(2016, 3, 9, 3, 48, 7) 


Mar 9 11:58:18 avas kernel: i810_audio: Connection 0 with codec id 2 
[datetime.datetime(2016, 3, 9, 11, 58, 18), 'avas', 'kernel', 'i810_audio: Connection 0 with codec id 2'] 
- appname: 'kernel' 
- hostname: 'avas' 
- message: 'i810_audio: Connection 0 with codec id 2' 
- timestamp: datetime.datetime(2016, 3, 9, 11, 58, 18) 


Mar 9 19:41:13 avas dccd[3004]: "packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577 
[datetime.datetime(2016, 3, 9, 19, 41, 13), 'avas', 'dccd', '3004', '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '"packet length 44 too small for REPORT" sent to client 1 at 194.63.250.215,47577' 
- pid: '3004' 
- timestamp: datetime.datetime(2016, 3, 9, 19, 41, 13) 


Mar 8 09:01:07 avas sshd(pam_unix)[21839]: session opened for user tom by (uid=35567) 
[datetime.datetime(2016, 3, 8, 9, 1, 7), 'avas', 'sshd(pam_unix)', '21839', 'session opened for user tom by (uid=35567)'] 
- appname: 'sshd(pam_unix)' 
- hostname: 'avas' 
- message: 'session opened for user tom by (uid=35567)' 
- pid: '21839' 
- timestamp: datetime.datetime(2016, 3, 8, 9, 1, 7) 


Mar 8 03:52:04 avas dccd[13284]: 1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window 
[datetime.datetime(2016, 3, 8, 3, 52, 4), 'avas', 'dccd', '13284', '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window'] 
- appname: 'dccd' 
- hostname: 'avas' 
- message: '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window' 
- pid: '13284' 
- timestamp: datetime.datetime(2016, 3, 8, 3, 52, 4) 


Mar 8 16:05:26 avas arpwatch: listening on eth0 
[datetime.datetime(2016, 3, 8, 16, 5, 26), 'avas', 'arpwatch', 'listening on eth0'] 
- appname: 'arpwatch' 
- hostname: 'avas' 
- message: 'listening on eth0' 
- timestamp: datetime.datetime(2016, 3, 8, 16, 5, 26) 


Mar 10 10:00:06 avas named[6986]: zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53 
[datetime.datetime(2016, 3, 10, 10, 0, 6), 'avas', 'named', '6986', 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'zone PLNet/IN: refresh: non-authoritative answer from master 192.75.26.21#53' 
- pid: '6986' 
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 6) 


Mar 10 10:00:10 avas named[6986]: client 127.0.0.1#55867: query: mail.canfor.ca IN MX 
[datetime.datetime(2016, 3, 10, 10, 0, 10), 'avas', 'named', '6986', 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX'] 
- appname: 'named' 
- hostname: 'avas' 
- message: 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX' 
- pid: '6986' 
- timestamp: datetime.datetime(2016, 3, 10, 10, 0, 10) 

Mar 8 15:18:40 avas: last message repeated 11 times 
[datetime.datetime(2016, 3, 8, 15, 18, 40), 'avas', 'last message repeated 11 times'] 
- hostname: 'avas' 
- message: 'last message repeated 11 times' 
- timestamp: datetime.datetime(2016, 3, 8, 15, 18, 40) 

或者使用Parser類的parse()方法:

from pprint import pprint 
# Build the parser once: constructing Parser() inside the loop would rebuild
# the whole pyparsing grammar for every line.
line_parser = Parser()
for t in tests.splitlines():
    # Print each parsed record as a dict, followed by a blank separator line.
    pprint(line_parser.parse(t))
    print()

給出:

{'appname': 'clamd', 
'hostname': 'avas', 
'message': '/var/amavis/amavis-20040307T033734-10329/parts/part-00003: ' 
     'Worm.Mydoom.F FOUND ', 
'pid': '11165', 
'timestamp': datetime.datetime(2016, 3, 7, 4, 2, 16)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': '/var/amavis/amavis-20040307T035901-10615/parts/part-00002: ' 
     'Worm.SomeFool.Gen-1 FOUND ', 
'pid': '11240', 
'timestamp': datetime.datetime(2016, 3, 7, 4, 5, 55)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': 'SelfCheck: Database status OK.', 
'pid': '27173', 
'timestamp': datetime.datetime(2016, 3, 7, 9, 0, 51)} 

{'appname': 'clamd', 
'hostname': 'avas', 
'message': 'Database correctly reloaded (20400 viruses) ', 
'pid': '27173', 
'timestamp': datetime.datetime(2016, 3, 7, 5, 59, 2)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '21 requests/sec are too many from anonymous 205.201.1.56,2246', 
'pid': '13284', 
'timestamp': datetime.datetime(2016, 3, 7, 11, 14, 35)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'write(MTA socket,4): Broken pipe', 
'pid': '9933', 
'timestamp': datetime.datetime(2016, 3, 8, 0, 22, 57)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'missing message body', 
'pid': '6191', 
'timestamp': datetime.datetime(2016, 3, 7, 21, 23, 22)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master ' 
     '10.0.0.253#53', 
'pid': '12045', 
'timestamp': datetime.datetime(2016, 3, 9, 16, 5, 17)} 

{'appname': 'dccifd', 
'hostname': 'avas', 
'message': 'continue not asking DCC 17 seconds after failure', 
'pid': '23069', 
'timestamp': datetime.datetime(2016, 3, 10, 0, 38, 16)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'client 127.0.0.1#55524: query: ' 
     '23.68.27.142.sa-trusted.bondedsender.org IN TXT', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 10, 9, 42, 11)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': 'automatic dbclean; starting `dbclean -DPq -i 1189 -L ' 
     'info,local5.notice -L error,local5.err`', 
'pid': '145', 
'timestamp': datetime.datetime(2016, 3, 9, 3, 48, 7)} 

{'appname': 'kernel', 
'hostname': 'avas', 
'message': 'i810_audio: Connection 0 with codec id 2', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 9, 11, 58, 18)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '"packet length 44 too small for REPORT" sent to client 1 at ' 
     '194.63.250.215,47577', 
'pid': '3004', 
'timestamp': datetime.datetime(2016, 3, 9, 19, 41, 13)} 

{'appname': 'sshd(pam_unix)', 
'hostname': 'avas', 
'message': 'session opened for user tom by (uid=35567)', 
'pid': '21839', 
'timestamp': datetime.datetime(2016, 3, 8, 9, 1, 7)} 

{'appname': 'dccd', 
'hostname': 'avas', 
'message': '1.2.32 database /home/dcc/dcc_db reopened with 997 MByte window', 
'pid': '13284', 
'timestamp': datetime.datetime(2016, 3, 8, 3, 52, 4)} 

{'appname': 'arpwatch', 
'hostname': 'avas', 
'message': 'listening on eth0', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 8, 16, 5, 26)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'zone PLNet/IN: refresh: non-authoritative answer from master ' 
     '192.75.26.21#53', 
'pid': '6986', 
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 6)} 

{'appname': 'named', 
'hostname': 'avas', 
'message': 'client 127.0.0.1#55867: query: mail.canfor.ca IN MX', 
'pid': '6986', 
'timestamp': datetime.datetime(2016, 3, 10, 10, 0, 10)} 

{'appname': '', 
'hostname': 'avas', 
'message': 'last message repeated 11 times', 
'pid': '', 
'timestamp': datetime.datetime(2016, 3, 8, 15, 18, 40)} 
+0

它工作得很好。謝謝@Paul McGuire :) – RRK