顯然,單靠正則表達式(regex)是做不到的。你看過 pyparsing 嗎?
[編輯]
另一方面,下面這種做法也許可行:
from functools import wraps
def transition(method):
    """Decorate a state method so that calling it performs a state transition.

    The decorated ``method`` must return a *command*: a callable that takes
    the state object and returns the class the state should have afterwards.
    The wrapper calls the method, applies the returned command to the state
    and assigns the result to ``state.__class__``.

    The wrapper itself returns ``None``.
    """
    @wraps(method)
    def trans(state, *args, **kwargs):
        command = method(state, *args, **kwargs)
        # Re-assigning __class__ in place changes which methods will handle
        # the next call on this very same object.
        state.__class__ = command(state)
    return trans
class State(object):
    """Base class for objects that change state by swapping their class.

    Every instance carries ``_identities``, a list used as a stack of state
    classes (see the ``pushing`` and ``popped`` commands).
    """

    def __new__(cls, *args, **kwargs):
        # The stack is created in __new__ so that it exists even when a
        # subclass defines __init__ without calling a base initialiser.
        # Constructor arguments are accepted and ignored here so subclasses
        # remain free to define an __init__ that takes parameters.
        state = object.__new__(cls)
        state._identities = []
        return state
def unchanged(state):
    """Command that leaves the state as it is: return its current class."""
    return state.__class__
def shifting(identity):
    """Build a command that moves the state to ``identity``.

    The returned command ignores the state it is given and always returns
    ``identity``.
    """
    def command(state):
        return identity
    return command
def pushing(identity, afterwards=None):
    """Build a command that enters ``identity`` and remembers where to return.

    When applied, the command appends a class to ``state._identities`` and
    returns ``identity``. The class appended is ``afterwards`` if given,
    otherwise the state's current class.
    """
    def command(state):
        # Compare against None explicitly: a truthiness test would silently
        # discard an ``afterwards`` value that happens to evaluate as false.
        resume = state.__class__ if afterwards is None else afterwards
        state._identities.append(resume)
        return identity
    return command
def popped(state):
    """Command that returns to the most recently pushed class.

    Removes and returns the last entry of ``state._identities``; raises
    ``IndexError`` if nothing has been pushed.
    """
    return state._identities.pop()
##############################################################################
import re
# Token scanner: ``tokenize(source)`` iterates over match objects, one per
# token; ``match.lastgroup`` is the token type and ``match.group()`` its text.
# White space between tokens is skipped because no alternative matches it.
#
# ``eoi`` is anchored with \Z, which matches only at the very end of the text.
# ``$`` is unsuitable here: it also matches just before a trailing newline
# (and, with re.MULTILINE, before every newline), which would report the end
# of input more than once or in the middle of the text.
tokenize = re.compile(r"""
    (?P<word>       \w+ ) |
    (?P<braceleft>  \{  ) |
    (?P<braceright> \}  ) |
    (?P<eoi>        \Z  ) |
    (?P<error>      \S  )     # catch all (except white space)
""", flags=re.VERBOSE).finditer
def parse(parser, source, builder):
    """Feed every token of ``source`` to ``parser``.

    For each match produced by ``tokenize``, the attribute of ``parser`` named
    after the token type is looked up and called with the token text and
    ``builder``. Nothing is returned; results accumulate in ``builder``.
    Exceptions raised by the handlers propagate to the caller.
    """
    for token in tokenize(source):
        # lastgroup is the name of the regex alternative that matched.
        handler = getattr(parser, token.lastgroup)
        handler(token.group(), builder)
class ParsingState(State):
    """Base class of the parser states: rejects every token by default.

    Subclasses override the method named after each token type they accept.
    Any token type without a handler is rejected with ``ValueError``.
    """

    def eoi(self, token, *args):
        """Reject end of input; raises ``ValueError``."""
        raise ValueError('premature end of input in parsing state %s' %
            self.__class__.__name__
        )

    def error(self, token, *args):
        """Reject a token the scanner could not classify; raises ``ValueError``."""
        raise ValueError('parsing state %s does not understand token %s' % (
            self.__class__.__name__, token
        ))

    def __getattr__(self, name):
        """Return a handler that rejects tokens of the unhandled type ``name``.

        Only called when normal attribute lookup fails. Names starting with
        an underscore are not token types, so they raise ``AttributeError``.
        """
        # Without this guard, attribute probes for private or special names
        # (for example hasattr() checks) would receive a token handler
        # instead of learning that the attribute does not exist.
        if name.startswith('_'):
            raise AttributeError(name)

        def raiser(token, *args):
            raise ValueError(
                'parsing state %s does not understand token "%s" of type %s' %
                (self.__class__.__name__, token, name)
            )
        return raiser
class Id(ParsingState):
    """State expecting an entry identifier, or the end of the input."""

    @transition
    def word(self, token, builder):
        """Start a new entry named ``token``; continue in ``BeginContent``."""
        builder.add_id(token)
        return shifting(BeginContent)

    @transition
    def eoi(self, token, builder):
        """Accept the end of input here; continue in ``DoneParsing``."""
        return shifting(DoneParsing)
class BeginContent(ParsingState):
    """State after an identifier: only an opening brace is accepted."""

    @transition
    def braceleft(self, token, builder):
        """Consume the opening brace; continue in ``Content``."""
        return shifting(Content)
class Content(ParsingState):
    """State inside the outermost braces of an entry."""

    @transition
    def word(self, token, builder):
        """Add ``token`` to the current entry's text; stay in this state."""
        builder.add_text(token)
        return unchanged

    @transition
    def braceleft(self, token, builder):
        """Keep a nested opening brace as text and enter ``PushedContent``.

        The current class is pushed so the matching closing brace can return
        to it.
        """
        builder.add_text(token)
        return pushing(PushedContent)

    @transition
    def braceright(self, token, builder):
        """Close the entry (the brace is not kept as text); go back to ``Id``."""
        return shifting(Id)
class PushedContent(Content):
    """State inside nested braces; behaves like ``Content`` except on ``}``."""

    @transition
    def braceright(self, token, builder):
        """Keep the closing brace as text and return to the pushed class."""
        builder.add_text(token)
        return popped
class DoneParsing(ParsingState):
    """State entered from ``Id`` at end of input; defines no handlers of its own."""
##############################################################################
class Entry(object):
    """One parsed entry: an identifier plus the list of its text tokens."""

    def __init__(self, idname):
        self.idname = idname
        # Text tokens in the order they were added.
        self.text = []

    def __repr__(self):
        return '%s(%r, text=%r)' % (
            type(self).__name__, self.idname, self.text
        )

    def __str__(self):
        """Render as ``idname { tok tok ... }`` with tokens joined by spaces."""
        return '%s { %s }' % (self.idname, ' '.join(self.text))
class Builder(object):
    """Collects the entries produced while parsing."""

    def __init__(self):
        # Entry objects in the order their identifiers were seen.
        self.entries = []

    def add_id(self, id_token):
        """Append a new, empty ``Entry`` named ``id_token``."""
        self.entries.append(Entry(id_token))

    def add_text(self, text_token):
        """Append ``text_token`` to the text of the most recent entry.

        Raises ``IndexError`` if no entry has been added yet.
        """
        self.entries[-1].text.append(text_token)
##############################################################################
# Demo: parse a small sample and print every entry that was collected.
if __name__ == '__main__':
    file_content = """
id1 { some text } id2 {
some { text }
}
"""
    builder = Builder()
    # Parsing starts in the Id state: an identifier is expected first.
    parse(Id(), file_content, builder)
    for entry in builder.entries:
        # Call form of print works on both Python 2 and Python 3.
        print(entry)
您有沒有辦法調整這個文件的格式?我的意思是改變數據寫入這個文件的方式,因爲這個文件看起來一團混亂!我認爲在考慮如何檢索數據之前,應該先考慮如何編寫數據,以使檢索更容易。 – mouad 2010-10-27 01:21:07
「某些文字」是否包含換行符(CRLF)?如果不包含,就先把換行符去掉,事情會變得容易很多…… – 2010-10-27 01:27:21