一些正則表達式語法泛化爲泛型序列。另外,爲了能夠指定任何對象,字符串不是表達式本身的最佳媒介。
在蟒 「小」 的例子:
def choice(*items):
return ('choice',[value(item) for item in items])
def seq(*items):
return ('seq',[value(item) for item in items])
def repeat(min,max,lazy,item):
return ('repeat',min,max,lazy,value(item))
def value(item):
if not isinstance(item,tuple):
return ('value',item)
return item
def compile(pattern):
ret = []
key = pattern[0]
if key == 'value':
ret.append(('literal',pattern[1]))
elif key == 'seq':
for subpattern in pattern[1]:
ret.extend(compile(subpattern))
elif key == 'choice':
jumps = []
n = len(pattern[1])
for i,subpattern in enumerate(pattern[1]):
if i < n-1:
pos = len(ret)
ret.append('placeholder for choice')
ret.extend(compile(subpattern))
jumps.append(len(ret))
ret.append('placeholder for jump')
ret[pos] = ('choice',len(ret)-pos)
else:
ret.extend(compile(subpattern))
for pos in jumps:
ret[pos] = ('jump', len(ret)-pos)
elif key == 'repeat':
min,max,lazy,subpattern = pattern[1:]
for _ in xrange(min):
ret.extend(compile(subpattern))
if max == -1:
if lazy:
pos = len(ret)
ret.append('placeholder for jump')
ret.extend(compile(subpattern))
ret[pos] = ('jump',len(ret)-pos)
ret.append(('choice',pos+1-len(ret)))
else:
pos = len(ret)
ret.append('placeholder for choice')
ret.extend(compile(subpattern))
ret.append(('jump',pos-len(ret)))
ret[pos] = ('choice',len(ret)-pos)
elif max > min:
if lazy:
jumps = []
for _ in xrange(min,max):
ret.append(('choice',2))
jumps.append(len(ret))
ret.append('placeholder for jump')
ret.extend(compile(subpattern))
for pos in jumps:
ret[pos] = ('jump', len(ret)-pos)
else:
choices = []
for _ in xrange(min,max):
choices.append(len(ret))
ret.append('placeholder for choice')
ret.extend(compile(subpattern))
ret.append(('drop,'))
for pos in choices:
ret[pos] = ('choice',len(ret)-pos)
return ret
def match(pattern,subject,start=0):
stack = []
pos = start
i = 0
while i < len(pattern):
instruction = pattern[i]
key = instruction[0]
if key == 'literal':
if pos < len(subject) and subject[pos] == instruction[1]:
i += 1
pos += 1
continue
elif key == 'jump':
i += instruction[1]
continue
elif key == 'choice':
stack.append((i+instruction[1],pos))
i += 1
continue
# fail
if not stack:
return None
i,pos = stack.pop()
return pos
def find(pattern,subject,start=0):
for pos1 in xrange(start,len(subject)+1):
pos2 = match(pattern,subject,pos1)
if pos2 is not None: return pos1,pos2
return None,None
def find_all(pattern,subject,start=0):
matches = []
pos1,pos2 = find(pattern,subject,start)
while pos1 is not None:
matches.append((pos1,pos2))
pos1,pos2 = find(pattern,subject,pos2)
return matches
# Timestamps: ([01][0-9]|2[0-3])[0-5][0-9]
pattern = compile(
seq(
choice(
seq(choice(0,1),choice(0,1,2,3,4,5,6,7,8,9)),
seq(2,choice(0,1,2,3)),
),
choice(0,1,2,3,4,5),
choice(0,1,2,3,4,5,6,7,8,9),
)
)
print find_all(pattern,[1,3,2,5,6,3,4,2,4,3,2,2,3,6,6,5,3,5,3,3,2,5,4,5])
# matches: (0,4): [1,3,2,5]; (10,14): [2,2,3,6]
改進的幾點:
你能發表一些例子嗎? – 2010-11-29 11:32:07
對於有趣的問題+1。我看了一下,很失望地看到Scala'a正則表達式只能用於Strings--標準庫通常是相當普遍的。我期待着一些答案 – 2010-11-29 11:37:20
整數有多大?如果是21位或更低,則可以將其轉換爲Unicode代碼點。 – leppie 2010-11-29 12:05:15