忽略字符串中的大小寫、標點符號和空白字符,最有效的方法是什麼?這些字符串應該按單詞而不是按字符來切分,比較時應該忽略上述細節,並且對這些字符串取切片的操作應該在兼顧速度的前提下儘可能高效。忽略字符串中的大小寫,標點符號和空白字符
我原本打算在下面的代碼中使用對大小寫和標點符號不敏感的字符串,但在看到對 class Slice: def __eq__(self, other): return self.root == other.root 求值需要多長時間之後,我決定改用 data = tuple(string.split())。讓字符串對大小寫、標點符號和間距不敏感,並且按單詞而不是按字符處理,再疊加到下面代碼已經體現的高計算成本算法之上,開銷實在太大了。
class Slice:
    """A window of ``length`` items taken from ``data`` at ``offset``.

    Attributes:
        prefix: the part of ``data`` before the window (``data[:offset]``).
        root: the window itself (``data[offset:offset + length]``).
        suffix: the part of ``data`` after the window.

    Equality and ``len()`` look at ``root`` only; ``prefix`` and ``suffix``
    do not take part in comparisons.
    """

    def __init__(self, data, offset, length):
        end = offset + length
        self.prefix = data[:offset]
        self.root = data[offset:end]
        self.suffix = data[end:]

    def __eq__(self, other):
        """Return whether both roots are equal.

        Objects without a ``root`` attribute yield ``NotImplemented`` so that
        ``slice_obj == None`` evaluates to ``False`` instead of raising
        ``AttributeError``.
        """
        try:
            return self.root == other.root
        except AttributeError:
            # Not slice-like: let Python try the reflected comparison and
            # then fall back to its default (identity) result.
            return NotImplemented

    def __len__(self):
        """Return the number of items in ``root``."""
        return len(self.root)
################################################################################
class Match:
    """A data slice paired with a key slice, plus the sub-matches around it.

    Attributes:
        data: the slice taken from the searched data.
        key: the slice taken from the key.
        prefix_tree: object exposing ``value()`` for the matches found before
            this one.
        suffix_tree: object exposing ``value()`` for the matches found after
            this one.
    """

    def __init__(self, data, key, prefix_tree, suffix_tree):
        self.data = data
        self.key = key
        self.prefix_tree = prefix_tree
        self.suffix_tree = suffix_tree
        # The score is computed once, at construction time: the length of the
        # key slice plus the scores reported by both neighbouring trees.
        own_score = len(key)
        before_score = prefix_tree.value()
        after_score = suffix_tree.value()
        self.__value = own_score + before_score + after_score

    def value(self):
        """Return the score computed when the match was created."""
        return self.__value
################################################################################
class Tree(tuple):
    """Tuple of ``Match`` nodes that remembers the highest node score."""

    def __new__(cls, nodes):
        tree = super().__new__(cls, nodes)
        # An empty tree scores 0; otherwise the score is the best node score.
        if tree:
            best = max(Match.value(node) for node in tree)
        else:
            best = 0
        tree.__value = best
        return tree

    def value(self):
        """Return the score cached when the tree was built."""
        return self.__value

    def find(self, value):
        """Return the index of the first node whose score equals ``value``.

        Raises:
            ValueError: if no node has that score.
        """
        hits = (
            index for index, match in enumerate(self)
            if match.value() == value
        )
        index = next(hits, None)
        if index is None:
            raise ValueError()
        return index
################################################################################
def search(data, key):
    """Collect the longest equal slices of ``data`` and ``key`` as a ``Tree``.

    Data slices are produced by ``shrink`` (longest first). For every data
    slice, each key slice of the same length is compared against it. On a
    match, the parts before and after both slices are searched recursively
    and a ``Match`` is recorded. Once a match has been found, slices of the
    same length are still examined, but the search stops as soon as a
    shorter slice shows up.
    """
    matches = []
    matched_len = 0
    for data_slice in shrink(data, len(key)):
        size = len(data_slice)
        if size < matched_len:
            # Only shorter slices remain; the longest matches are complete.
            break
        for key_slice in slide(key, size):
            if not data_slice == key_slice:
                continue
            matched_len = size
            before = search(data_slice.prefix, key_slice.prefix)
            after = search(data_slice.suffix, key_slice.suffix)
            matches.append(Match(data_slice, key_slice, before, after))
    return Tree(matches)
def shrink(data, max_len):
    """Yield every slice of ``data``, from the longest length down to 1.

    The starting length is ``min(len(data), max_len)``; for each length the
    slices come from ``slide`` in its order.
    """
    longest = min(len(data), max_len)
    for size in range(longest, 0, -1):
        yield from slide(data, size)
def slide(data, length):
    """Yield a ``Slice`` of ``length`` items for each offset in ``data``.

    Offsets run from 0 upward; nothing is yielded when ``length`` exceeds
    ``len(data)``.
    """
    window_count = len(data) - length + 1
    for start in range(window_count):
        yield Slice(data, start, length)
################################################################################
def build_tree(nodes):
    """Turn the best-scoring match of ``nodes`` into a nested key slice.

    The first node whose score equals ``nodes.value()`` is selected. Its key
    slice is returned, and when the match has a non-empty prefix or suffix
    tree, the slice's ``prefix`` / ``suffix`` attribute is overwritten with
    the slice built recursively from that tree.

    Note: the key slice is modified in place.
    """
    top_score = nodes.value()
    best = nodes[nodes.find(top_score)]
    root = best.key
    before = best.prefix_tree
    if before:
        root.prefix = build_tree(before)
    after = best.suffix_tree
    if after:
        root.suffix = build_tree(after)
    return root
def flatten_tree(node):
    """Return the nested slice ``node`` flattened into a tuple.

    Element 0 is a counter that starts at 0 and is advanced by ``_flatten``;
    the remaining elements are whatever ``_flatten`` appends.
    """
    flat = [0]
    _flatten(node, flat)
    return tuple(flat)
def _flatten(node, array):
    """Append ``node``'s prefix, numbered root and suffix to ``array``.

    A prefix or suffix that is itself a ``Slice`` is flattened recursively;
    anything else is appended unchanged. ``array[0]`` is a running counter:
    it is incremented for each root, and the root is stored as
    ``(counter, root)``.
    """
    before = node.prefix
    if not isinstance(before, Slice):
        array.append(before)
    else:
        _flatten(before, array)

    array[0] += 1
    numbered_root = (array[0], node.root)
    array.append(numbered_root)

    after = node.suffix
    if not isinstance(after, Slice):
        array.append(after)
    else:
        _flatten(after, array)
如果能提供一些上下文/文檔,說明這些類各自的用途,會更有幫助…… – 2010-01-30 19:54:30
請不要使用'__'作爲「私有」變量名稱的前綴。請改用'_':這種寫法更常見,也能減輕我們閱讀代碼時的負擔。 – 2010-01-30 20:25:03
我有點困惑;你的問題到底是什麼? – mithrandi 2010-01-31 03:15:55