以下代碼應該可以正常工作:
/// <summary>
/// Builds a compact regular expression that matches exactly a given set of
/// tokens by merging their common prefixes in a prefix tree (trie).
/// </summary>
public static class StemmingUtilities
{
    /// <summary>
    /// One character of the prefix tree. The root node carries no character.
    /// </summary>
    private class Node
    {
        /// <summary>Character stored in this node; null only for the root.</summary>
        public char? Value { get; private set; }

        /// <summary>Parent node; null only for the root.</summary>
        public Node Parent { get; private set; }

        /// <summary>Child nodes, kept in insertion order so the output is deterministic.</summary>
        public List<Node> Children { get; private set; }

        /// <summary>
        /// True when at least one token ends exactly at this node. Needed so that a
        /// token which is a prefix of another token (e.g. "a" and "ab") is not lost.
        /// </summary>
        public bool IsTerminal { get; set; }

        public Node(char? c, Node parent)
        {
            this.Value = c;
            this.Parent = parent;
            this.Children = new List<Node>();
        }
    }

    /// <summary>
    /// Creates a regular expression pattern that matches every string in
    /// <paramref name="tokens"/> and nothing else (when the pattern is anchored
    /// by the caller; the returned pattern itself contains no anchors).
    /// </summary>
    /// <param name="tokens">The literal strings the pattern must match.</param>
    /// <returns>
    /// The pattern, using capturing groups for alternations, e.g.
    /// <c>wh(at|y|e(re|n)|ich)</c>. An empty sequence yields an empty string.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="tokens"/> is null.</exception>
    /// <exception cref="System.ArgumentException"><paramref name="tokens"/> contains a null entry.</exception>
    public static string GetRegex(IEnumerable<string> tokens)
    {
        if (tokens == null)
        {
            throw new System.ArgumentNullException("tokens");
        }

        var root = new Node(null, null);
        foreach (var token in tokens)
        {
            if (token == null)
            {
                throw new System.ArgumentException("Tokens must not contain null entries.", "tokens");
            }

            var current = root;
            foreach (char c in token)
            {
                var node = current.Children.Find(x => x.Value == c);
                if (node == null)
                {
                    node = new Node(c, current);
                    current.Children.Add(node);
                }
                current = node;
            }

            // Remember that a token ends here, even if longer tokens continue below.
            current.IsTerminal = true;
        }

        var pattern = new System.Text.StringBuilder();
        BuildRexp(root, pattern);
        return pattern.ToString();
    }

    /// <summary>
    /// Appends to <paramref name="sb"/> the pattern for everything that may follow
    /// <paramref name="root"/>, i.e. the alternation of all of its sub-trees.
    /// </summary>
    /// <param name="root">Root of the (sub-)tree to render.</param>
    /// <param name="sb">Builder receiving the pattern text.</param>
    private static void BuildRexp(Node root, System.Text.StringBuilder sb)
    {
        int count = root.Children.Count;
        if (count == 0)
        {
            return;
        }

        // A token ends at this node while other tokens continue: the continuation
        // must be optional, otherwise the shorter token would not match.
        bool optional = root.IsTerminal;
        bool addBracket = count > 1 || optional;
        // Uncomment the following line to avoid the outermost brackets
        // (which occur when the root has several children).
        // addBracket = addBracket && (root.Parent != null || optional);
        if (addBracket)
        {
            sb.Append('(');
        }

        for (int i = 0; i < count; i++)
        {
            var child = root.Children[i];
            // Escape so that metacharacters inside tokens are matched literally.
            sb.Append(System.Text.RegularExpressions.Regex.Escape(child.Value.ToString()));
            BuildRexp(child, sb);
            if (i < count - 1)
            {
                sb.Append('|');
            }
        }

        if (addBracket)
        {
            sb.Append(')');
        }
        if (optional)
        {
            sb.Append('?');
        }
    }
}
用法:
// Example 1: all tokens share the prefix "wh", so no outer brackets are needed.
string[] toStem1 = { "what", "why", "where", "when", "which" };
var reg1 = StemmingUtilities.GetRegex(toStem1);
// reg1 == "wh(at|y|e(re|n)|ich)"

// Example 2: two different first letters, so the whole pattern is wrapped in brackets.
string[] toStem2 = { "why", "abc", "what", "where", "apple", "when" };
var reg2 = StemmingUtilities.GetRegex(toStem2);
// reg2 == "(wh(y|at|e(re|n))|a(bc|pple))"
編輯:
若要得到 reg2 = "wh(y|at|e(re|n))|a(bc|pple)"
(即不帶最外層的括號),只需取消 BuildRexp 方法中被註釋掉的那一行的註釋
即可。
如果我沒記錯的話,有一個 Perl 套件可以做到這一點,你只需要將它翻譯成 C#…… – kennytm 2010-11-14 10:39:34
我不確定是否存在這樣做的庫,但有一種方法是將單詞加載到一個 trie(前綴樹)中,然後遍歷該 trie 以生成正則表達式。 http://en.wikipedia.org/wiki/Trie – Ani 2010-11-14 10:55:44