I am trying to use the following rules file (designation.rules.txt):
ENV.defaultStringPatternFlags = 2   // 2 = java.util.regex.Pattern.CASE_INSENSITIVE

// Annotation keys used by the rules below
ner = { type: "CLASS", value: "edu.stanford.nlp.ling.CoreAnnotations$NamedEntityTagAnnotation" }
tokens = { type: "CLASS", value: "edu.stanford.nlp.ling.CoreAnnotations$TokensAnnotation" }

// Words that count as (part of) a designation
$Designation = (
  /CFO/ |
  /Director/ |
  /CEO/ |
  /Chief/ |
  /Executive/ |
  /Officer/ |
  /Vice/ |
  /President/ |
  /Senior/ |
  /Financial/
)

ENV.defaults["ruleType"] = "tokens"

// Stage 1: tag matching tokens with ner = DESIGNATION
ENV.defaults["stage"] = 1
{
  pattern: ( $Designation ),
  action: ( Annotate($0, ner, "DESIGNATION") )
}

// Stage 2: extract hasDesignation(PERSON, DESIGNATION) tuples
ENV.defaults["stage"] = 2
{
  ruleType: "tokens",
  pattern: ( ([ { ner:PERSON } ]) /has/ ([ { ner:DESIGNATION } ]+) ),
  result: Format("hasDesignation(%s,%s)", $1.word, Join(" ", $2.word))
}
together with the Java file below to generate the annotations:
package org.itcookies.nlpdemo;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

/**
 * Demo illustrating how to use TokensRegexAnnotator.
 */
public class TokensRegexAnnotatorDemo {

  public static void main(String[] args) throws IOException {
    PrintWriter out;
    String rules;
    if (args.length > 0) {
      rules = args[0];
    } else {
      rules = "org/itcookies/nlp/rules/designation.rules.txt";
    }
    if (args.length > 2) {
      out = new PrintWriter(args[2]);
    } else {
      out = new PrintWriter(System.out);
    }

    Properties properties = new Properties();
    properties.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,tokensregexdemo");
    properties.setProperty("customAnnotatorClass.tokensregexdemo", "edu.stanford.nlp.pipeline.TokensRegexAnnotator");
    properties.setProperty("tokensregexdemo.rules", rules);
    StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);

    Annotation annotation;
    if (args.length > 1) {
      annotation = new Annotation(IOUtils.slurpFileNoExceptions(args[1]));
    } else {
      annotation = new Annotation("John is CEO of ITCookies");
    }
    pipeline.annotate(annotation);

    // An Annotation is a Map, and you can get and use the various analyses individually.
    out.println();
    // The toString() method on an Annotation just prints the text of the Annotation,
    // but you can see what is in it with other methods like toShorterString().
    out.println("The top level annotation");
    out.println(annotation.toShorterString());

    List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      // NOTE: Depending on which TokensRegex rules are specified, there are other
      // annotations of interest besides the tokens printed out here.
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        // Print out word, lemma, POS, NE, and normalized NE
        String word = token.get(CoreAnnotations.TextAnnotation.class);
        String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        String normalized = token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
        if ("DESIGNATION".equals(ne)) {
          out.println("token: " + "word=" + word + ", lemma=" + lemma + ", pos=" + pos + ", ne=" + ne + ", normalized=" + normalized);
        }
      }
    }
    out.flush();
  }
}
which produces the following output:
The top level annotation
[Text=John is CEO of ITCookies Tokens=[John-1, is-2, CEO-3, of-4, ITCookies-5] Sentences=[John is CEO of ITCookies]]
token: word=CEO, lemma=CEO, pos=NNP, ne=DESIGNATION, normalized=null
You need to train your own NER model and introduce your own label 'DESIGNATION' into your training set. Take a look at their documentation: http://nlp.stanford.edu/software/crf-faq.shtml#a – meghamind
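For reference, here is a minimal sketch of what that FAQ describes; the file names are illustrative and the feature flags are copied from the FAQ's example properties, not from this question. The training data is a tab-separated file with one token per line and its label, using O for tokens that are not part of a designation, for example:

John	O
is	O
Chief	DESIGNATION
Executive	DESIGNATION
Officer	DESIGNATION
of	O
ITCookies	O

A properties file (say designation.prop) then tells edu.stanford.nlp.ie.crf.CRFClassifier where the data is and which features to use:

# assumed file names; feature flags as in the CRF FAQ example
trainFile = designation-train.tsv
serializeTo = designation-ner-model.ser.gz
map = word=0,answer=1
useClassFeature = true
useWord = true
useNGrams = true
noMidNGrams = true
maxNGramLeng = 6
usePrev = true
useNext = true
useSequences = true
usePrevSequences = true
maxLeft = 1
useTypeSeqs = true
useTypeSeqs2 = true
useTypeySequences = true
wordShape = chris2useLC
useDisjunctive = true

Training is run with something like java -cp stanford-ner.jar edu.stanford.nlp.ie.crf.CRFClassifier -prop designation.prop, which writes the serialized model to designation-ner-model.ser.gz.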
To train the Stanford classifier for 'DESIGNATION' you will need a lot of training data; you have to collect a larger dataset, because a small amount of data may not give you correct results. –
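Once such a model has been trained and serialized, one way to use it in the same kind of pipeline is to list it in the NER annotator's ner.model property ahead of the default CoreNLP models, so that DESIGNATION is added while PERSON and the other standard tags are kept. A minimal sketch, assuming the hypothetical model file designation-ner-model.ser.gz from the training sketch above:

package org.itcookies.nlpdemo;

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class CustomNerPipelineSketch {

  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    // Custom DESIGNATION model first, then the default English models so that
    // standard tags such as PERSON are still assigned.
    props.setProperty("ner.model",
        "designation-ner-model.ser.gz,"  // hypothetical custom model from the training sketch above
        + "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz,"
        + "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz,"
        + "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation annotation = new Annotation("John is CEO of ITCookies");
    pipeline.annotate(annotation);

    // Print each token together with the NER tag the combined models assigned to it.
    for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
      System.out.println(token.word() + "\t" + token.ner());
    }
  }
}

With a trained model in place, the TokensRegex stage-1 dictionary rule above would presumably become unnecessary, and the stage-2 extraction rule could rely on the NER tags directly.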