2016-05-09 126 views
1

例如,斯坦福情樹庫如何將標籤添加到沒有標籤的解析樹?

"(2 (2 (2 near) (2 (2 the) (2 end))) (3 (3 (2 takes) (2 (2 on) (2 (2 a) (2 (2 whole) (2 (2 other) (2 meaning)))))) (2 .)))"解析樹,

其中的數量是每個節點的情緒的標籤。

我想添加POS標記信息到每個節點。如:

"(NP (ADJP (IN near)) (DT the) (NN end)) "

我曾嘗試直接解析這句話,但得到的結果樹與情感樹庫中的樹不同(可能是因爲解析器版本或參數不同;我曾試圖聯繫作者但沒有得到迴應)。

如何獲取標記信息?

回答

1

我認爲 edu.stanford.nlp.sentiment.BuildBinarizedDataset 中的代碼應該會有幫助。其 main() 方法逐步展示瞭如何在 Java 代碼中創建這些二叉樹。

代碼中需要重點關注的幾行關鍵代碼:

LexicalizedParser parser = LexicalizedParser.loadModel(parserModel); 
TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); 
... 
Tree tree = parser.apply(tokens); 
Tree binarized = binarizer.transformTree(tree); 

您可以從樹對象訪問節點的標籤信息。您應該查看edu.stanford.nlp.trees.Tree的javadoc以瞭解如何訪問此信息。

另外,在下面這個答案裏我提供了一些展示如何遍歷訪問樹的代碼:

How to get NN and NNS from a text?

你需要遍歷每棵子樹並調用樹的 label() 方法來獲取各節點的標籤。

這裏是BuildBinarizedDataset.java GitHub上的參考:

https://github.com/stanfordnlp/CoreNLP/blob/master/src/edu/stanford/nlp/sentiment/BuildBinarizedDataset.java

如果有任何不清楚的地方請告訴我,我可以提供進一步的幫助!

+1

對不起,也許我誤導了你的問題。我編輯過它。我想爲每個節點添加POS標記信息。如'(NP(ADJP(IN near))(DT the)(NN end))「' – Matthew

0

首先,你需要下載Stanford Parser

設置

private LexicalizedParser parser; 
private TreeBinarizer binarizer; 
private CollapseUnaryTransformer transformer; 

parser = LexicalizedParser.loadModel(PCFG_PATH); 
binarizer = TreeBinarizer.simpleTreeBinarizer(
     parser.getTLPParams().headFinder(), parser.treebankLanguagePack()); 
transformer = new CollapseUnaryTransformer(); 

解析

Tree tree = parser.apply(tokens); 

訪問POSTAG

// Returns a label (POS tag for preterminals, phrase category for internal
// nodes) for each non-leaf node of the binarized tree, using the same slot
// numbering scheme as the companion constTreeParents method: slots
// [0, nLeaves) belong to the leaves' preterminals, and internal nodes are
// appended in order of first discovery while walking up from each leaf.
public String[] constTreePOSTAG(Tree tree) { 
    // binarize and collapse unary chains so the tree matches the
    // binary format used by the sentiment treebank
    Tree binarized = binarizer.transformTree(tree); 
    Tree collapsedUnary = transformer.transformTree(binarized); 
    Trees.convertToCoreLabels(collapsedUnary); 
    collapsedUnary.indexSpans(); 
    List<Tree> leaves = collapsedUnary.getLeaves(); 
    // one output slot per non-leaf node
    int size = collapsedUnary.size() - leaves.size(); 
    String[] tags = new String[size]; 
    // maps a node's nodeNumber to the slot assigned to it in tags[]
    HashMap<Integer, Integer> index = new HashMap<Integer, Integer>(); 

    // next free slot for internal nodes (preterminal slots come first)
    int idx = leaves.size(); 
    int leafIdx = 0; 
    for (Tree leaf : leaves) { 
     Tree cur = leaf.parent(collapsedUnary); // go to preterminal 
     int curIdx = leafIdx++; 
     boolean done = false; 
     // walk from the preterminal up to the root, stopping early once we
     // reach an ancestor already numbered via a previous leaf's walk
     while (!done) { 
     Tree parent = cur.parent(collapsedUnary); 
     if (parent == null) { 
      // reached the root: record its own label and stop
      tags[curIdx] = cur.label().toString(); 
      break; 
     } 

     int parentIdx; 
     int parentNumber = parent.nodeNumber(collapsedUnary); 
     if (!index.containsKey(parentNumber)) { 
      // first time we see this ancestor: give it the next free slot
      parentIdx = idx++; 
      index.put(parentNumber, parentIdx); 
     } else { 
      // ancestor already has a slot; finish after this iteration
      parentIdx = index.get(parentNumber); 
      done = true; 
     } 

     // NOTE(review): this stores the PARENT's label into cur's slot, so
     // each slot ends up holding the label of the node one level above it
     // (except the root's slot). Confirm this off-by-one-level layout is
     // intended before relying on the output as per-node POS tags.
     tags[curIdx] = parent.label().toString(); 
     cur = parent; 
     curIdx = parentIdx; 
     } 
    } 

    return tags; 
    } 

下面是可運行的完整源代碼 ConstituencyParse.java。運行參數示例: java ConstituencyParse -tokpath outputtoken.toks -parentpath outputparent.txt -tagpath outputag.txt < input_sentence_in_text_file_one_sent_per_line.txt

(注:源代碼改編自 treelstm repo;你還需要修改 preprocess-sst.py 以調用 ConstituencyParse.java 文件)

import edu.stanford.nlp.process.WordTokenFactory; 
import edu.stanford.nlp.ling.HasWord; 
import edu.stanford.nlp.ling.Word; 
import edu.stanford.nlp.ling.CoreLabel; 
import edu.stanford.nlp.ling.TaggedWord; 
import edu.stanford.nlp.process.PTBTokenizer; 
import edu.stanford.nlp.util.StringUtils; 
import edu.stanford.nlp.parser.lexparser.LexicalizedParser; 
import edu.stanford.nlp.parser.lexparser.TreeBinarizer; 
import edu.stanford.nlp.tagger.maxent.MaxentTagger; 
import edu.stanford.nlp.trees.GrammaticalStructure; 
import edu.stanford.nlp.trees.GrammaticalStructureFactory; 
import edu.stanford.nlp.trees.PennTreebankLanguagePack; 
import edu.stanford.nlp.trees.Tree; 
import edu.stanford.nlp.trees.Trees; 
import edu.stanford.nlp.trees.TreebankLanguagePack; 
import edu.stanford.nlp.trees.TypedDependency; 

import java.io.BufferedWriter; 
import java.io.FileWriter; 
import java.io.StringReader; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.Collection; 
import java.util.List; 
import java.util.HashMap; 
import java.util.Properties; 
import java.util.Scanner; 

/**
 * Reads sentences from stdin (one per line), parses each with the Stanford
 * lexicalized parser, and writes out (a) the tokens, (b) a parent-pointer
 * representation of the binarized constituency (or dependency) tree, and
 * (c) the node labels / POS tags. Adapted from the treelstm repo.
 */
public class ConstituencyParse {

    /** Classpath location of the default English PCFG parsing model. */
    private static final String PCFG_PATH = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    // Whether input lines are PTB-tokenized (otherwise split on spaces).
    private boolean tokenize;
    // Output writers; tokWriter stays null when no -tokpath was given.
    private BufferedWriter tokWriter, parentWriter, tagWriter;
    private LexicalizedParser parser;
    private TreeBinarizer binarizer;
    private CollapseUnaryTransformer transformer;
    private GrammaticalStructureFactory gsf;

    /**
     * Opens the output files and loads the parsing model.
     *
     * @param tokPath    file to write the tokenized sentences to, or null to skip
     * @param parentPath file to write parent-pointer representations to
     * @param tagPath    file to write node labels / POS tags to
     * @param tokenize   true to PTB-tokenize input lines, false to split on spaces
     * @throws IOException if an output file cannot be opened
     */
    public ConstituencyParse(String tokPath, String parentPath, String tagPath, boolean tokenize) throws IOException {
        this.tokenize = tokenize;
        if (tokPath != null) {
            tokWriter = new BufferedWriter(new FileWriter(tokPath));
        }
        parentWriter = new BufferedWriter(new FileWriter(parentPath));
        tagWriter = new BufferedWriter(new FileWriter(tagPath));
        parser = LexicalizedParser.loadModel(PCFG_PATH);
        binarizer = TreeBinarizer.simpleTreeBinarizer(
            parser.getTLPParams().headFinder(), parser.treebankLanguagePack());
        transformer = new CollapseUnaryTransformer();

        // set up to produce dependency representations from constituency trees
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        gsf = tlp.grammaticalStructureFactory();
    }

    /** Converts one input line into a token list, PTB-tokenizing when enabled. */
    public List<HasWord> sentenceToTokens(String line) {
        List<HasWord> tokens = new ArrayList<>();
        if (tokenize) {
            // diamond instead of the original raw-type construction
            PTBTokenizer<Word> tokenizer =
                new PTBTokenizer<>(new StringReader(line), new WordTokenFactory(), "");
            while (tokenizer.hasNext()) {
                tokens.add(tokenizer.next());
            }
        } else {
            for (String word : line.split(" ")) {
                tokens.add(new Word(word));
            }
        }
        return tokens;
    }

    /** Runs the constituency parser over an already-tokenized sentence. */
    public Tree parse(List<HasWord> tokens) {
        return parser.apply(tokens);
    }

    /**
     * Returns a label (POS tag for preterminals, phrase category for internal
     * nodes) per non-leaf node of the binarized tree, using the same slot
     * numbering as {@link #constTreeParents(Tree)}: slots [0, nLeaves) for the
     * leaves' preterminals, internal nodes appended in order of discovery.
     */
    public String[] constTreePOSTAG(Tree tree) {
        // binarize and collapse unary chains to match the sentiment-treebank format
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        List<Tree> leaves = collapsedUnary.getLeaves();
        // one output slot per non-leaf node
        int size = collapsedUnary.size() - leaves.size();
        String[] tags = new String[size];
        // maps a node's nodeNumber to its assigned slot in tags[]
        HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

        int idx = leaves.size();
        int leafIdx = 0;
        for (Tree leaf : leaves) {
            Tree cur = leaf.parent(collapsedUnary); // go to preterminal
            int curIdx = leafIdx++;
            boolean done = false;
            // walk up to the root, stopping once we hit an already-visited ancestor
            while (!done) {
                Tree parent = cur.parent(collapsedUnary);
                if (parent == null) {
                    // reached the root: record its own label and stop
                    tags[curIdx] = cur.label().toString();
                    break;
                }

                int parentIdx;
                int parentNumber = parent.nodeNumber(collapsedUnary);
                if (!index.containsKey(parentNumber)) {
                    parentIdx = idx++;
                    index.put(parentNumber, parentIdx);
                } else {
                    parentIdx = index.get(parentNumber);
                    done = true;
                }

                // NOTE(review): stores the PARENT's label into cur's slot —
                // confirm this one-level offset is intended by the caller.
                tags[curIdx] = parent.label().toString();
                cur = parent;
                curIdx = parentIdx;
            }
        }

        return tags;
    }

    /**
     * Returns the 1-based parent-pointer array of the binarized tree
     * (0 marks the root), indexed with the same slot scheme as
     * {@link #constTreePOSTAG(Tree)}.
     */
    public int[] constTreeParents(Tree tree) {
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);
        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();
        List<Tree> leaves = collapsedUnary.getLeaves();
        int size = collapsedUnary.size() - leaves.size();
        int[] parents = new int[size];
        HashMap<Integer, Integer> index = new HashMap<Integer, Integer>();

        int idx = leaves.size();
        int leafIdx = 0;
        for (Tree leaf : leaves) {
            Tree cur = leaf.parent(collapsedUnary); // go to preterminal
            int curIdx = leafIdx++;
            boolean done = false;
            while (!done) {
                Tree parent = cur.parent(collapsedUnary);
                if (parent == null) {
                    // root has no parent: marked with 0
                    parents[curIdx] = 0;
                    break;
                }

                int parentIdx;
                int parentNumber = parent.nodeNumber(collapsedUnary);
                if (!index.containsKey(parentNumber)) {
                    parentIdx = idx++;
                    index.put(parentNumber, parentIdx);
                } else {
                    parentIdx = index.get(parentNumber);
                    done = true;
                }

                // stored 1-based so 0 can mean "root"
                parents[curIdx] = parentIdx + 1;
                cur = parent;
                curIdx = parentIdx;
            }
        }

        return parents;
    }

    /**
     * Converts a constituency parse to a dependency representation and returns
     * the parent-pointer array (root governor has index 0; a value of -1 means
     * the token received no governor).
     */
    public int[] depTreeParents(Tree tree, List<HasWord> tokens) {
        GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
        Collection<TypedDependency> tdl = gs.typedDependencies();
        int len = tokens.size();
        int[] parents = new int[len];
        for (int i = 0; i < len; i++) {
            // if a node still has -1 after the loop below, it has no parent
            parents[i] = -1;
        }

        for (TypedDependency td : tdl) {
            // dependency indices are 1-based; the root governor has index 0
            int child = td.dep().index();
            int parent = td.gov().index();
            parents[child - 1] = parent;
        }

        return parents;
    }

    /** Writes the tokens of one sentence as a single space-separated line. */
    public void printTokens(List<HasWord> tokens) throws IOException {
        int len = tokens.size();
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < len; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            // undo PTB escaping (e.g. -LRB-) when we tokenized the input ourselves
            String word = tokens.get(i).word();
            sb.append(tokenize ? PTBTokenizer.ptbToken2Text(word) : word);
        }
        sb.append('\n');
        tokWriter.write(sb.toString());
    }

    /** Writes one parent-pointer array as a space-separated line. */
    public void printParents(int[] parents) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < parents.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(parents[i]);
        }
        sb.append('\n');
        parentWriter.write(sb.toString());
    }

    /** Writes one tag array as a lower-cased space-separated line. */
    public void printTags(String[] tags) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < tags.length; i++) {
            if (i > 0) {
                sb.append(' ');
            }
            sb.append(tags[i]);
        }
        sb.append('\n');
        tagWriter.write(sb.toString().toLowerCase());
    }

    /** Flushes and closes all output files. */
    public void close() throws IOException {
        if (tokWriter != null) tokWriter.close();
        parentWriter.close();
        tagWriter.close();
    }

    public static void main(String[] args) throws Exception {
        Properties props = StringUtils.argsToProperties(args);
        if (!props.containsKey("parentpath")) {
            System.err.println(
                "usage: java ConstituencyParse -deps - -tokenize - -tokpath <tokpath> -parentpath <parentpath>");
            System.exit(1);
        }

        // whether to tokenize input sentences
        boolean tokenize = props.containsKey("tokenize");
        // whether to produce dependency trees from the constituency parse
        boolean deps = props.containsKey("deps");

        String tokPath = props.containsKey("tokpath") ? props.getProperty("tokpath") : null;
        String parentPath = props.getProperty("parentpath");
        String tagPath = props.getProperty("tagpath");

        ConstituencyParse processor = new ConstituencyParse(tokPath, parentPath, tagPath, tokenize);

        Scanner stdin = new Scanner(System.in);
        int count = 0;
        long start = System.currentTimeMillis();
        // BUG FIX: the original condition "&& count < 2" was a debug leftover
        // that silently truncated processing to the first two input lines.
        while (stdin.hasNextLine()) {
            String line = stdin.nextLine();
            List<HasWord> tokens = processor.sentenceToTokens(line);

            Tree parse = processor.parse(tokens);

            // produce parent pointer representation
            int[] parents = deps ? processor.depTreeParents(parse, tokens)
                                 : processor.constTreeParents(parse);

            String[] tags = processor.constTreePOSTAG(parse);

            // print
            if (tokPath != null) {
                processor.printTokens(tokens);
            }
            processor.printParents(parents);
            processor.printTags(tags);
            // (removed: a StringBuilder was built here and never used)

            count++;
            if (count % 100 == 0) {
                double elapsed = (System.currentTimeMillis() - start) / 1000.0;
                System.err.printf("Parsed %d lines (%.2fs)\n", count, elapsed);
            }
        }

        long totalTimeMillis = System.currentTimeMillis() - start;
        // BUG FIX: seconds were computed as millis/100.0 (off by a factor of 10);
        // also guard the per-line average against count == 0.
        System.err.printf("Done: %d lines in %.2fs (%.1fms per line)\n",
            count, totalTimeMillis / 1000.0,
            count == 0 ? 0.0 : totalTimeMillis / (double) count);
        processor.close();
    }
}