    at java.util.ArrayList$Itr.checkForComodification(ArrayList.java:901)
    at java.util.ArrayList$Itr.next(ArrayList.java:851)
    at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1042)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:463)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
    at edu.stanford.nlp.trees.GrammaticalStructure.<init>(GrammaticalStructure.java:201)
    at edu.stanford.nlp.trees.EnglishGrammaticalStructure.<init>(EnglishGrammaticalStructure.java:89)
    at edu.stanford.nlp.semgraph.SemanticGraphFactory.makeFromTree(SemanticGraphFactory.java:139)
    at edu.stanford.nlp.pipeline.DeterministicCorefAnnotator.annotate(DeterministicCorefAnnotator.java:89)
    at edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:68)
    at edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:412)

我附上了一段示例代碼,它在我的 Core i3 370M 筆記本電腦(Win 7 64bit,Java 64bit)上大約 20 秒內就能重現該問題。此應用程序讀取識別文本蘊涵(Recognizing Textual Entailment,RTE)語料庫的 XML 文件,然後使用標準 Java 併發類並行解析所有句子。需要將本地 RTE XML 文件的路徑作爲命令行參數提供。在我的測試中,我使用的是此處公開提供的 XML 文件: http://www.nist.gov/tac/data/RTE/RTE3-DEV-FINAL.tar.gz

package semante.parser.stanford.server; 

import java.io.FileInputStream; 
import java.io.InputStreamReader; 
import java.io.OutputStream; 
import java.io.PrintStream; 
import java.nio.charset.StandardCharsets; 
import java.util.ArrayList; 
import java.util.List; 
import java.util.Properties; 
import java.util.concurrent.ExecutorService; 
import java.util.concurrent.Executors; 
import java.util.concurrent.TimeUnit; 
import java.util.concurrent.atomic.AtomicInteger; 

import javax.xml.bind.JAXBContext; 
import javax.xml.bind.Unmarshaller; 
import javax.xml.bind.annotation.XmlAccessType; 
import javax.xml.bind.annotation.XmlAccessorType; 
import javax.xml.bind.annotation.XmlAttribute; 
import javax.xml.bind.annotation.XmlElement; 
import javax.xml.bind.annotation.XmlRootElement; 

import edu.stanford.nlp.pipeline.Annotation; 
import edu.stanford.nlp.pipeline.StanfordCoreNLP; 

public class StanfordMultiThreadingTest { 

\t @XmlRootElement(name = "entailment-corpus") 
\t @XmlAccessorType (XmlAccessType.FIELD) 
\t public static class Corpus { 
\t \t @XmlElement(name = "pair") 
\t \t private List<Pair> pairList = new ArrayList<Pair>(); 

\t \t public void addPair(Pair p) {pairList.add(p);} 
\t \t public List<Pair> getPairList() {return pairList;} 
\t } 

\t @XmlRootElement(name="pair") 
\t public static class Pair { 

\t \t @XmlAttribute(name = "id") 
\t \t String id; 

\t \t @XmlAttribute(name = "entailment") 
\t \t String entailment; 

\t \t @XmlElement(name = "t") 
\t \t String t; 

\t \t @XmlElement(name = "h") 
\t \t String h; 

\t \t private Pair() {} 

\t \t public Pair(int id, boolean entailment, String t, String h) { 
\t \t \t this(); 
\t \t \t this.id = Integer.toString(id); 
\t \t \t this.entailment = entailment ? "YES" : "NO"; 
\t \t \t this.t = t; 
\t \t \t this.h = h; 
\t \t } 

\t \t public String getId() {return id;} 
\t \t public String getEntailment() {return entailment;} 
\t \t public String getT() {return t;} 
\t \t public String getH() {return h;} 
\t } 
\t class NullStream extends OutputStream { 
\t \t @Override 
\t \t public void write(int b) {} 
\t }; 

\t private Corpus corpus; 
\t private Unmarshaller unmarshaller; 
\t private ExecutorService executor; 

\t public StanfordMultiThreadingTest() throws Exception { 
\t \t javax.xml.bind.JAXBContext jaxbCtx = JAXBContext.newInstance(Pair.class,Corpus.class); 
\t \t unmarshaller = jaxbCtx.createUnmarshaller(); 
\t \t executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()); 
\t } 

\t public void readXML(String fileName) throws Exception { 
\t \t System.out.println("Reading XML - Started"); 
\t \t corpus = (Corpus) unmarshaller.unmarshal(new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8)); 
\t \t System.out.println("Reading XML - Ended"); 
\t } 

\t public void parseSentences() throws Exception { 
\t \t System.out.println("Parsing - Started"); 

\t \t // turn pairs into a list of sentences 
\t \t List<String> sentences = new ArrayList<String>(); 
\t \t for (Pair pair : corpus.getPairList()) { 
\t \t \t sentences.add(pair.getT()); 
\t \t \t sentences.add(pair.getH()); 
\t \t } 

\t \t // prepare the properties 
\t \t final Properties props = new Properties(); 
\t \t props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); 

\t \t // first run is long since models are loaded 
\t \t new StanfordCoreNLP(props); 

\t \t // to avoid the CoreNLP initialization prints (e.g. "Adding annotation pos") 
\t \t final PrintStream nullPrintStream = new PrintStream(new NullStream()); 
\t \t PrintStream err = System.err; 
\t \t System.setErr(nullPrintStream); 

\t \t int totalCount = sentences.size(); 
\t \t AtomicInteger counter = new AtomicInteger(0); 

\t \t // use java concurrency to parallelize the parsing 
\t \t for (String sentence : sentences) { 
\t \t \t executor.execute(new Runnable() { 
\t \t \t \t @Override 
\t \t \t \t public void run() { 
\t \t \t \t \t try { 
\t \t \t \t \t \t StanfordCoreNLP pipeline = new StanfordCoreNLP(props); 
\t \t \t \t \t \t Annotation annotation = new Annotation(sentence); 
\t \t \t \t \t \t pipeline.annotate(annotation); 
\t \t \t \t \t \t if (counter.incrementAndGet() % 20 == 0) { 
\t \t \t \t \t \t \t System.out.println("Done: " + String.format("%.2f", counter.get()*100/(double)totalCount)); 
\t \t \t \t \t \t }; 
\t \t \t \t \t } catch (Exception e) { 
\t \t \t \t \t \t System.setErr(err); 
\t \t \t \t \t \t e.printStackTrace(); 
\t \t \t \t \t \t System.setErr(nullPrintStream); 
\t \t \t \t \t \t executor.shutdownNow(); 
\t \t \t \t \t } 
\t \t \t \t } 
\t \t \t }); 
\t \t } 
\t \t executor.shutdown(); 
\t \t 
\t \t System.out.println("Waiting for parsing to end."); \t \t 
\t \t executor.awaitTermination(10, TimeUnit.MINUTES); 

\t \t System.out.println("Parsing - Ended"); 
\t } 

\t public static void main(String[] args) throws Exception { 
\t \t StanfordMultiThreadingTest smtt = new StanfordMultiThreadingTest(); 
\t \t smtt.readXML(args[0]); 
\t \t smtt.parseSentences(); 
\t } 


在查找背景資料時,我看到了斯坦福大學的 Christopher Manning 和 Gabor Angeli 給出的回答,其中表明當前版本的斯坦福 CoreNLP 應該是線程安全的。然而,最近一份針對 CoreNLP 3.4.1 版的 bug report 描述了一個併發問題。正如標題中提到的,我使用的是 3.5.2 版本。







// Build a single pipeline and request 8 worker threads through the "threads" property.
// NOTE(review): the exact effect of "threads" depends on the CoreNLP version in use — confirm.
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
props.setProperty("threads", "8");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

不過我認爲您的方案本身也應該可行,我們會檢查是否存在併發方面的 bug;在此期間,使用此選項也許可以先解決您的問題。


感謝您的建議。我想嘗試一下,但我不確定該如何使用這個接口。假設我設置了「threads」屬性,我應該如何並行地註釋句子?是使用多個線程來調用 StanfordCoreNLP 實例?還是通過不同於 'annotate()' 的方法一次傳入多個句子?謝謝! – Assaf


「Annotation」構造函數調用的參數實際上不是一個句子,而是整個文檔。請在「sentence」變量中存儲若干個(甚至全部)句子,並用「\n」分隔它們。還要將選項「ssplit.eolonly」設置爲「true」,以防止句子拆分器錯誤地拆分實際的句子。解析之後,註釋對象會包含一個句子列表,其中每個句子都帶有解析樹、詞性(pos)、詞元(lemma)等註釋。 –


謝謝,我試過了。然而,要麼是註釋以 '\n' 分隔的多個句子這種模式存在問題,要麼是我做錯了什麼。我能夠解析 100 個句子,但 1000 或 2000 個就不行:輸入 1000 或 2000 個句子時,對 annotate() 的調用會無休止地運行下去。另外,用 100 個句子測試時,使用 1、2 或 4 個線程(我的硬件有 4 個)在性能上幾乎沒有差別,而且比使用單個線程、每次只用一個句子調用 annotate() 還要稍慢一些。 – Assaf



另請參閱CoreNLP on Apache Spark


感謝您的更新。我會在他們發佈一個新版本時嘗試它。 – Assaf