1
我在 Spark 1.5 上使用 StanfordCoreNLP 2.4.1 解析中文句子,但遇到了 Java 堆 OOM 異常(Spark 中的 Stanford 解析器出現內存不足)。代碼如下所示:
// Classpath resource of the serialized lexicalized-parser grammar
// (presumably the Chinese Xinhua factored model, judging by the file name).
val modelpath: String = "edu/stanford/nlp/models/lexparser/xinhuaFactored.ser.gz"
// Load the parser model once; later pipeline stages reference `lp`.
val lp: LexicalizedParser = LexicalizedParser.loadModel(modelpath)
// Turn each tab-separated input line into (id, Array[(word, tag)]).
//   column 0: record id
//   column 2: segmented text, tokens separated by U+0001 ("\1"), each token
//             written as "word:tag"
// Tokens that do not split into exactly two parts on ":" are discarded, and
// records left without any usable token are filtered out.
val dataWords = data.map { line =>
  val fields = line.split("\t")
  if (fields.length > 2) {
    val commWords = fields(2)
      .split("\1")
      .map(_.split(":")) // split each token once instead of three times
      .collect { case Array(word, tag) => (word, tag) }
    (fields(0), commWords)
  } else {
    // Malformed line (fewer than three columns): yield an empty token list so
    // the filter below drops it, instead of failing the whole task with an
    // ArrayIndexOutOfBoundsException.
    (fields.headOption.getOrElse(""), Array.empty[(String, String)])
  }
}.filter(_._2.nonEmpty)
// Cut every record's token list into sentence fragments at punctuation tokens
// and render each fragment as a space-joined string of words.
//   input : (id, Array[(word, tag)])
//   output: (id, Array[String]); records with no surviving fragment are dropped
// A fragment is kept only if it is non-empty and shorter than 20 characters
// (string length of the joined fragment, spaces included — not a word count).
val dataSenSlice = dataWords.map { case (id, commWords) =>
  // Tag value that the original index arithmetic treated as a sentence delimiter.
  val punctuationTag = "34"

  // Fold tokens into runs of consecutive non-punctuation words; a punctuation
  // token closes the current run and opens a new one. Unlike the previous
  // boundary-index computation, this does not lose a record that consists of
  // exactly one non-punctuation token, and each run is built only once.
  val runs = commWords.foldLeft(Vector(Vector.empty[String])) { (acc, token) =>
    if (token._2 == punctuationTag) acc :+ Vector.empty[String]
    else acc.init :+ (acc.last :+ token._1)
  }

  val senSlice = runs
    .map(_.mkString(" ").trim)
    .filter(sen => sen.nonEmpty && sen.length < 20)
    .toArray
  (id, senSlice)
}.filter(_._2.nonEmpty)
// Parse every sentence fragment and write one line per record:
//   id <TAB> parse_1 <U+0001> parse_2 ...
//
// The parser model is shipped as a broadcast variable instead of being captured
// directly in the map closure. A closure-captured `lp` is deserialized again for
// every task, so concurrently running tasks each hold their own copy of the
// model on the executor heap; a broadcast value is deserialized once per
// executor and shared by its tasks.
// NOTE(review): this assumes StanfordNLPParser.senParse can be called
// concurrently on a shared parser instance — confirm against the wrapper.
val lpBroadcast = dataSenSlice.sparkContext.broadcast(lp)
val dataPoint = dataSenSlice.map { case (id, senSlice) =>
  val parser = lpBroadcast.value
  // StanfordNLPParser.senParse is a Java-side wrapper around the sentence parser.
  val senParse = senSlice.map(sen => StanfordNLPParser.senParse(parser, sen))
  id + "\t" + senParse.mkString("\1")
}
dataPoint.saveAsTextFile(PARSED_MERGED_POI)
我提供給解析器的句子,是由分詞後的詞語以空格連接而成的。
我遇到的異常是:
17/08/09 10:28:15 WARN TaskSetManager: Lost task 1062.0 in stage 0.0 (TID 1219, rz-data-hdp-dn15004.rz.******.com): java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.regex.Pattern.union(Pattern.java:5149)
at java.util.regex.Pattern.clazz(Pattern.java:2513)
at java.util.regex.Pattern.sequence(Pattern.java:2030)
at java.util.regex.Pattern.expr(Pattern.java:1964)
at java.util.regex.Pattern.compile(Pattern.java:1665)
at java.util.regex.Pattern.<init>(Pattern.java:1337)
at java.util.regex.Pattern.compile(Pattern.java:1022)
at java.util.regex.Pattern.matches(Pattern.java:1128)
at java.lang.String.matches(String.java:2063)
at edu.stanford.nlp.parser.lexparser.ChineseUnknownWordModel.score(ChineseUnknownWordModel.java:97)
at edu.stanford.nlp.parser.lexparser.BaseUnknownWordModel.score(BaseUnknownWordModel.java:124)
at edu.stanford.nlp.parser.lexparser.ChineseLexicon.score(ChineseLexicon.java:54)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1602)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1634)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
at edu.stanford.nlp.parser.lexparser.ExhaustivePCFGParser.extractBestParse(ExhaustivePCFGParser.java:1635)
我想知道我做句子解析的方式是否正確,還是有其他地方出了問題。
問題解決了,非常感謝! – guan