How to read a document for named entity recognition in OpenNLP

I am new to Java, and my requirement is to read a document and perform named entity recognition on it. For a simple string I have done the following:

InputStream is = new FileInputStream("data/en-ner-person.bin");
TokenNameFinderModel model = new TokenNameFinderModel(is);
is.close();

NameFinderME nameFinder = new NameFinderME(model);
String[] sentence = new String[]{"Smith",
        "Smithosian",
        "is",
        "a",
        "person"};

Span[] nameSpans = nameFinder.find(sentence);

However, I need to actually read the stream from a document and then generate XML. Can anyone tell me how to do that?

Thanks

Answer

No one ever answered this, so I hope it's not too late.

For entity extraction, you need the document text in String format. Check Stack Overflow for the many ways to get file text into a String (the short answer is to use a BufferedInputStream/BufferedReader for text files, or Apache Tika for MS Office and PDF files).
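As a minimal sketch for the plain-text case (readDoc is a hypothetical helper, not part of the class below; for MS Office or PDF input you would hand the file to Apache Tika instead):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class DocReader
{
    //reads an entire plain-text file into a String
    public static String readDoc(String path) throws IOException
    {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(path)))
        {
            String line;
            while ((line = reader.readLine()) != null)
            {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }
}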

Once you have the doc text in memory, the code below gives you sentence boundary detection, tokenization, and NER. Then take those results and use the docname/docid, possibly some file metadata, along with the actual entity strings, types, and Spans (where each entity sits in the text) to generate the XML doc any way you want (there is a sketch of that step after the class).

This class should get you started:

package processors; 

import java.io.File; 
import java.io.FileInputStream; 
import java.io.InputStream; 
import java.util.ArrayList; 
import java.util.List; 
import opennlp.tools.namefind.NameFinderME; 
import opennlp.tools.namefind.TokenNameFinderModel; 
import opennlp.tools.sentdetect.SentenceDetector; 
import opennlp.tools.sentdetect.SentenceDetectorME; 
import opennlp.tools.sentdetect.SentenceModel; 
import opennlp.tools.tokenize.TokenizerME; 
import opennlp.tools.tokenize.TokenizerModel; 
import opennlp.tools.util.Span; 

public class OpenNLPNER implements Runnable 
{ 

    static TokenizerModel tm = null; 
    static TokenNameFinderModel locModel = null; 
    String doc; 
    NameFinderME myNameFinder; 
    TokenizerME wordBreaker; 
    SentenceDetector sd; 

    public OpenNLPNER() 
    { 
    } 

    public OpenNLPNER(String document, SentenceDetector sd, NameFinderME mf, TokenizerME wordBreaker) 
    { 
     System.out.println("got doc"); 
     this.sd = sd; 
     this.myNameFinder = mf; 
     this.wordBreaker = wordBreaker; 
     doc = document; 
    } 

    private static List<String> getMyDocsFromSomewhere() 
    { 
     //this should return an object that has all the info about the doc you want 
     return new ArrayList<String>(); 
    } 

    public static void main(String[] args) 
    { 
     try 
     { 
      String modelPath = "c:\\temp\\opennlpmodels\\"; 

      if (tm == null) 
      { 
       //user does normal namefinder instantiations... 
       InputStream stream = new FileInputStream(new File(modelPath + "en-token.zip")); 
       // new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelPath + "en-sent.zip")))); 
       tm = new TokenizerModel(stream); 
       // new TokenizerME(tm); 
       locModel = new TokenNameFinderModel(new FileInputStream(new File(modelPath + "en-ner-location.bin"))); 
       // new NameFinderME(locModel); 
      }

      System.out.println("getting data");
      List<String> docs = getMyDocsFromSomewhere(); 
      System.out.println("\tdone getting data"); 
      // FileWriter fw = new FileWriter("C:\\apache\\modelbuilder\\sentences.txt"); 




      for (String docu : docs) 
      { 
       //you could also use the runnable here and launch in a diff thread 
       new OpenNLPNER(docu, 
         new SentenceDetectorME(new SentenceModel(new FileInputStream(new File(modelPath + "en-sent.zip")))), 
         new NameFinderME(locModel), new TokenizerME(tm)).run(); 

      } 

      System.out.println("done"); 


     } catch (Exception ex) 
     { 
      System.out.println(ex); 
     } 
    } 

    @Override 
    public void run() 
    { 
     try 
     { 
      process(doc); 
     } catch (Exception ex) 
     { 
      System.out.println(ex); 
     } 
    } 

    public void process(String document) throws Exception 
    { 

     //clear adaptive data between documents so one doc's entities don't bias the next
     myNameFinder.clearAdaptiveData();
     //user splits doc to sentences 
     String[] sentences = sd.sentDetect(document); 
     //get the sentence spans 
     Span[] sentenceSpans = sd.sentPosDetect(document); 
     Span[][] allnamesInDoc = new Span[sentenceSpans.length][]; 
     String[][] allTokensInDoc = new String[sentenceSpans.length][]; 

     for (int sentenceIndex = 0; sentenceIndex < sentences.length; sentenceIndex++) 
     { 
      String[] stringTokens = wordBreaker.tokenize(sentences[sentenceIndex]);
      //character offsets of each token within the sentence; handy for the XML output
      Span[] tokenSpans = wordBreaker.tokenizePos(sentences[sentenceIndex]);
      Span[] spans = myNameFinder.find(stringTokens); 
      allnamesInDoc[sentenceIndex] = spans; 
      allTokensInDoc[sentenceIndex] = stringTokens; 
     } 

     //now access the data like this...
     for (int s = 0; s < sentenceSpans.length; s++)
     {
      Span[] namesInSentence = allnamesInDoc[s];
      String[] tokensInSentence = allTokensInDoc[s];
      String[] entities = Span.spansToStrings(namesInSentence, tokensInSentence);
      for (int i = 0; i < entities.length; i++)
      {
       //start building up the XML here....
       System.out.println(entities[i] + " was in sentence " + s + " @ " + namesInSentence[i].toString());
      }
     }

    } 
} 
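For the "start building up the XML here" part, here is a minimal sketch using the JDK's built-in StAX writer. The doc/sentence/entity element names and the toXml helper are invented for illustration (OpenNLP does not prescribe any XML shape); it assumes the allTokensInDoc and allNamesInDoc arrays built in process() above:

import java.io.StringWriter;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import opennlp.tools.util.Span;

public class EntityXmlWriter
{
    //turns the per-sentence NER results into a simple XML string
    public static String toXml(String docId, String[][] allTokensInDoc, Span[][] allNamesInDoc) throws Exception
    {
        StringWriter out = new StringWriter();
        XMLStreamWriter xml = XMLOutputFactory.newInstance().createXMLStreamWriter(out);
        xml.writeStartDocument();
        xml.writeStartElement("doc");
        xml.writeAttribute("id", docId);
        for (int s = 0; s < allNamesInDoc.length; s++)
        {
            xml.writeStartElement("sentence");
            xml.writeAttribute("index", String.valueOf(s));
            String[] entities = Span.spansToStrings(allNamesInDoc[s], allTokensInDoc[s]);
            for (int i = 0; i < entities.length; i++)
            {
                Span span = allNamesInDoc[s][i];
                xml.writeStartElement("entity");
                //the span type comes from the model, e.g. "location" for en-ner-location.bin
                xml.writeAttribute("type", span.getType());
                //these are token offsets within the sentence, not character offsets
                xml.writeAttribute("start", String.valueOf(span.getStart()));
                xml.writeAttribute("end", String.valueOf(span.getEnd()));
                xml.writeCharacters(entities[i]);
                xml.writeEndElement();
            }
            xml.writeEndElement();
        }
        xml.writeEndElement();
        xml.writeEndDocument();
        xml.close();
        return out.toString();
    }
}

If you want character positions in the original document instead of token offsets, combine the spans from sentPosDetect() with those from tokenizePos() to map each token back to its absolute offset before writing the attributes.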