2013-07-31 42 views

回答

1

我相信訓練器(trainer)和分類器(categorizer)的方法都允許傳入自定義的特徵產生器(feature generator),但它們必須是 FeatureGenerator 介面的實現,而 BigramFeatureGenerator 並不是這樣的實現。於是我快速寫了一個實現,作為下面程式碼中的內部類;有機會時請試試看:

import java.io.FileInputStream; 
    import java.io.IOException; 
    import java.io.InputStream; 
    import java.util.ArrayList; 
    import java.util.Arrays; 
    import java.util.Collection; 
    import java.util.Collections; 
    import java.util.List; 
    import opennlp.tools.doccat.DoccatModel; 
    import opennlp.tools.doccat.DocumentCategorizerME; 
    import opennlp.tools.doccat.DocumentSample; 
    import opennlp.tools.doccat.DocumentSampleStream; 
    import opennlp.tools.doccat.FeatureGenerator; 
    import opennlp.tools.util.ObjectStream; 
    import opennlp.tools.util.PlainTextByLineStream; 



    /**
     * Example of training and using an OpenNLP document categorizer whose
     * features are word bigrams instead of the default bag of words.
     *
     * <p>The same {@link MyBigramFeatureGenerator} is handed to both the trainer
     * and the classifier so that the features seen at classification time are
     * produced exactly the way the training features were.
     */
    public class DoccatUsingBigram {

        /**
         * Trains a model from the file named by {@code args[0]} and classifies a
         * sample sentence with it.
         *
         * @param args {@code args[0]} is the path of the training data file, read
         *             line by line as UTF-8 and parsed by {@code DocumentSampleStream}
         * @throws IllegalArgumentException if no training file is given
         * @throws IOException if the training file cannot be opened or closed
         */
        public static void main(String[] args) throws IOException {
            if (args.length < 1) {
                throw new IllegalArgumentException(
                        "usage: DoccatUsingBigram <training-data-file>");
            }

            InputStream dataIn = new FileInputStream(args[0]);
            try {
                ObjectStream<String> lineStream =
                        new PlainTextByLineStream(dataIn, "UTF-8");

                // Build the model with the custom feature generator.
                ObjectStream<DocumentSample> sampleStream =
                        new DocumentSampleStream(lineStream);
                DoccatModel model = DocumentCategorizerME.train(
                        "en", sampleStream, 10, 100, new MyBigramFeatureGenerator());

                // Use the model. The classifier must be given the same feature
                // generator that was used for training; it then applies the
                // generator itself, so it is fed the raw tokens rather than
                // pre-computed bigrams.
                DocumentCategorizerME classifier =
                        new DocumentCategorizerME(model, new MyBigramFeatureGenerator());
                String[] someData = "whatever you are trying to classify".split(" ");

                // One score per category; classifier.getBestCategory(categorize)
                // would turn this into the winning category name.
                double[] categorize = classifier.categorize(someData);

            } catch (IOException e) {
                // Failed to read or parse training data, training failed
                e.printStackTrace();
            } finally {
                // Always release the file handle, whether or not training succeeded.
                dataIn.close();
            }
        }

        /**
         * Feature generator that turns a token sequence into its consecutive
         * word bigrams.
         */
        public static class MyBigramFeatureGenerator implements FeatureGenerator {

            /**
             * Returns every pair of adjacent tokens in {@code text}, each pair
             * concatenated into a single feature string.
             *
             * <p>NOTE(review): the tokens are joined with an empty separator, so
             * {@code ["ab", "c"]} and {@code ["a", "bc"]} yield the same feature.
             * This is kept as-is because changing it would change the feature
             * strings a previously trained model expects.
             *
             * @param text the tokens of the document
             * @return the bigram features in document order; empty when
             *         {@code text} has fewer than two tokens
             */
            @Override
            public Collection<String> extractFeatures(String[] text) {
                return generate(Arrays.asList(text), 2, "");
            }

            /**
             * Builds all n-grams of {@code input}.
             *
             * @param input     the tokens
             * @param n         number of tokens per gram, at least 1
             * @param separator text placed between the tokens of one gram
             * @return the n-grams in order; empty when {@code input} has fewer
             *         than {@code n} tokens
             * @throws IllegalArgumentException if {@code n} is less than 1
             */
            private List<String> generate(List<String> input, int n, String separator) {
                if (n < 1) {
                    throw new IllegalArgumentException("n must be >= 1, was " + n);
                }

                List<String> outGrams = new ArrayList<String>();
                // A gram starting at 'start' needs tokens start .. start + n - 1.
                for (int start = 0; start + n <= input.size(); start++) {
                    StringBuilder gram = new StringBuilder();
                    for (int x = start; x < start + n; x++) {
                        if (x > start) {
                            gram.append(separator);
                        }
                        gram.append(input.get(x));
                    }
                    outGrams.add(gram.toString());
                }
                return outGrams;
            }
        }
    }

希望這對你有幫助。

相關問題