2010-10-24 28 views

回答

4

Lucene 中提供了帶詞幹提取功能的 Hindi Analyzer。它基於這個 algorithm(pdf)。

+0

您可以更新一下您提供的鏈接嗎?謝謝。 – 2015-03-04 05:58:32

3

hindi_stemmer是Ananthakrishnan Ramanathan和Durgesh D Rao在「A Lightweight Stemmer for Hindi」中描述的印地語詞幹的Python實現。

+0

我是使用它,但它給我錯誤。該怎麼辦? – 2015-02-28 18:07:03

+0

可能你不再看着這個,但是錯誤是什麼? – 2016-07-24 21:39:24

0

我們創造的原代碼Python Hindi Stemmer

雖然原印地語詞幹提取器的作者使用變量 L 的方式在 Java 版本中還不是很清楚,但下面是一份可以運行的完整代碼

import java.util.ArrayList;
import org.apache.commons.lang.StringUtils;

/**
 * Minimal suffix-stripping stemmer for Hindi words.
 *
 * <p>Suffixes are grouped by their length in characters (5 down to 1). The
 * groups are tried longest first; on the first suffix that the word ends with,
 * a fixed number of trailing characters (taken from {@link #cut}) is removed
 * and the result is returned. A word that matches no suffix is returned as is.
 *
 * <p>Only {@code java.lang.String} operations are used, so the class has no
 * third-party dependencies.
 */
public class SimpleHindiStemmer {

    /*
     * The suffix tables below contain Devanagari combining marks that are easily
     * damaged by copy/paste. If a literal looks wrong, re-copy it from the
     * suffixes[1]..suffixes[5] tables of the Python hindi_stemmer.
     */

    /** One-character suffixes. */
    private static final String[] stem1 = { "ो", "े", "ू", "ु", "ी", "ि", "ा" };

    /** Two-character suffixes. */
    private static final String[] stem2 = { "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें" };

    /** Three-character suffixes. */
    private static final String[] stem3 = { "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं" };

    /** Four-character suffixes. */
    private static final String[] stem4 = { "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां" };

    /** Five-character suffixes. */
    private static final String[] stem5 = { "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" };

    /** Suffix groups in the order they are tried: longest suffixes first. */
    private static final String[][] SUFFIX_GROUPS = { stem5, stem4, stem3, stem2, stem1 };

    /**
     * Number of trailing characters removed when a suffix of the given length
     * (the array index) matches. Index 0 is unused.
     *
     * NOTE(review): the values are smaller than the matched suffix length for
     * lengths 2..5, so only part of the matched suffix is removed. The Python
     * hindi_stemmer this was ported from appears to strip the whole suffix -
     * confirm which behavior is wanted before changing the table.
     */
    static int [] cut = new int [] { 0, 1, 1, 1, 2, 2 };

    public SimpleHindiStemmer() {
    }

    /** Small demo: stems one sample word and prints the result. */
    public static void main(String [] argv) {
        SimpleHindiStemmer sm = new SimpleHindiStemmer();
        String word = "रास्ते";
        System.out.println(sm.stemprocess(word));
    }

    /**
     * Stems a single word.
     *
     * @param word the word to stem; may be {@code null}
     * @return the word with trailing characters removed according to the first
     *         matching suffix, the unchanged word when nothing matches, or
     *         {@code null} when {@code word} is {@code null}
     */
    public String stemprocess(String word) {
        if (word == null) {
            return null;
        }

        final int wlen = word.length();
        // The length guard compares three times the character count against
        // (suffix length + 1).
        // NOTE(review): the factor 3 presumably approximates UTF-8 bytes per
        // Devanagari character - confirm; it makes the guard much weaker than a
        // plain character-count comparison would be.
        final int wordlen = wlen * 3;

        for (int group = 0; group < SUFFIX_GROUPS.length; group++) {
            final int suffixLength = SUFFIX_GROUPS.length - group; // 5, 4, 3, 2, 1
            if (wordlen <= suffixLength + 1) {
                continue;
            }
            for (String suffix : SUFFIX_GROUPS[group]) {
                if (word.endsWith(suffix)) {
                    // A match guarantees wlen >= suffixLength >= cut[suffixLength],
                    // so the end index is never negative.
                    return word.substring(0, wlen - cut[suffixLength]);
                }
            }
        }

        return word;
    }
}

正如你所看到的utf-8字符在某些情況下沒有很好的捕獲。看看原始的Python代碼,並從那裏複製後綴值。

0
import java.util.Map; 
import java.util.WeakHashMap; 

/**
 * Light stemmer for Hindi: removes number, gender and case suffixes from
 * nouns and adjectives.
 *
 * <p>At most one suffix is removed per word. Three-character suffixes are
 * tried first, then two-character ones, then single characters; each group is
 * only tried when the word is long enough (see {@link #remove_suffix}).
 *
 * <p>Results are memoized in a static {@link WeakHashMap} shared by all
 * instances. {@code WeakHashMap} is not synchronized, so concurrent use from
 * several threads needs external synchronization.
 */
public class HindiStemmerLight {

    /*
     * Suffixes are written as Unicode escapes so that the Devanagari combining
     * marks cannot pick up stray spaces or be otherwise damaged by copy/paste;
     * a suffix literal with an extra space can never equal a word ending.
     */

    /** Three-character suffixes: "िया", "ियो". */
    private static final String[] THREE_CHAR_SUFFIXES = {
        "\u093F\u092F\u093E", // ि य ा
        "\u093F\u092F\u094B", // ि य ो
    };

    /** Two-character suffixes. */
    private static final String[] TWO_CHAR_SUFFIXES = {
        "\u093E\u090F", // ा ए
        "\u093E\u0913", // ा ओ
        "\u0941\u0906", // ु आ
        "\u0941\u0913", // ु ओ
        "\u092F\u0947", // य े
        "\u0947\u0928", // े न
        "\u0947\u0923", // े ण
        "\u0940\u092F", // ी य
        "\u091F\u0940", // ट ी
        "\u093E\u0930", // ा र
        "\u093E\u0908", // ा ई
    };

    /** One-character suffixes. */
    private static final String[] ONE_CHAR_SUFFIXES = {
        "\u093E", // ा
        "\u0947", // े
        "\u0940", // ी
        "\u094B", // ो
        "\u093F", // ि
        "\u0905", // अ
    };

    /**
     * A cache of words and their stems
     */
    private static final Map<String, String> cache = new WeakHashMap<String, String>();

    /**
     * Default constructor
     */
    public HindiStemmerLight() {
    }

    /**
     * Returns the stem of {@code word}, using the cache when possible.
     *
     * @param word the word to stem; may be {@code null}
     * @return the stemmed word, or {@code null} when {@code word} is {@code null}
     */
    public String stem(String word) {
        if (word == null) {
            // Appending null to a StringBuilder would produce the text "null".
            return null;
        }

        String result = cache.get(word);
        if (result != null) {
            return result;
        }

        // Work on a local buffer so the method keeps no per-instance state.
        StringBuilder buffer = new StringBuilder(word);

        /* remove the case endings from nouns and adjectives */
        remove_suffix(buffer);

        result = buffer.toString();
        cache.put(word, result);
        return result;
    }

    /**
     * Removes at most one suffix from {@code word}, in place.
     *
     * <p>Three-character suffixes need a word longer than 5 characters,
     * two-character suffixes a word longer than 4, and one-character suffixes
     * a word longer than 3.
     *
     * @param word buffer holding the word; shortened when a suffix matches
     */
    private void remove_suffix(StringBuilder word) {
        final int length = word.length();

        if (length > 5 && stripFirstMatch(word, THREE_CHAR_SUFFIXES)) {
            return;
        }
        if (length > 4 && stripFirstMatch(word, TWO_CHAR_SUFFIXES)) {
            return;
        }
        if (length > 3) {
            stripFirstMatch(word, ONE_CHAR_SUFFIXES);
        }
    }

    /**
     * Truncates {@code word} by the first entry of {@code suffixes} it ends with.
     *
     * @return {@code true} when a suffix was removed
     */
    private static boolean stripFirstMatch(StringBuilder word, String[] suffixes) {
        final int length = word.length();
        for (String suffix : suffixes) {
            final int start = length - suffix.length();
            if (start >= 0 && word.substring(start).equals(suffix)) {
                word.setLength(start);
                return true;
            }
        }
        return false;
    }

}

相關問題