Nlp 是否有适用于印度语的词干分析器_Nlp_Stemming_Indic

Nlp 是否有适用于印度语的词干分析器

nlp

Nlp 是否有适用于印度语的词干分析器,nlp,stemming,indic,Nlp,Stemming,Indic,是否有针对印度语言（如印地语、泰卢古语）的词干分析器的实现…在Lucene中提供了带有词干分析器的词干分析器。它基于此（pdf）。是Ananthakrishnan Ramanathan和Durgesh D Rao在“”中描述的印地语词干分析器的Python实现我们创建了原始代码的java版本虽然原始印地语词干分析器的作者使用变量L的方式还没有很好的理解，但是有一个完整的代码可以工作导入java.util.ArrayList；导入org.apache.commons.lang.String

是否有针对印度语言（如印地语、泰卢古语）的词干分析器的实现…

Lucene 中提供了自带词干分析器的印地语分析器（Hindi Analyzer）。它基于这篇论文中的算法（pdf）。

hindi_stemmer 是 Ananthakrishnan Ramanathan 和 Durgesh D Rao 在论文《A Lightweight Stemmer for Hindi》中描述的印地语词干分析器的 Python 实现。

我们创建了原始代码的java版本

虽然我们还没有完全理解原始印地语词干分析器的作者使用变量 L 的方式，但下面是一份可以运行的完整代码：

import java.util.ArrayList;
import org.apache.commons.lang.StringUtils;

/**
 * Lightweight suffix-stripping stemmer for Hindi words.
 *
 * <p>The stemmer holds five groups of suffixes, containing suffixes of five,
 * four, three, two and one character(s) respectively. Groups are tried from
 * the longest to the shortest; the first suffix the word ends with is removed
 * in full, provided at least two characters of the word remain afterwards.
 *
 * <p>All lengths are measured in Java {@code char}s. The suffix literals are
 * Devanagari text, so this source file has to be stored and compiled as UTF-8.
 */
public class SimpleHindiStemmer {

    /**
     * Suffix groups ordered from the longest suffixes (five characters) to the
     * shortest (one character). The order matters: the longest matching suffix
     * must win.
     */
    private static final String[][] SUFFIX_GROUPS = {
        { "ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" },
        { "ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां" },
        { "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं" },
        { "कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें" },
        { "ो", "े", "ू", "ु", "ी", "ि", "ा" }
    };

    /** Creates a stemmer; the stemmer itself holds no per-instance state. */
    public SimpleHindiStemmer() {
    }

    /**
     * Small demo: stems one sample word and prints the result.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        SimpleHindiStemmer sm = new SimpleHindiStemmer();
        String word = "रास्ते";
        System.out.println(sm.stemprocess(word));
    }

    /**
     * Strips the longest known suffix from {@code word}.
     *
     * <p>A suffix is only removed when the word is more than one character
     * longer than the suffix, so very short words are returned untouched.
     *
     * @param word the word to stem; may be {@code null}
     * @return the word without its suffix, the unchanged word when no suffix
     *         applies, or {@code null} when {@code word} is {@code null}
     */
    public String stemprocess(String word) {
        if (word == null) {
            return null;
        }

        int wordLength = word.length();
        for (String[] group : SUFFIX_GROUPS) {
            for (String suffix : group) {
                int suffixLength = suffix.length();
                // Compare character counts with character counts, and remove
                // exactly what was matched - not a fixed 1 or 2 characters.
                if (wordLength > suffixLength + 1 && word.endsWith(suffix)) {
                    return word.substring(0, wordLength - suffixLength);
                }
            }
        }

        return word;
    }
}

公共类SimpleHindItemer{
/*粘贴代码时不会出现这种情况，更好地复制Python代码，如后缀[1]中所示*/
私有静态字符串[]stem1=新字符串[]{”ो", "े", "ू", "ु", "ी", "ि", "ा" };
/*粘贴代码时不会出现这种情况。更好地复制Python代码，如后缀[2]中所示*/
私有静态字符串[]stem2=新字符串[]{”कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें" };
私有静态字符串[]stem3=新字符串[]{”ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं" };
私有静态字符串[]stem4=新字符串[]{”ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां" };
私有静态字符串[]stem5=新字符串[]{”ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां" };
私有静态ArrayList stemList=新的ArrayList（5）；
静态int[]cut=newint[]{0,1,1,1,2,2}；
静止的{
stemList.add（stem5）；
stemList.add（stem4）；
stemList.add（stem3）；
stemList.add（stem2）；
stemList.add（stem1）；
}
公共SimpleHindItemer（）{
}
公共静态void main（字符串[]argv）{
SimpleHindItemer sm=新的SimpleHindItemer（）；
字符串字=”रास्ते";
System.out.println（sm.stemprocess（word））；
}
公共字符串stempprocess（字符串字）{
int-wlen=word.length（）；
int-wordlen=wlen*3；
int-icnt=5；
for（字符串[]stemwords:stemList）{
如果（wordlen>（icnt+1））{
用于（字符串sw:stemwords）{
if（StringUtils.endsWith（word，sw））{
返回StringUtils.substring（字，0，wlen-cut[icnt]）；
}
}
}
icnt--；
}
返回词；
}
}

正如您所看到的，utf-8字符在某些情况下并没有很好地捕获。请查看原始python代码，并从中复制后缀值

import java.util.Map;
import java.util.WeakHashMap;

/**
 * Light stemmer for Hindi: removes number, gender and case suffixes from
 * nouns and adjectives.
 *
 * <p>Results are memoised in a static cache that is shared by all instances.
 * The cache is a plain {@link WeakHashMap} with no synchronisation, so this
 * class is not safe for concurrent use.
 *
 * <p>The suffix literals are Devanagari text, so this source file has to be
 * stored and compiled as UTF-8.
 */
public class HindiStemmerLight {

    /**
     * A cache of words and their stems
     */
    private static final Map<String, String> cache = new WeakHashMap<String, String>();

    /**
     * Suffix groups, tried in this order: three-character suffixes first, then
     * two-character, then one-character. Every suffix inside a group has the
     * same length. The literals must not contain padding spaces - a padded
     * literal is longer than the word tail it is compared with and can never
     * match.
     */
    private static final String[][] SUFFIX_GROUPS = {
        { "िया", "ियो" },
        { "ाए", "ाओ", "ुआ", "ुओ", "ये", "ेन", "ेण", "ीय", "टी", "ार", "ाई" },
        { "ा", "े", "ी", "ो", "ि", "अ" }
    };

    /**
     * A suffix is only considered when the word is at least this many
     * characters longer than the suffix (i.e. 6, 5 and 4 characters for
     * suffixes of length 3, 2 and 1).
     */
    private static final int MIN_REMAINING = 3;

    /**
     * Default constructor
     */
    public HindiStemmerLight() {
    }

    /**
     * Returns the stem of {@code word}, using the shared cache when the word
     * has been stemmed before.
     *
     * @param word the word to stem; may be {@code null}
     * @return the stem (the word itself when no suffix applies), or
     *         {@code null} when {@code word} is {@code null}
     */
    public String stem(String word) {
        if (word == null) {
            return null;
        }

        String result = cache.get(word);
        if (result != null) {
            return result;
        }

        StringBuilder buffer = new StringBuilder(word);

        /* remove the case endings from nouns and adjectives */
        remove_suffix(buffer);

        result = buffer.toString();
        cache.put(word, result);

        return result;
    }

    /**
     * Removes at most one suffix from the end of {@code word}, in place.
     * Longer suffixes are tried before shorter ones, and a suffix is skipped
     * when the word is too short for it.
     *
     * @param word buffer holding the word; modified when a suffix matches
     */
    private void remove_suffix(StringBuilder word) {
        int length = word.length();

        for (String[] group : SUFFIX_GROUPS) {
            int suffixLength = group[0].length();
            if (length < suffixLength + MIN_REMAINING) {
                // Too short for this group; a shorter suffix may still fit.
                continue;
            }

            int tailStart = length - suffixLength;
            String tail = word.substring(tailStart);
            for (String suffix : group) {
                if (tail.equals(suffix)) {
                    word.delete(tailStart, length);
                    return;
                }
            }
        }
    }
}

公共类后置灯{
/**
*一堆单词及其词干
*/
静态私有映射缓存=new WeakHashMap（）；
/**
*当前词干的缓冲区
*/
私有StringBuilder sb=新StringBuilder（）；
/**
*默认构造函数
*/
公共后灯{
}
公共字符串词干（字符串字）{
字符串结果=cache.get（word）；
如果（结果！=null）
返回结果；
// 
sb.删除（0，sb.length（））；
// 
某人附加（字）；
/*从名词和形容词中去掉大小写结尾*/
去掉_后缀（sb）；
结果=sb.toString（）；
cache.put（字、结果）；
返回结果；
}
私有void删除_后缀（StringBuilder word）{
int len=word.length（）-1；
/*文章*/
如果（len>4）{
if（字.子串（len-2，len+1）.equals（“िया")) {
删除（len-2，len+1）；
返回；
}
if（字.子串（len-2，len+1）.equals（“ियो")) {
删除（len-2，len+1）；
返回；
}
}/*如果长度>4，则结束*/
如果（len>3）{
if（字子串（len-1，len+1）.equals（“ाए")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ाओ")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ुआ")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ुओ")) {
删除（len-1，len+1）；
返回；
}
if（字的子串（len-1，len+1）.equals（“ये")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ेन")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ेण")) {
删除（len-1，len+1）；
返回；
}
if（字的子串（len-1，len+1）.equals（“ीय")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“टी")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ार")) {
删除（len-1，len+1）；
返回；
}
if（字子串（len-1，len+1）.equals（“ाई")) {
删除（len-1，len+1）；
返回；
}
}/*如果长度>3，则结束*/
如果（len>2）{
if（字.子串（len，len+1）.equals（“ा")) {
删除（len，len+1）；
返回；
}
if（字.子串（len，len+1）.equals（“े")) {
删除（len，len+1）；
重新