Java OpenNLP训练泰语

Java OpenNLP训练泰语,java,nlp,opennlp,Java,Nlp,Opennlp,我正在使用OpenNLP1.7.2和maxent-3.0.0.jar进行泰语培训,下面是读取泰语培训数据并创建bin模型的代码 public class TrainPerson { public static void main(String[] args) throws IOException { String trainFile = "/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train"; String modelFil

我正在使用 OpenNLP 1.7.2 和 maxent-3.0.0.jar 进行泰语模型训练,下面是读取泰语训练数据并创建 bin 模型的代码:

/**
 * Trains an OpenNLP name finder model of type "person" for language code "th"
 * from a training file and serializes the resulting model to disk.
 */
public class TrainPerson {

    /**
     * Trains the model from the hard-coded training file and writes it to the
     * hard-coded model path.
     *
     * @param args unused
     * @throws IOException if the training data cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String trainFile = "/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train";
        String modelFile = "/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";
        writePersonModel(trainFile, modelFile);
    }

    /**
     * Reads name samples line by line (as UTF-8) from {@code trainFile}, trains a
     * name finder model with the default training parameters, and writes the
     * serialized model to {@code modelFile}.
     *
     * @param trainFile path of the training data file
     * @param modelFile path the serialized model is written to
     * @throws FileNotFoundException if the model file cannot be opened for writing
     * @throws IOException if reading the samples, training, or serializing fails
     */
    private static void writePersonModel(String trainFile, String modelFile)
            throws FileNotFoundException, IOException {

        Charset charset = Charset.forName("UTF-8");
        InputStreamFactory trainingInput = new MarkableFileInputStreamFactory(new File(trainFile));
        TokenNameFinderModel model;

        // try-with-resources closes both streams even when training fails, and a
        // failure in close() is recorded as suppressed instead of replacing the
        // original training exception (which a plain finally block would do).
        try (ObjectStream<String> lineStream = new PlainTextByLineStream(trainingInput, charset);
             ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
            model = NameFinderME.train("th", "person", sampleStream,
                    TrainingParameters.defaultParams(), new TokenNameFinderFactory());
        }

        // The buffered stream is flushed and closed automatically once the model is written.
        try (BufferedOutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
            model.serialize(modelOut);
        }
    }
}
公共类列车员{
公共静态void main(字符串[]args)引发IOException{
字符串trainFile=“/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train”;
字符串modelFile=“/Documents/workspace/ThaiOpenNLP/bin/th ner person.bin”;
writePersonModel(trainFile,modelFile);
}
私有静态void writePersonModel(字符串trainFile,字符串modelFile)
抛出FileNotFoundException,IOException{
Charset Charset=Charset.forName(“UTF-8”);
InputStreamFactory文件InputStream=新标记文件InputStreamFactory(新文件(trainFile));
ObjectStream lineStream=新的明文ByLineStream(fileInputStream,字符集);
ObjectStream sampleStream=新名称采样数据流(lineStream);
TokenNameFinderModel模型;
试一试{
model=NameFinderME.train(“th”,“person”,sampleStream,TrainingParameters.defaultParams(),new-TokenNameFinderFactory());
}最后{
sampleStream.close();
}
BufferedOutputStream modelOut=null;
试一试{
modelOut=newbufferedoutputstream(newfileoutputstream(modelFile));
序列化(modelOut);
}最后{
if(modelOut!=null){
modelOut.close();
}
}
}}
泰语训练数据见所附文件。

我用训练生成的模型来检测人名,如下面的程序所示,但它无法识别出人名。

public class ThaiPersonNameFinder {

static String modelFile = "/Users/avinashpaula/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";

public static void main(String[] args) {

    try {
        InputStream modelIn = new FileInputStream(new File(modelFile));
      TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
      NameFinderME nameFinder = new NameFinderME(model);
      String sentence[] = new String[]{
                "จอห์น",
                "30",
                "ปี",
                "จะ",
                "เข้าร่วม",
                "ก",
                "เริ่มต้น",
                "ขึ้น",
                "บน",
                "มกราคม",
                "."
                };

    Span nameSpans[] = nameFinder.find(sentence);
    for (int i = 0; i < nameSpans.length; i++) {
        System.out.println(nameSpans[i]);
    }
    }
    catch (IOException e) {
      e.printStackTrace();
    }
}
公共类ThaiPersonNameFinder{
静态字符串modelFile=“/Users/avinashpaula/Documents/workspace/ThaiOpenNLP/bin/th ner person.bin”;
公共静态void main(字符串[]args){
试一试{
InputStream modelIn=新文件InputStream(新文件(modelFile));
TokenNameFinderModel model=新的TokenNameFinderModel(modelIn);
NameFinderME nameFinder=新的NameFinderME(模型);
字符串语句[]=新字符串[]{
"จอห์น",
"30",
"ปี",
"จะ",
"เข้าร่วม",
"ก",
"เริ่มต้น",
"ขึ้น",
"บน",
"มกราคม",
"."
};
Span nameSpans[]=nameFinder.find(句子);
for(int i=0;i
}


我做错了什么?

在训练阶段你有任何输出或错误吗?关于你的语料库,你应该删除所有标签(例如 `<br>`),因为它们会被当作泰语词元(token)处理。我看不懂泰语,但我可以看到你多次重复同样的两句话,因此这份训练数据质量不好。至于 maxent-3.0.0.jar,你并不需要它,OpenNLP 已经包含了它的所有依赖项。

您在训练阶段有任何输出或错误吗?关于您的语料库,您应该删除所有标签(例如 `<br>`),因为它们会被当作泰语词元(token)处理。我看不懂泰语,但我可以看到您多次重复相同的两句话,因此这份训练数据质量不好。至于 maxent-3.0.0.jar,您并不需要它,OpenNLP 已经包含了它的所有依赖项。