Java OpenNLP训练泰语_Java_Nlp_Opennlp

Java OpenNLP训练泰语

java nlp

Java OpenNLP训练泰语,java,nlp,opennlp,Java,Nlp,Opennlp,我正在使用OpenNLP1.7.2和maxent-3.0.0.jar进行泰语培训，下面是读取泰语培训数据并创建bin模型的代码 public class TrainPerson { public static void main(String[] args) throws IOException { String trainFile = "/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train"; String modelFil

我正在使用 OpenNLP 1.7.2 和 maxent-3.0.0.jar 训练泰语模型，下面是读取泰语训练数据并创建 bin 模型的代码：

/**
 * Trains an OpenNLP person-name finder model from a training file and
 * serializes the resulting model to disk.
 */
public class TrainPerson {

    /**
     * Trains the model from the hard-coded training file and writes it to the
     * hard-coded model file.
     *
     * @param args unused
     * @throws IOException if the training data cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String trainFile = "/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train";
        String modelFile = "/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";
        writePersonModel(trainFile, modelFile);
    }

    /**
     * Reads name samples line by line (UTF-8) from {@code trainFile}, trains a
     * "person" name finder with language code "th" using the default training
     * parameters, and serializes the model to {@code modelFile}.
     *
     * @param trainFile path of the training data file
     * @param modelFile path the serialized model is written to
     * @throws FileNotFoundException if a file cannot be opened
     * @throws IOException if reading the samples or writing the model fails
     */
    private static void writePersonModel(String trainFile, String modelFile)
            throws FileNotFoundException, IOException {

        Charset charset = Charset.forName("UTF-8");
        InputStreamFactory fileInputStream = new MarkableFileInputStreamFactory(new File(trainFile));

        TokenNameFinderModel model;
        // try-with-resources: a failure while closing the stream no longer
        // replaces an exception thrown by training itself (it is attached as
        // a suppressed exception instead).
        try (ObjectStream<NameSample> sampleStream =
                new NameSampleDataStream(new PlainTextByLineStream(fileInputStream, charset))) {
            model = NameFinderME.train("th", "person", sampleStream,
                    TrainingParameters.defaultParams(), new TokenNameFinderFactory());
        }

        // Same reasoning for the output side: the stream is always closed and
        // a serialization error is never masked by a close() error.
        try (BufferedOutputStream modelOut =
                new BufferedOutputStream(new FileOutputStream(modelFile))) {
            model.serialize(modelOut);
        }
    }
}

公共类列车员{
公共静态void main（字符串[]args）引发IOException{
字符串trainFile=“/Documents/workspace/ThaiOpenNLP/bin/thaiPerson.train”；
字符串modelFile=“/Documents/workspace/ThaiOpenNLP/bin/th ner person.bin”；
writePersonModel（trainFile，modelFile）；
}
私有静态void writePersonModel（字符串trainFile，字符串modelFile）
抛出FileNotFoundException，IOException{
Charset Charset=Charset.forName（“UTF-8”）；
InputStreamFactory文件InputStream=新标记文件InputStreamFactory（新文件（trainFile））；
ObjectStream lineStream=新的明文ByLineStream（fileInputStream，字符集）；
ObjectStream sampleStream=新名称采样数据流（lineStream）；
TokenNameFinderModel模型；
试一试{
model=NameFinderME.train（“th”，“person”，sampleStream，TrainingParameters.defaultParams（），new-TokenNameFinderFactory（））；
}最后{
sampleStream.close（）；
}
BufferedOutputStream modelOut=null；
试一试{
modelOut=newbufferedoutputstream（newfileoutputstream（modelFile））；
序列化（modelOut）；
}最后{
if（modelOut！=null）{
modelOut.close（）；
}
}
}}

泰语训练数据见所附文件。

我使用输出的模型来检测人名，如下面的程序所示，但它无法识别出人名。

public class ThaiPersonNameFinder {

static String modelFile = "/Users/avinashpaula/Documents/workspace/ThaiOpenNLP/bin/th-ner-person.bin";

public static void main(String[] args) {

    try {
        InputStream modelIn = new FileInputStream(new File(modelFile));
      TokenNameFinderModel model = new TokenNameFinderModel(modelIn);
      NameFinderME nameFinder = new NameFinderME(model);
      String sentence[] = new String[]{
                "จอห์น",
                "30",
                "ปี",
                "จะ",
                "เข้าร่วม",
                "ก",
                "เริ่มต้น",
                "ขึ้น",
                "บน",
                "มกราคม",
                "."
                };

    Span nameSpans[] = nameFinder.find(sentence);
    for (int i = 0; i < nameSpans.length; i++) {
        System.out.println(nameSpans[i]);
    }
    }
    catch (IOException e) {
      e.printStackTrace();
    }
}

公共类ThaiPersonNameFinder{
静态字符串modelFile=“/Users/avinashpaula/Documents/workspace/ThaiOpenNLP/bin/th ner person.bin”；
公共静态void main（字符串[]args）{
试一试{
InputStream modelIn=新文件InputStream（新文件（modelFile））；
TokenNameFinderModel model=新的TokenNameFinderModel（modelIn）；
NameFinderME nameFinder=新的NameFinderME（模型）；
字符串语句[]=新字符串[]{
"จอห์น",
"30",
"ปี",
"จะ",
"เข้าร่วม",
"ก",
"เริ่มต้น",
"ขึ้น",
"บน",
"มกราคม",
"."
};
Span nameSpans[]=nameFinder.find（句子）；
for（int i=0；i


}
我做错了什么？
在训练阶段你有任何输出或错误吗？关于你的语料库，你应该删除任何标签，如
，因为它将被当作泰语标记处理。我看不懂泰语，但我可以看到你多次重复同样的两句话。由于这个原因，这个训练数据不好。关于maxent-3.0.0.jar，您不需要它。OpenNLP包含它的所有依赖项。您在培训阶段有任何输出或错误吗？关于您的语料库，您应该删除任何标记，例如
，因为它将作为泰语标记处理。我看不懂泰语，但我可以看到您多次重复相同的两句话。由于这个原因，这份训练数据并不好。关于maxent-3.0.0.jar，您不需要它。OpenNLP包含它的所有依赖项。