Java 无法更新OpenNLP模型_Java_Nlp_Opennlp

Java 无法更新OpenNLP模型

java nlp

Java 无法更新打开的NLP模型,java,nlp,opennlp,Java,Nlp,Opennlp,我正在为我的一个项目使用ApacheOpenNLP。我正在创建一个新模型来确定位置，因为预先训练的模型（en ner location.bin）没有这个位置代码如下： package com.equinox.nlp; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java

我正在一个项目中使用 Apache OpenNLP。由于预训练模型（en-ner-location.bin）无法识别我需要的某些地点，我正在创建一个新模型来识别地点。

代码如下：

package com.equinox.nlp;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;

public class NlpTesting {
/** Name finders keyed by a String; declared here but not referenced by any method in this listing. */
protected Map<String, NameFinderME> finders;
/** Tokenizer used by findLocation, which assigns SimpleTokenizer.INSTANCE to it on every call. */
protected Tokenizer tokenizer;

/**
 * Demo entry point: runs the stock location model on two sample sentences,
 * trains the custom location model, then runs the custom model on the
 * sentence about Bankura.
 *
 * @param args command-line arguments (unused)
 * @throws InvalidFormatException if a model file cannot be parsed
 * @throws IOException if a model or the training file cannot be read or written
 */
public static void main(String[] args) throws InvalidFormatException,
        IOException {
    final String londonSentence = "London is the capital city of England and the United Kingdom.";
    final String bankuraSentence = "In the 2011 census, Bankura municipality had a population of 138,036, out of which 70,734 were males and 67,302 were females.";

    final NlpTesting demo = new NlpTesting();

    // Baseline: the pre-trained model applied to both sentences.
    final NameFinderME stockFinder = demo.createNameFinder("./opennlp-models/en-ner-location.bin");
    demo.findLocation(londonSentence, stockFinder);
    System.out.println("--------------------------");
    demo.findLocation(bankuraSentence, stockFinder);

    // Training writes the custom model file that is loaded just below.
    demo.train();

    final NameFinderME customFinder = demo.createNameFinder("./opennlp-models/en-ner-custom-location.bin");
    demo.findLocation(bankuraSentence, customFinder);
}

/**
 * Tokenizes {@code str}, runs {@code nameFinder} over the tokens and reports
 * the distinct names it detected.
 *
 * <p>Each detected name is the full text covered by its span, so a
 * multi-token name such as "United Kingdom" is reported whole rather than
 * as its first token only. The result is also printed to standard output.
 *
 * <p>Side effect: assigns {@code SimpleTokenizer.INSTANCE} to the
 * {@code tokenizer} field.
 *
 * @param str text to search for names
 * @param nameFinder name finder to apply to the tokenized text
 * @return the distinct names, each followed by a comma (including the
 *         last one); an empty string when nothing was found. Order follows
 *         the iteration order of a {@code HashSet} and is not guaranteed.
 * @throws InvalidFormatException declared for caller compatibility
 * @throws IOException declared for caller compatibility
 */
public String findLocation(String str,NameFinderME nameFinder) throws InvalidFormatException,
        IOException {
    tokenizer = SimpleTokenizer.INSTANCE;

    String tokens[] = tokenizer.tokenize(str);
    Span nameSpans[] = nameFinder.find(tokens);

    // spansToStrings joins every token a span covers (space separated),
    // instead of keeping only the token at the span's start index.
    String[] names = Span.spansToStrings(nameSpans, tokens);

    // A set removes names that were detected more than once.
    HashSet<String> locationSet = new HashSet<String>();
    Collections.addAll(locationSet, names);

    StringBuilder joined = new StringBuilder();
    for (String location : locationSet) {
        // The trailing comma after the last name is kept for output compatibility.
        joined.append(location).append(",");
    }
    String commaSeparatedLocationNames = joined.toString();
    System.out.println(commaSeparatedLocationNames);
    return commaSeparatedLocationNames;
}

/**
 * Trains a "location" name finder model from {@code ./train/train.txt}
 * (read as UTF-8) and writes the serialized model to
 * {@code ./opennlp-models/en-ner-custom-location.bin}.
 *
 * <p>The model is always built from the complete training file; any
 * existing output file is overwritten.
 *
 * @throws IOException if the training file cannot be read or the model
 *         file cannot be written
 */
public void train() throws IOException {
    File trainerFile = new File("./train/train.txt");
    File output = new File("./opennlp-models/en-ner-custom-location.bin");

    TokenNameFinderModel model;
    FileInputStream trainingIn = new FileInputStream(trainerFile);
    try {
        ObjectStream<String> lineStream = new PlainTextByLineStream(
                trainingIn, "UTF-8");
        ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
                lineStream);
        System.out.println("lineStream = " + lineStream);
        model = NameFinderME.train("en", "location",
                sampleStream, Collections.<String, Object> emptyMap());
    } finally {
        // Release the training file handle even when training fails.
        trainingIn.close();
    }

    BufferedOutputStream modelOut = null;
    try {
        modelOut = new BufferedOutputStream(new FileOutputStream(output));
        model.serialize(modelOut);
    } finally {
        if (modelOut != null) {
            modelOut.close();
        }
    }
}

/**
 * Loads a name finder model from the given file and wraps it in a
 * {@code NameFinderME}.
 *
 * @param str path of the serialized model file
 * @return a name finder backed by the loaded model
 * @throws InvalidFormatException if the file is not a valid model
 * @throws FileNotFoundException if the file does not exist or cannot be opened
 * @throws IOException if reading the file fails
 */
public NameFinderME createNameFinder(String str) throws InvalidFormatException,
        FileNotFoundException, IOException {
    FileInputStream modelIn = new FileInputStream(new File(str));
    try {
        return new NameFinderME(new TokenNameFinderModel(modelIn));
    } finally {
        // The model is read inside the constructor, so the file handle can be released here.
        modelIn.close();
    }
}

package com.equinox.nlp；
导入java.io.BufferedOutputStream；
导入java.io.File；
导入java.io.FileInputStream；
导入java.io.FileNotFoundException；
导入java.io.FileOutputStream；
导入java.io.IOException；
导入java.util.Collections；
导入java.util.HashSet；
导入java.util.Iterator；
导入java.util.Map；
导入opennlp.tools.namefind.NameFinderME；
导入opennlp.tools.namefind.NameSample；
导入opennlp.tools.namefind.NameSampleDataStream；
导入opennlp.tools.namefind.TokenNameFinderModel；
导入opennlp.tools.tokenize.SimpleTokenizer；
导入opennlp.tools.tokenize.Tokenizer；
导入opennlp.tools.util.InvalidFormatException；
导入opennlp.tools.util.ObjectStream；
导入opennlp.tools.util.PlainTextByLineStream；
导入opennlp.tools.util.Span；
公开课{
受保护的地图搜寻者；
受保护的标记器标记器；
公共静态void main（字符串[]args）引发InvalidFormatException，
IOException{
String bankura=“在2011年人口普查中，bankura市人口为138036人，其中男性70734人，女性67302人。”；
String london=“伦敦是英国和英国的首都。”；
NlpTesting NlpTesting=新NlpTesting（）；
NameFinderME nameFinderA=nlpTesting.createNameFinder（“./opennlp models/en ner location.bin”）；
nlpTesting.findLocation（伦敦，名为Findera）；
System.out.println（“-----------------------------------”）；
nlpTesting.findLocation（班库拉，名为Findera）；
nlpTesting.train（）；
NameFinderME nameFinderB=nlpTesting.createNameFinder（“./opennlp models/en ner custom location.bin”）；
nlpTesting.findLocation（班库拉，名称FinderB）；
}
公共字符串查找位置（字符串str、NameFinderME nameFinder）引发InvalidFormatException，
IOException{
字符串commaSeparatedLocationNames=“”；
标记器=SimpleTokenizer.INSTANCE；
字符串标记[]=tokenizer.tokenize（str）；
Span nameSpans[]=nameFinder.find（令牌）；
HashSet locationSet=新HashSet（）；
for（int i=0；i


}
到目前为止，它运行良好
问题是我无法将其他位置添加到我创建的自定义模型中。
所以，我浏览了OpenNLP-README文档
在那里，它说，“注意：为了训练一个模型，你需要所有的训练数据。目前没有一种机制用额外的数据来更新与项目一起分发的模型。”
这是否意味着我也无法更新我的自定义模型？有没有办法做到这一点？很可能我在创建模型时没有所有的数据，并且应该有更新模型的选项。请帮助我。
它的意思就是：每次要添加新的训练实例时，都需要从头开始重新训练整个模型
如果您需要在不重新训练的情况下更新模型，那么OpenNLP并不适合您的需求。
非常感谢。每次添加新的内容时，我都会保存所有的训练数据以生成新的模型。