Stanford nlp 如何为Stanford关系提取生成自定义训练数据

Stanford nlp 如何为Stanford关系提取生成自定义训练数据,stanford-nlp,Stanford Nlp,我已经训练了一个自定义分类器来理解金融领域中的命名实体。我想生成如下链接所示的自定义培训数据 我可以手工标记自定义关系,但我想先用自定义命名实体生成数据格式,如conll 我也用以下方法尝试了解析器,但这不会生成像链接中提到的Roth和Yih的数据那样的关系训练数据 java-mx150m-cp“stanford-parser-full-2013-06-20/*:”edu.stanford.nlp.parser.lexparser.LexicalizedParser-outputFormat“

我已经训练了一个自定义分类器来理解金融领域中的命名实体。我想生成如下链接所示的自定义培训数据

我可以手工标记自定义关系,但我想先用自定义命名实体生成数据格式,如conll

我也用以下方法尝试了解析器,但这不会生成像链接中提到的Roth和Yih的数据那样的关系训练数据

java-mx150m-cp“stanford-parser-full-2013-06-20/*:”edu.stanford.nlp.parser.lexparser.LexicalizedParser-outputFormat“penn”edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz stanford-parser-full-2013-06-20/data/testsent.txt>testsent.tree

java-mx150m-cp“stanford-parser-full-2013-06-20/*:”edu.stanford.nlp.trees.englishgrammaticstructure-treeFile testsent.tree-conllx

下面是自定义ner的输出,使用下面的python代码单独运行

'java -mx2g -cp "*" edu.stanford.nlp.ie.NERClassifierCombiner '\
                '-ner.model classifiers\custom-model.ser.gz '\
                'classifiers/english.all.3class.distsim.crf.ser.gz,'\
                'classifiers/english.conll.4class.distsim.crf.ser.gz,'\
                'classifiers/english.muc.7class.distsim.crf.ser.gz ' \
                '-textFile '+ outtxt_sent +  ' -outputFormat inlineXML  > ' + outtxt + '.ner'

output:

<PERSON>Charles Sinclair</PERSON> <DESG>Chairman</DESG> <ORGANIZATION>-LRB- age 68 -RRB- Charles was appointed a</ORGANIZATION> <DESG>non-executive director</DESG> <ORGANIZATION>in</ORGANIZATION>

自定义NER问题已解决。

此链接显示了一个数据示例:

我不认为有一种方法可以在斯坦福大学的CoreNLP中实现这一点

标记数据后,需要循环遍历句子并以相同的格式打印标记,包括词性标记和ner标记。似乎大多数列中都有一个“O”

对于每个具有关系的句子,您需要以关系格式打印出句子后面的a行。例如,这一行表示前一句有Live_In关系:

7    0    Live_In
下面是一些生成句子输出的示例代码。您需要将
ner.model
属性设置为自定义模型的路径,从而让管道使用您的自定义
NER
模型。警告:此代码中可能存在一些错误,但它应该能展示如何从 StanfordCoreNLP 的数据结构中访问所需的数据

package edu.stanford.nlp.examples;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;

import java.util.*;
import java.util.stream.Collectors;

public class CreateRelationData {

  public static void main(String[] args) {
    // set up pipeline properties
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions");
    // set up Stanford CoreNLP pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // build annotation for a review
    Annotation annotation = new Annotation("Joe Smith lives in Hawaii.");
    pipeline.annotate(annotation);
    int sentNum = 0;
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      int tokenNum = 1;
      int elementNum = 0;
      int entityNum = 0;
      CoreMap currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
      String currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
          collect(Collectors.joining("/"));
      String currEntityMentionTags =
          currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
              collect(Collectors.joining("/"));
      String currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
      while (tokenNum <= sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
        if (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).get(0).index() == tokenNum) {
          String entityText = currEntityMention.toString();
          System.out.println(sentNum+"\t"+currEntityMentionNER+"\t"+elementNum+"\t"+"O\t"+currEntityMentionTags+"\t"+
              currEntityMentionWords+"\t"+"O\tO\tO");
          // update tokenNum
          tokenNum += (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).size());
          // update entity if there are remaining entities
          entityNum++;
          if (entityNum < sentence.get(CoreAnnotations.MentionsAnnotation.class).size()) {
            currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
            currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
                collect(Collectors.joining("/"));
            currEntityMentionTags =
                currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
                    collect(Collectors.joining("/"));
            currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
          }
        } else {
          CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(tokenNum-1);
          System.out.println(sentNum+"\t"+token.ner()+"\t"+elementNum+"\tO\t"+token.tag()+"\t"+token.word()+"\t"+"O\tO\tO");
          tokenNum += 1;
        }
        elementNum += 1;
      }
      sentNum++;
    }
    System.out.println();
    System.out.println("O\t3\tLive_In");
  }
}
package edu.stanford.nlp.examples;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;

import java.util.*;
import java.util.stream.Collectors;

public class CreateRelationData {

  public static void main(String[] args) {
    // 设置管道属性
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions");
    // 设置 Stanford CoreNLP 管道
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // 为句子生成注释
    Annotation annotation = new Annotation("Joe Smith lives in Hawaii.");
    pipeline.annotate(annotation);
    int sentNum = 0;
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      int tokenNum = 1;
      int elementNum = 0;
      int entityNum = 0;
      CoreMap currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
      String currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
          collect(Collectors.joining("/"));
      String currEntityMentionTags =
          currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
              collect(Collectors.joining("/"));
      String currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
      while (tokenNum <= sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
        if (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).get(0).index() == tokenNum) {
          String entityText = currEntityMention.toString();
          System.out.println(sentNum+"\t"+currEntityMentionNER+"\t"+elementNum+"\t"+"O\t"+currEntityMentionTags+"\t"+
              currEntityMentionWords+"\t"+"O\tO\tO");
          // 更新 tokenNum
          tokenNum += (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).size());
          // 如果还有剩余实体,更新当前实体
          entityNum++;
          if (entityNum < sentence.get(CoreAnnotations.MentionsAnnotation.class).size()) {
            currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
            currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
                collect(Collectors.joining("/"));
            currEntityMentionTags =
                currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
                    collect(Collectors.joining("/"));
            currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
          }
        } else {
          CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(tokenNum-1);
          System.out.println(sentNum+"\t"+token.ner()+"\t"+elementNum+"\tO\t"+token.tag()+"\t"+token.word()+"\t"+"O\tO\tO");
          tokenNum += 1;
        }
        elementNum += 1;
      }
      sentNum++;
    }
    System.out.println();
    System.out.println("O\t3\tLive_In");
  }
}

StanfordNLPHelp,感谢您发送的代码。它不会生成我训练过的自定义实体。我是按如下方式加载自定义分类器的:props.setProperty("ner.model", "classifiers/custom-model.ser.gz,classifiers/english.all.3class.distsim.crf.ser.gz,classifiers/english.conll.4class.distsim.crf.ser.gz,classifiers/english.muc.7class.distsim.crf.ser.gz");。我单独测试了自定义分类器,它可以工作并能识别学位(例如 MBA),但在这里不起作用。我们的目标是使用这些训练数据来训练一个自定义的关系分类器。你能帮忙吗?——NER 工作正常吗?它是否为学位创建了一个实体?请在原始问题中添加一个不起作用的示例,并提供尽可能多的详细信息,包括您使用的代码。——我已编辑了原始问题。另外,我还有一个问题:我只想在"财务"和"总监"一起出现时才给"财务总监"加标签,而不是在"财务"单独出现时(例如出现"财务总监"时标注,但出现"财务部"时不标注)。我应该另开一个问题吗?——另外,如果您使用的是统计模型,则无法指定它如何标记数据。如果您想设置规则,您可能需要使用类似 RegexNER 的东西,这里有更多信息。——我的代码中原来有一个 bug:我没有更新当前的 currEntityMention。我已在上面的答案中修正了这个问题,这将解决所有实体都被标成 PERSON 的问题。
package edu.stanford.nlp.examples;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.*;

import java.util.*;
import java.util.stream.Collectors;

public class CreateRelationData {

  public static void main(String[] args) {
    // set up pipeline properties
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions");
    // set up Stanford CoreNLP pipeline
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    // build annotation for a review
    Annotation annotation = new Annotation("Joe Smith lives in Hawaii.");
    pipeline.annotate(annotation);
    int sentNum = 0;
    for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
      int tokenNum = 1;
      int elementNum = 0;
      int entityNum = 0;
      CoreMap currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
      String currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
          collect(Collectors.joining("/"));
      String currEntityMentionTags =
          currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
              collect(Collectors.joining("/"));
      String currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
      while (tokenNum <= sentence.get(CoreAnnotations.TokensAnnotation.class).size()) {
        if (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).get(0).index() == tokenNum) {
          String entityText = currEntityMention.toString();
          System.out.println(sentNum+"\t"+currEntityMentionNER+"\t"+elementNum+"\t"+"O\t"+currEntityMentionTags+"\t"+
              currEntityMentionWords+"\t"+"O\tO\tO");
          // update tokenNum
          tokenNum += (currEntityMention.get(CoreAnnotations.TokensAnnotation.class).size());
          // update entity if there are remaining entities
          entityNum++;
          if (entityNum < sentence.get(CoreAnnotations.MentionsAnnotation.class).size()) {
            currEntityMention = sentence.get(CoreAnnotations.MentionsAnnotation.class).get(entityNum);
            currEntityMentionWords = currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.word()).
                collect(Collectors.joining("/"));
            currEntityMentionTags =
                currEntityMention.get(CoreAnnotations.TokensAnnotation.class).stream().map(token -> token.tag()).
                    collect(Collectors.joining("/"));
            currEntityMentionNER = currEntityMention.get(CoreAnnotations.EntityTypeAnnotation.class);
          }
        } else {
          CoreLabel token = sentence.get(CoreAnnotations.TokensAnnotation.class).get(tokenNum-1);
          System.out.println(sentNum+"\t"+token.ner()+"\t"+elementNum+"\tO\t"+token.tag()+"\t"+token.word()+"\t"+"O\tO\tO");
          tokenNum += 1;
        }
        elementNum += 1;
      }
      sentNum++;
    }
    System.out.println();
    System.out.println("O\t3\tLive_In");
  }
}