Java 使用Stanford CoreNLP进行CorefResolution

Java 使用Stanford CoreNLP进行CorefResolution,java,nlp,stanford-nlp,Java,Nlp,Stanford Nlp,我正在尝试使用Stanford CoreNLP执行Coref解析。我使用的版本是stanford-corenlp-full-2015-12-09。基本上,我写了一些课程: import edu.stanford.nlp.dcoref.CorefChain; import edu.stanford.nlp.dcoref.CorefCoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nl

我正在尝试使用Stanford CoreNLP执行Coref（指代消解）。我使用的版本是stanford-corenlp-full-2015-12-09。基本上，我编写了几个类：

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;


public class CorefResolution {
    public static String corefResolute(String text, List<String> tokenToReplace) {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation(text);
        pipeline.annotate(doc);

        Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        System.out.println(corefs);
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        List<String> resolved = new ArrayList<String>();

        for (CoreMap sentence : sentences) {
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

            for (CoreLabel token : tokens) {

                Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
                token.get(Coref)

                if (corefClustId == null) {
                    System.out.println("NULL NULL NULL\n");
                    resolved.add(token.word());
                    continue;
                }
                else {
                    System.out.println("Exist Exist Exist\n");
                }

                System.out.println("coreClustId is "+corefClustId.toString()+"\n");
                CorefChain chain = corefs.get(corefClustId);

                if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
                    resolved.add(token.word());
                } else {
                    int sentINdx = chain.getRepresentativeMention().sentNum - 1;
                    CoreMap corefSentence = sentences.get(sentINdx);
                    List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);

                    CorefChain.CorefMention reprMent = chain.getRepresentativeMention();

                    if (tokenToReplace.contains(token.word())) {
                        for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
                            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
                            resolved.add(matchedLabel.word());
                        }
                    } else {
                        resolved.add(token.word());
                    }
                }
            }
        }

        Detokenizer detokenizer = new Detokenizer();
        String resolvedStr = detokenizer.detokenize(resolved);

        return resolvedStr;
    }
}
导入edu.stanford.nlp.dcoref.CorefChain;
导入edu.stanford.nlp.dcoref.CorefCoreAnnotations;
导入edu.stanford.nlp.ling.core注释;
导入edu.stanford.nlp.ling.corelab;
导入edu.stanford.nlp.pipeline.Annotation;
导入edu.stanford.nlp.pipeline.StanfordCoreNLP;
导入edu.stanford.nlp.util.CoreMap;
导入java.util.ArrayList;
导入java.util.List;
导入java.util.Map;
导入java.util.Properties;
公共类协同解决方案{
公共静态字符串corefResolute(字符串文本,列表标记替换){
Properties props=新属性();
props.put(“注释器”、“标记化、ssplit、pos、引理、ner、解析、dcoref”);
StanfordCoreNLP管道=新的StanfordCoreNLP(道具);
注释单据=新注释(文本);
管道注释(doc);
Map corefs=doc.get(corefcoreanotations.CorefChainAnnotation.class);
System.out.println(corefs);
列出句子=doc.get(coreanotations.SentencesAnnotation.class);
已解析列表=新建ArrayList();
for(CoreMap句子:句子){
List tokens=句子.get(coreanotations.TokensAnnotation.class);
for(CoreLabel令牌:令牌){
整数corefClustId=token.get(corefcoreeannotations.CorefClusterIdAnnotation.class);
token.get(Coref)
if(corefClustId==null){
System.out.println(“NULL\n”);
已解决。添加(token.word());
继续;
}
否则{
System.out.println(“存在\n”);
}
System.out.println(“coreClustId是“+corefClustId.toString()+”\n”);
CorefChain-chain=corefs.get(corefClustId);
if(chain==null | | chain.getReferencesIntextualorder().size()==1){
已解决。添加(token.word());
}否则{
int sentINdx=chain.getRepresentativementation().sentNum-1;
CoreMap corefSentence=句子.get(sentINdx);
列出corefSentenceTokens=corefSentence.get(coreeannotations.TokensAnnotation.class);
CorefChain.corefmotion reprMent=chain.getRepresentativement();
if(tokenToReplace.contains(token.word())){
for(int i=reprMent.startIndex;i
另一个类：

import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;


public class Detokenizer {

    /**
     * Joins a token list back into a plausible sentence string, inserting a single
     * space between tokens except around punctuation and quotes.
     *
     * <p>Fix: the previous version did {@code tokens.add(0, "")}, mutating the
     * caller's list (and throwing on immutable lists such as {@code List.of(...)}).
     * The input list is now left untouched.
     *
     * @param tokens tokens in order; the list is not modified
     * @return the detokenized string ("" for an empty list)
     */
    public String detokenize(List<String> tokens) {
        // Punctuation that should attach directly to the preceding token...
        List<String> noSpaceBefore = new LinkedList<String>(
                Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't"));
        // ...and characters whose following token should attach directly to them.
        // "" is included so the very first token gets no leading space.
        List<String> noSpaceAfter = new LinkedList<String>(
                Arrays.asList("(", "[", "{", "\"", ""));

        StringBuilder sentence = new StringBuilder();

        String prev = "";  // sentinel previous token; "" is in noSpaceAfter
        for (String token : tokens) {
            if (noSpaceBefore.contains(token) || noSpaceAfter.contains(prev)) {
                sentence.append(token);
            } else {
                sentence.append(' ').append(token);
            }

            // Assumption: opening double quotes are always followed by matching closing
            // double quotes. After passing a quote, flip its spacing rule so the first
            // " hugs the following word and the second " hugs the preceding word.
            if ("\"".equals(prev)) {
                if (noSpaceAfter.contains("\"")) {
                    noSpaceAfter.remove("\"");
                    noSpaceBefore.add("\"");
                } else {
                    noSpaceAfter.add("\"");
                    noSpaceBefore.remove("\"");
                }
            }
            prev = token;
        }
        return sentence.toString();
    }
}
导入java.util.array;
导入java.util.List;
导入java.util.LinkedList;
公营脱蛋机{
公共字符串detokenize(列出标记){
//定义标点符号列表,这些标点符号的前后不应有空格
List noSpaceBefore=newlinkedlist(Arrays.asList(“,”,“,”;”,“:“,”,”,“}“,”],“,”,“,“'s”,“n't”);
List noSpaceAfter=newlinkedlist(Arrays.asList(“(”,“[”,“{”,“\”,”);
StringBuilder语句=新建StringBuilder();
tokens.add(0,“”;//在开始处添加一个空标记,因为循环检查为位置-1,“”在noSpaceAfter中
for(inti=1;i
另一个类文件

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;


public class PlainTextCorefResolver {

    // Pronoun spellings eligible for replacement; built once instead of per line.
    private static final List<String> TOKENS_TO_REPLACE =
            Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

    /**
     * Reads {@code inputFile} line by line (UTF-8), runs coreference resolution on
     * each line via {@link CorefResolution#corefResolute}, and writes the resolved
     * lines to {@code outputFile} (UTF-8). Errors are reported to stderr.
     *
     * @param inputFile  existing UTF-8 text file to resolve
     * @param outputFile destination file (overwritten)
     */
    public static void resolveFile(File inputFile, File outputFile) {
        // Diagnose missing input before opening any streams.
        if (!inputFile.exists()) {
            System.err.println("Input file does not exist: " + inputFile.getAbsoluteFile());
            return;
        }

        // try-with-resources guarantees both streams are closed even if
        // annotation of some line throws (the old version leaked them).
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {

            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(CorefResolution.corefResolute(line, TOKENS_TO_REPLACE));
            }

        } catch (Exception e) {
            System.err.println("Failed to open/resolve input file [" + inputFile.getAbsoluteFile() + "] in loader");
            e.printStackTrace();
        }
    }

    /** Entry point: resolves a hard-coded input path to a sibling ".resolved" file. */
    public static void main(String[] args) {
        String inputFileName = "path/file.txt";
        String outputFileName = "path/file.resolved.txt";
        File inputFile = new File(inputFileName);
        File outputFile = new File(outputFileName);
        resolveFile(inputFile, outputFile);
    }
}
import java.io.*;
导入java.nio.charset.charset;
导入java.util.array;
导入java.util.List;
公共类PlainTextCorefResolver{
公共静态void解析文件(文件inputFile、文件outputFile){
试一试{
BufferedReader=新的BufferedReader(新的InputStreamReader(新文件InputStream(inputFile),Charset.forName(“UTF-8”));
PrintWriter writer=新的PrintWriter(输出文件,“UTF-8”);
if(inputFile.exists())System.out.println(“input exist\n”);
else System.out.println(“输入不存在\n”);
if(outputFile.exists())System.out.println(“output exist\n”);
else System.out.println(“输出不存在\n”);
while(true){
字符串行=reader.readLine();
//EOF
如果(行==null)
打破
//解析线
List tokenToReplace=array.asList(“他”、“他”、“她”、“她”、“它”、“它”、“它们”、“它们”);/!!!
String resolvedLine=CorefResolution.corefResolute(行,标记替换);
writer.println(resolvedLine);
}
阅读器关闭(
import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;

public class CorefExample {

    /**
     * Minimal coref demo: annotates a two-sentence text with the statistical
     * coref pipeline, then prints every coref chain with its mentions, followed
     * by the raw mention list per sentence.
     */
    public static void main(String[] args) throws Exception {

        Properties settings = new Properties();
        settings.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP engine = new StanfordCoreNLP(settings);

        Annotation document =
                new Annotation("John Kerry is the secretary of state.  He ran for president in 2004.");
        engine.annotate(document);

        System.out.println("---");
        System.out.println("coref chains");

        // Each chain groups mentions that refer to the same entity.
        for (CorefChain chain : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }

        // Raw mention-detection output, sentence by sentence.
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention detected : sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + detected);
            }
        }
    }
}
INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOfRange(Arrays.java:3664)
    at java.lang.String.<init>(String.java:207)
    at java.lang.StringBuilder.toString(StringBuilder.java:407)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
    at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at java.util.HashMap.readObject(HashMap.java:1394)
    at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
    at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
    at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
    at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
    at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
    at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
    at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
    at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
    at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
    at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
    at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)

Process finished with exit code 1