Java 使用Stanford CoreNLP进行CorefResolution

Java 使用Stanford CoreNLP进行CorefResolution,java,nlp,stanford-nlp,Java,Nlp,Stanford Nlp,我正在尝试使用Stanford CoreNLP执行Coref解析。我使用的版本是stanford-corenlp-full-2015-12-09。基本上,我写了一些课程: import edu.stanford.nlp.dcoref.CorefChain; import edu.stanford.nlp.dcoref.CorefCoreAnnotations; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nl

我正在尝试使用Stanford CoreNLP执行Coref（指代消解）。我使用的版本是stanford-corenlp-full-2015-12-09。基本上，我编写了几个类：

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;


public class CorefResolution {
    public static String corefResolute(String text, List<String> tokenToReplace) {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation(text);
        pipeline.annotate(doc);

        Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        System.out.println(corefs);
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        List<String> resolved = new ArrayList<String>();

        for (CoreMap sentence : sentences) {
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

            for (CoreLabel token : tokens) {

                Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
                token.get(Coref)

                if (corefClustId == null) {
                    System.out.println("NULL NULL NULL\n");
                    resolved.add(token.word());
                    continue;
                }
                else {
                    System.out.println("Exist Exist Exist\n");
                }

                System.out.println("coreClustId is "+corefClustId.toString()+"\n");
                CorefChain chain = corefs.get(corefClustId);

                if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
                    resolved.add(token.word());
                } else {
                    int sentINdx = chain.getRepresentativeMention().sentNum - 1;
                    CoreMap corefSentence = sentences.get(sentINdx);
                    List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);

                    CorefChain.CorefMention reprMent = chain.getRepresentativeMention();

                    if (tokenToReplace.contains(token.word())) {
                        for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
                            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
                            resolved.add(matchedLabel.word());
                        }
                    } else {
                        resolved.add(token.word());
                    }
                }
            }
        }

        Detokenizer detokenizer = new Detokenizer();
        String resolvedStr = detokenizer.detokenize(resolved);

        return resolvedStr;
    }
}
导入edu.stanford.nlp.dcoref.CorefChain;
导入edu.stanford.nlp.dcoref.CorefCoreAnnotations;
导入edu.stanford.nlp.ling.core注释;
导入edu.stanford.nlp.ling.corelab;
导入edu.stanford.nlp.pipeline.Annotation;
导入edu.stanford.nlp.pipeline.StanfordCoreNLP;
导入edu.stanford.nlp.util.CoreMap;
导入java.util.ArrayList;
导入java.util.List;
导入java.util.Map;
导入java.util.Properties;
公共类协同解决方案{
公共静态字符串corefResolute(字符串文本,列表标记替换){
Properties props=新属性();
props.put(“注释器”、“标记化、ssplit、pos、引理、ner、解析、dcoref”);
StanfordCoreNLP管道=新的StanfordCoreNLP(道具);
注释单据=新注释(文本);
管道注释(doc);
Map corefs=doc.get(corefcoreanotations.CorefChainAnnotation.class);
System.out.println(corefs);
列出句子=doc.get(coreanotations.SentencesAnnotation.class);
已解析列表=新建ArrayList();
for(CoreMap句子:句子){
List tokens=句子.get(coreanotations.TokensAnnotation.class);
for(CoreLabel令牌:令牌){
整数corefClustId=token.get(corefcoreeannotations.CorefClusterIdAnnotation.class);
token.get(Coref)
if(corefClustId==null){
System.out.println(“NULL\n”);
已解决。添加(token.word());
继续;
}
否则{
System.out.println(“存在\n”);
}
System.out.println(“coreClustId是“+corefClustId.toString()+”\n”);
CorefChain-chain=corefs.get(corefClustId);
if(chain==null | | chain.getReferencesIntextualorder().size()==1){
已解决。添加(token.word());
}否则{
int sentINdx=chain.getRepresentativementation().sentNum-1;
CoreMap corefSentence=句子.get(sentINdx);
列出corefSentenceTokens=corefSentence.get(coreeannotations.TokensAnnotation.class);
CorefChain.corefmotion reprMent=chain.getRepresentativement();
if(tokenToReplace.contains(token.word())){
for(int i=reprMent.startIndex;i
另一个类：

import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;


public class Detokenizer {

    /**
     * Joins a token list back into a plausible sentence string, inserting a single
     * space between tokens except around punctuation and quotes.
     *
     * <p>Fix: the previous version did {@code tokens.add(0, "")}, mutating the
     * caller's list (and throwing on immutable lists such as {@code List.of(...)}).
     * The input list is now left untouched.
     *
     * @param tokens tokens in order; the list is not modified
     * @return the detokenized string ("" for an empty list)
     */
    public String detokenize(List<String> tokens) {
        // Punctuation that should attach directly to the preceding token...
        List<String> noSpaceBefore = new LinkedList<String>(
                Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't"));
        // ...and characters whose following token should attach directly to them.
        // "" is included so the very first token gets no leading space.
        List<String> noSpaceAfter = new LinkedList<String>(
                Arrays.asList("(", "[", "{", "\"", ""));

        StringBuilder sentence = new StringBuilder();

        String prev = "";  // sentinel previous token; "" is in noSpaceAfter
        for (String token : tokens) {
            if (noSpaceBefore.contains(token) || noSpaceAfter.contains(prev)) {
                sentence.append(token);
            } else {
                sentence.append(' ').append(token);
            }

            // Assumption: opening double quotes are always followed by matching closing
            // double quotes. After passing a quote, flip its spacing rule so the first
            // " hugs the following word and the second " hugs the preceding word.
            if ("\"".equals(prev)) {
                if (noSpaceAfter.contains("\"")) {
                    noSpaceAfter.remove("\"");
                    noSpaceBefore.add("\"");
                } else {
                    noSpaceAfter.add("\"");
                    noSpaceBefore.remove("\"");
                }
            }
            prev = token;
        }
        return sentence.toString();
    }
}
导入java.util.array;
导入java.util.List;
导入java.util.LinkedList;
公营脱蛋机{
公共字符串detokenize(列出标记){
//定义标点符号列表,这些标点符号的前后不应有空格
List noSpaceBefore=newlinkedlist(Arrays.asList(“,”,“,”;”,“:“,”,”,“}“,”],“,”,“,“'s”,“n't”);
List noSpaceAfter=newlinkedlist(Arrays.asList(“(”,“[”,“{”,“\”,”);
StringBuilder语句=新建StringBuilder();
tokens.add(0,“”;//在开始处添加一个空标记,因为循环检查为位置-1,“”在noSpaceAfter中
for(inti=1;i
另一个类文件

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;


public class PlainTextCorefResolver {

    // Pronoun spellings eligible for replacement; built once instead of per line.
    private static final List<String> TOKENS_TO_REPLACE =
            Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

    /**
     * Reads {@code inputFile} line by line (UTF-8), runs coreference resolution on
     * each line via {@link CorefResolution#corefResolute}, and writes the resolved
     * lines to {@code outputFile} (UTF-8). Errors are reported to stderr.
     *
     * @param inputFile  existing UTF-8 text file to resolve
     * @param outputFile destination file (overwritten)
     */
    public static void resolveFile(File inputFile, File outputFile) {
        // Diagnose missing input before opening any streams.
        if (!inputFile.exists()) {
            System.err.println("Input file does not exist: " + inputFile.getAbsoluteFile());
            return;
        }

        // try-with-resources guarantees both streams are closed even if
        // annotation of some line throws (the old version leaked them).
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {

            String line;
            while ((line = reader.readLine()) != null) {
                writer.println(CorefResolution.corefResolute(line, TOKENS_TO_REPLACE));
            }

        } catch (Exception e) {
            System.err.println("Failed to open/resolve input file [" + inputFile.getAbsoluteFile() + "] in loader");
            e.printStackTrace();
        }
    }

    /** Entry point: resolves a hard-coded input path to a sibling ".resolved" file. */
    public static void main(String[] args) {
        String inputFileName = "path/file.txt";
        String outputFileName = "path/file.resolved.txt";
        File inputFile = new File(inputFileName);
        File outputFile = new File(outputFileName);
        resolveFile(inputFile, outputFile);
    }
}
import java.io.*;
导入java.nio.charset.charset;
导入java.util.array;
导入java.util.List;
公共类PlainTextCorefResolver{
公共静态void解析文件(文件inputFile、文件outputFile){
试一试{
BufferedReader=新的BufferedReader(新的InputStreamReader(新文件InputStream(inputFile),Charset.forName(“UTF-8”));
PrintWriter writer=新的PrintWriter(输出文件,“UTF-8”);
if(inputFile.exists())System.out.println(“input exist\n”);
else System.out.println(“输入不存在\n”);
if(outputFile.exists())System.out.println(“output exist\n”);
else System.out.println(“输出不存在\n”);
while(true){
字符串行=reader.readLine();
//EOF
如果(行==null)
打破
//解析线
List tokenToReplace=array.asList(“他”、“他”、“她”、“她”、“它”、“它”、“它们”、“它们”);/!!!
String resolvedLine=CorefResolution.corefResolute(行,标记替换);
writer.println(resolvedLine);
}
阅读器关闭(
import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;

public class CorefExample {

    /**
     * Minimal coref demo: annotates a two-sentence text with the statistical
     * coref pipeline, then prints every coref chain with its mentions, followed
     * by the raw mention list per sentence.
     */
    public static void main(String[] args) throws Exception {

        Properties settings = new Properties();
        settings.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP engine = new StanfordCoreNLP(settings);

        Annotation document =
                new Annotation("John Kerry is the secretary of state.  He ran for president in 2004.");
        engine.annotate(document);

        System.out.println("---");
        System.out.println("coref chains");

        // Each chain groups mentions that refer to the same entity.
        for (CorefChain chain : document.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }

        // Raw mention-detection output, sentence by sentence.
        for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention detected : sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + detected);
            }
        }
    }
}
INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOfRange(Arrays.java:3664)
    at java.lang.String.<init>(String.java:207)
    at java.lang.StringBuilder.toString(StringBuilder.java:407)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
    at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at java.util.HashMap.readObject(HashMap.java:1394)
    at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
    at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
    at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
    at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
    at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
    at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
    at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
    at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
    at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
    at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
    at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)

Process finished with exit code 1