Java: processing the content of each file in a directory


I am using Stanford CoreNLP to run sentiment analysis on 25,000 individual text movie reviews, all stored in a single directory. To do this I need to modify the Stanford example code slightly, because as written it only analyses the sentences of a single text file.

My attempt is below:

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.FileUtils;

import com.google.common.io.Files;

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/** A simple corenlp example ripped directly from the Stanford CoreNLP website using text from wikinews. */
public class sentimentMain {

  public static void main(String[] args) throws IOException {
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution 
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text from the file..
    Iterator it = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);
    Iterator it1 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\pos"), null, false);
    Iterator it2 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\unsup"), null, false);

    File inputFile  = new File ((String) (it.next()));
    String text = Files.toString(inputFile, Charset.forName("UTF-8"));
    System.out.println(text);

    //File inputFile = new File("C:/stanford-corenlp-full-2016-10-31/input.txt");
    //String text = Files.toString(inputFile, Charset.forName("UTF-8"));

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);

    // run all Annotators on this text
    pipeline.annotate(document);

    // these are all the sentences in this document
    // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    for(CoreMap sentence: sentences) {
      // traversing the words in the current sentence
      // a CoreLabel is a CoreMap with additional token-specific methods
      for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
        // this is the text of the token
        String word = token.get(TextAnnotation.class);
        // this is the POS tag of the token
        String pos = token.get(PartOfSpeechAnnotation.class);
        // this is the NER label of the token
        String ne = token.get(NamedEntityTagAnnotation.class);

        System.out.println("word: " + word + " pos: " + pos + " ne:" + ne);
      }

      // this is the parse tree of the current sentence
      Tree tree = sentence.get(TreeAnnotation.class);
      System.out.println("parse tree:\n" + tree);

      // this is the Stanford dependency graph of the current sentence
      SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
      System.out.println("dependency graph:\n" + dependencies);
    }

    // This is the coreference link graph
    // Each chain stores a set of mentions that link to each other,
    // along with a method for getting the most representative mention
    // Both sentence and token offsets start at 1!
    Map<Integer, CorefChain> graph = 
        document.get(CorefChainAnnotation.class);

  }

}
I know that "it.next()" cannot be cast to a String, but does anyone know another way I can make sure the content of each file is read in as a String for processing?


Thanks in advance :)

This is a straightforward compilation error that any decent IDE would flag: the variable "text" is not visible outside the while loop. Either declare it before the while loop starts, or move the document declaration inside the while loop.

Please find the edited code below:

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.commons.io.FileUtils;

import com.google.common.io.Files;

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations.CorefChainAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/** A simple corenlp example ripped directly from the Stanford CoreNLP website using text from wikinews. */
public class sentimentMain {

  public static void main(String[] args) throws IOException {
    // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution 
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // read some text from the file..
    Iterator it = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);
    Iterator it1 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\pos"), null, false);
    Iterator it2 = FileUtils.iterateFiles(new File("C:\\stanford-corenlp-full-2016-10-31\\train\\unsup"), null, false);

    while (it.hasNext()) {

      File inputFile = new File((String) (it.next()));
      String text = Files.toString(inputFile, Charset.forName("UTF-8"));
      System.out.println(text);

      // create an empty Annotation just with the given text
      Annotation document = new Annotation(text);

      // run all Annotators on this text
      pipeline.annotate(document);

      // these are all the sentences in this document
      // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
      List<CoreMap> sentences = document.get(SentencesAnnotation.class);

      for (CoreMap sentence : sentences) {
        // traversing the words in the current sentence
        // a CoreLabel is a CoreMap with additional token-specific methods
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          // this is the text of the token
          String word = token.get(TextAnnotation.class);
          // this is the POS tag of the token
          String pos = token.get(PartOfSpeechAnnotation.class);
          // this is the NER label of the token
          String ne = token.get(NamedEntityTagAnnotation.class);

          System.out.println("word: " + word + " pos: " + pos + " ne:" + ne);
        }

        // this is the parse tree of the current sentence
        Tree tree = sentence.get(TreeAnnotation.class);
        System.out.println("parse tree:\n" + tree);

        // this is the Stanford dependency graph of the current sentence
        SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
        System.out.println("dependency graph:\n" + dependencies);
      }

      // This is the coreference link graph
      // Each chain stores a set of mentions that link to each other,
      // along with a method for getting the most representative mention
      // Both sentence and token offsets start at 1!
      Map<Integer, CorefChain> graph =
          document.get(CorefChainAnnotation.class);

    }
  }

}

Annotation document = new Annotation(text)

You are trying to access the "text" variable, which is not in scope at this point; you defined it inside the while(it.hasNext()){ loop.
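
To make the scoping concrete, here is a minimal sketch of the two options (the file read is collapsed to one line; note that iterateFiles hands back File objects, which also matters for the follow-up below):

// Option 1: declare "text" before the loop so it is still in scope afterwards
String text = "";
while (it.hasNext()) {
    text = Files.toString((File) it.next(), Charset.forName("UTF-8"));
}
Annotation document = new Annotation(text); // compiles, but only holds the last file read

// Option 2 (what the edited code above does): build and annotate inside the loop
while (it.hasNext()) {
    String reviewText = Files.toString((File) it.next(), Charset.forName("UTF-8"));
    Annotation doc = new Annotation(reviewText);
    pipeline.annotate(doc); // each review is processed as soon as it is read
}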
This code fixed the problem I had, but now it won't let me cast it.next():

Exception in thread "main" java.lang.ClassCastException: java.io.File cannot be cast to java.lang.String at sentimentMain.main(sentimentMain.java:50)
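
That ClassCastException happens because FileUtils.iterateFiles returns an Iterator<File>: it.next() is already a java.io.File, so it cannot be cast to String, and it does not need to be wrapped in new File(...) either. A minimal sketch of the corrected loop, keeping the same commons-io and Guava calls used above:

// Declare the iterator with its element type so no cast is needed at all
Iterator<File> it = FileUtils.iterateFiles(
        new File("C:\\stanford-corenlp-full-2016-10-31\\train\\neg"), null, false);

while (it.hasNext()) {
    File inputFile = it.next(); // already a File; use it directly
    String text = Files.toString(inputFile, Charset.forName("UTF-8"));

    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    // ... sentence and token processing exactly as in the loop above ...
}

If you keep the raw Iterator instead, cast to File rather than String: File inputFile = (File) it.next();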