Java 一个字符一个字符地读取一个巨大的文本文件需要很长时间
我正在使用Java 一个字符一个字符地读取一个巨大的文本文件需要很长时间,java,Java,我正在使用FileInputStream逐字符阅读一个9kb的Java文本文件,几乎需要一分钟的阅读时间。此性能是否良好,或者可以通过使用其他流如缓冲区并在内存中一次读取整个数据来优化此性能 // This method is used to read the Brown Corpus public void readBrownCorpus(String corpusPath) throws IOException { FileInputStream inputStream = null
FileInputStream
逐字符阅读一个9kb的Java文本文件,几乎需要一分钟的阅读时间。此性能是否良好,或者可以通过使用其他流
如缓冲区
并在内存中一次读取整个数据来优化此性能
// This method is used to read the Brown Corpus
public void readBrownCorpus(String corpusPath) throws IOException {
FileInputStream inputStream = null;
try {
inputStream = new FileInputStream(corpusPath);
int letter = 0; // denote current read letter
String previousTag = "^";
StringBuilder wordWithTag = new StringBuilder(); // denote the string which
while((letter = inputStream.read()) != -1) {
if(((char) letter) != ' ')
wordWithTag.append((char) letter);
else {
String word[] = wordWithTag.substring(0).split("_");
if(word != null && word.length != 2)
throw new Exception("Error in the Format of Corpus");
// If new tag found, insert this in both transitionTable and emissionTable
if(transitionTable.get(word[1]) == null) {
insertTagInTransitionTable(previousTag, word[1]);
}
updateTranstionTable(previousTag, word[1]);
updateEmissionTable(word[0], word[1]);
// update the previous Tag
if(word[1].equals("."))
previousTag = "^";
else
previousTag = word[1];
wordWithTag.setLength(0); //empty the wordWithTag for new word
System.out.println(transitionTable.size());
}
}
} catch(IOException ioException) {
ioException.printStackTrace();
} catch(Exception exception) {
exception.printStackTrace();
}
finally {
inputStream.close();
}
}
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package demo;
import java.util.*;
import java.io.*;
/**
*
* @author Jatin Khurana
*/
public class Main {
public HashMap<String,Row> transitionTable; // Transition Table
public HashMap<String,Row> emissionTable; // Emission Table
// Constructor
public Main()
{
transitionTable=new HashMap<String,Row>();
emissionTable=new HashMap<String,Row>();
prepareInitialTransitionTable();
//prepareInitialEmissionTable();
}
// This method prepare the initial transition Table
private void prepareInitialTransitionTable()
{
Row row1=new Row();
row1.tagCount.put("^", 0.0f);
row1.tagCount.put(".", 0f);
Row row2=new Row();
row2.tagCount.put("^", 0f);
row2.tagCount.put(".", 0f);
transitionTable.put("^", row1);
transitionTable.put(".", row2);
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) throws IOException{
Main m=new Main();
BufferedReader inputStream=null;
try
{
inputStream=new BufferedReader(new FileReader("d://postagger//corpus//brown.txt"));
String corpusData = inputStream.readLine();
String previousTag="^";
String wordWithTag[] = corpusData.split(" ");
for(int i=0;i<wordWithTag.length;i++)
{
String word[]=wordWithTag[i].split("_");
if(word!=null && word.length!=2)
throw new Exception("Error in the Format of Corpus");
// If new tag found,insert this in both transitionTable and emissionTable
if(m.transitionTable.get(word[1])==null)
{
m.insertTagInTransitionTable(previousTag,word[1]);
}
m.updateTranstionTable(previousTag,word[1]);
m.updateEmissionTable(word[0],word[1]);
// update the previous Tag
if(word[1].equals("."))
{
previousTag="^";
}
else
{
previousTag=word[1];
}
System.out.println(m.transitionTable.size());
}
}
catch(IOException ioException)
{
ioException.printStackTrace();
}
catch(Exception exception)
{
exception.printStackTrace();
}
finally
{
inputStream.close();
}
}
private void insertTagInTransitionTable(String previousTag,String newTag) throws CloneNotSupportedException
{
Row row = (Row)transitionTable.get(previousTag);
row.tagCount.put(newTag,0f);
Row newRow=new Row();
transitionTable.put(newTag, newRow);
}
// This method is used to update the transitionTable
private void updateTranstionTable(String previousTag,String currentTag)
{
Row row = transitionTable.get(previousTag);
if(row.tagCount.get(currentTag)==null)
{
row.tagCount.put(currentTag, 1f);
}
else
{
row.tagCount.put(currentTag, row.tagCount.get(currentTag)+1);
}
}
// This method is used to update the emission table
private void updateEmissionTable(String word,String tag)
{
Row row = emissionTable.get(word);
if(row==null)
{
Row newRow=new Row();
newRow.tagCount.put(tag, 1f);
emissionTable.put(word, newRow);
}
else
{
if(row.tagCount.get(tag)==null)
{
row.tagCount.put(tag, 1f);
}
else
{
row.tagCount.put(tag,row.tagCount.get(tag)+1);
}
}
}
}
//此方法用于读取布朗语料库
public void readBrownCorpus(字符串微粒路径)引发IOException{
FileInputStream inputStream=null;
试一试{
inputStream=新文件inputStream(微粒路径);
int letter=0;//表示当前读取的字母
字符串previousTag=“^”;
StringBuilder wordWithTag=new StringBuilder();//表示
while((letter=inputStream.read())!=-1){
如果(((字符)字母)!=“”)
附加((字符)字母);
否则{
字符串字[]=wordWithTag.substring(0.split(“”);
if(word!=null&&word.length!=2)
抛出新异常(“语料库格式错误”);
//如果找到新标记,则将其插入transitionTable和emissionTable中
if(transitionTable.get(word[1])==null){
InsertTagUntransitiontable(以前的标签,单词[1]);
}
UpdateTransitionTable(先前的标签,单词[1]);
updateEmissionTable(字[0],字[1]);
//更新上一个标记
if(字[1]。等于(“.”)
previousTag=“^”;
其他的
previousTag=单词[1];
setLength(0);//为新单词清空wordWithTag
System.out.println(transitionTable.size());
}
}
}捕获(IOException IOException){
ioException.printStackTrace();
}捕获(异常){
异常。printStackTrace();
}
最后{
inputStream.close();
}
}
/*
*要更改此模板,请选择工具|模板
*然后在编辑器中打开模板。
*/
包装演示;
导入java.util.*;
导入java.io.*;
/**
*
*@作者Jatin Khurana
*/
公共班机{
公共HashMap transitionTable;//转换表
public HashMap emissionTable;//emissionTable
//建造师
公用干管()
{
transitionTable=新HashMap();
emissionTable=新HashMap();
准备初始转换表();
//prepareInitialEmissionTable();
}
//此方法用于准备初始过渡表
私有无效准备初始转换表()
{
行row1=新行();
行1.tagCount.put(“^”,0.0f);
行1.tagCount.put(“.”,0f);
行row2=新行();
行2.tagCount.put(“^”,0f);
行2.tagCount.put(“.”,0f);
可传递。放置(“^”,第1行);
可传递。放置(“.”,第2行);
}
/**
*@param指定命令行参数
*/
公共静态void main(字符串[]args)引发IOException{
Main m=新的Main();
BufferedReader inputStream=null;
尝试
{
inputStream=new BufferedReader(新文件阅读器(“d://postager//corpus//brown.txt”);
字符串corpusData=inputStream.readLine();
字符串previousTag=“^”;
字符串wordWithTag[]=corpusData.split(“”);
对于(int i=0;i)用一个缓冲区的INSUBFITCH流包FieldIdPoint进行简单的快速修复,然后考虑使用RealLINE()。你能给我们看一下你的代码吗?是的,我正在更新这个问题9KB远不是一个大文件。即使一次只读取一个字符也不会花费太长时间。我怀疑你的插入和更新方法效率低下,而且耗时太长。对于需要改进的工作代码,请继续问。堆栈溢出是为了让代码正常工作。@Alex sorry文件大小为9684 KB…请解释。很难相信这是没有代码、解释或基础的“解决方案”。使用缓冲区读取器编辑代码…显示net beans id需要1分12秒。。。。