在Java中拆分和合并大文件(大小以GB为单位)
假设在Java中拆分和合并大文件(大小以GB为单位),java,file,io,merge,split,Java,File,Io,Merge,Split,假设 我正在将2590400 KB(约2.5 GB)的文件拆分为30个部分 它将生成30个大小为86347KB的文件 这似乎是正确的,2590400/30=86346.666667 现在,如果我再次合并所有部分(30),它将生成3453873kb文件,该文件的大小应为2590410 KB 有谁能帮我解释为什么会有这种差异?我使用下面的代码合并和分割文件 SplitFile.java import java.io.BufferedOutputStream; import java.io.Buf
- 我正在将2590400 KB(约2.5 GB)的文件拆分为30个部分
- 它将生成30个大小为86347KB的文件 这似乎是正确的,2590400/30=86346.666667
- 现在,如果我再次合并所有部分(30),它将生成3453873kb文件,该文件的大小应为2590410 KB
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
/**
* @author vishal.zanzrukia
*
*/
public class SplitFile {
public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
public static final int NUMBER_OF_OUTPUT_FILES = 30;
public static final String FILE_SUFFIX = ".txt";
/**
* split file
*
* @throws Exception
*/
static void splitFile() throws Exception{
File inputFile = new File(INPUT_FILE + "_Splits");
inputFile.mkdir();
RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");
long sourceSize = raf.length();
long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;
int maxReadBufferSize = 8 * 1024; // 8KB
for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
if (bytesPerSplit > maxReadBufferSize) {
long numReads = bytesPerSplit / maxReadBufferSize;
long numRemainingRead = bytesPerSplit % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
} else {
readWrite(raf, bw, bytesPerSplit);
}
bw.close();
}
if (remainingBytes > 0) {
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
readWrite(raf, bw, remainingBytes);
bw.close();
}
raf.close();
}
/**
* join file
*
* @throws Exception
*/
static void joinFiles() throws Exception{
int maxReadBufferSize = 8 * 1024;
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
File inputFileDir = new File(INPUT_FILE + "_Splits");
RandomAccessFile raf = null;
if(inputFileDir.isDirectory()){
for(File file : inputFileDir.listFiles()){
raf = new RandomAccessFile(file, "r");
long numReads = raf.length() / maxReadBufferSize;
long numRemainingRead = raf.length() % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
raf.close();
}
}
bw.close();
}
public static void mergeFiles() {
File[] files = new File[NUMBER_OF_OUTPUT_FILES];
for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
}
String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;
File mergedFile = new File(mergedFilePath);
mergeFiles(files, mergedFile);
}
public static void mergeFiles(File[] files, File mergedFile) {
FileWriter fstream = null;
BufferedWriter out = null;
try {
fstream = new FileWriter(mergedFile, true);
out = new BufferedWriter(fstream);
} catch (IOException e1) {
e1.printStackTrace();
}
for (File f : files) {
System.out.println("merging: " + f.getName());
FileInputStream fis;
try {
fis = new FileInputStream(f);
BufferedReader in = new BufferedReader(new InputStreamReader(fis));
String aLine;
while ((aLine = in.readLine()) != null) {
out.write(aLine);
out.newLine();
}
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
public static void main(String[] args) throws Exception {
// splitFile();
mergeFiles();
}
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if (val != -1) {
bw.write(buf);
}
}
}
import java.io.BufferedOutputStream;
导入java.io.BufferedReader;
导入java.io.BufferedWriter;
导入java.io.File;
导入java.io.FileInputStream;
导入java.io.FileOutputStream;
导入java.io.FileWriter;
导入java.io.IOException;
导入java.io.InputStreamReader;
导入java.io.RandomAccessFile;
/**
*@作者vishal.zanzrukia
*
*/
公共类拆分文件{
公共静态最终字符串输入\u FILE=“D:\\me\\projects\\INPUT\\FILE\\path.txt”;
输出文件的公共静态最终整数=30;
公共静态最终字符串文件_SUFFIX=“.txt”;
/**
*分割文件
*
*@抛出异常
*/
静态void splitFile()引发异常{
文件输入文件=新文件(输入文件+“\u拆分”);
inputFile.mkdir();
RandomAccessFile raf=新的RandomAccessFile(输入_文件,“r”);
long sourceSize=raf.length();
long bytesPerSplit=源大小/输出文件的数量;
long remainingBytes=sourceSize%输出文件的数量;
int maxReadBufferSize=8*1024;//8KB
对于(int destIx=1;destIx maxReadBufferSize){
long numReads=字节过分割/maxReadBufferSize;
long numRemainingRead=bytesPerSplit%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
}否则{
读写(raf、bw、bytesperplit);
}
bw.close();
}
如果(剩余字节数>0){
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流(“拆分”)+输出文件数+1);
读写(raf、bw、剩余字节);
bw.close();
}
raf.close();
}
/**
*连接文件
*
*@抛出异常
*/
静态void joinFiles()引发异常{
int maxReadBufferSize=8*1024;
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流(输入文件+“\u拆分\\fullJoin”+文件后缀));
File inputFileDir=新文件(输入文件+“\u拆分”);
RandomAccessFile raf=null;
if(inputFileDir.isDirectory()){
对于(文件:inputFileDir.listFiles()){
raf=新随机访问文件(文件“r”);
long numReads=raf.length()/maxReadBufferSize;
long numRemainingRead=raf.length()%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
raf.close();
}
}
bw.close();
}
公共静态无效合并文件(){
File[]files=新文件[输出文件的数量];
对于(int i=1;i,问题是最后一行代码:
static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
byte[] buf = new byte[(int) numBytes];
int val = raf.read(buf);
if (val != -1) {
bw.write(buf);
}
}
写入时,您会回写数据的numBytes
,但read
函数具有:
读取到缓冲区的总字节数,如果由于已到达此文件的结尾而没有更多数据,则为-1
因此,您的解决方案是:
使用您的joinFiles
方法:如果您想让文件保持原样,请不要尝试使用Reader
逐行读取文件,因为行尾可能因平台而异
而是使用InputStream
或RandomAccessFile
将它们作为二进制文件读取,并使用OutputStream
写入
joinFiles
方法中唯一的问题是它使用了File.listFiles()
,这无法保证文件返回的顺序
我将您的mergeFiles()
代码与joinFiles()
组合在一起以实现此功能(请记住从main
方法调用joinFiles()
而不是mergeFiles()
)
static void joinFiles(文件[]文件)引发异常{
int maxReadBufferSize=8*1024;
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流)(输入文件+“\u拆分\\fullJoin”
+文件(后缀);
RandomAccessFile raf=null;
用于(文件:文件){
raf=新随机访问文件(文件“r”);
long numReads=raf.length()/maxReadBufferSize;
long numRemainingRead=raf.length()%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
raf.close();
}
bw.close();
}
public static void joinFiles()引发异常{
File[]files=新文件[输出文件的数量];
对于(inti=1;很遗憾,我不可能用我的任何编辑器打开这么大的文件:(试试Glogg()或010Editor())…010Notepad打开所有内容…但需要付费,但试用版您应该尝试一个较小的测试用例-尝试具有类似内容的较小文件,或者只查看大文件的前10K。然后您可以自己检查输出,看看问题出在哪里。为什么突然读到b
bw.write(buf, 0 val);
static void joinFiles(File[] files) throws Exception {
int maxReadBufferSize = 8 * 1024;
BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
+ FILE_SUFFIX));
RandomAccessFile raf = null;
for (File file : files) {
raf = new RandomAccessFile(file, "r");
long numReads = raf.length() / maxReadBufferSize;
long numRemainingRead = raf.length() % maxReadBufferSize;
for (int i = 0; i < numReads; i++) {
readWrite(raf, bw, maxReadBufferSize);
}
if (numRemainingRead > 0) {
readWrite(raf, bw, numRemainingRead);
}
raf.close();
}
bw.close();
}
public static void joinFiles() throws Exception {
File[] files = new File[NUMBER_OF_OUTPUT_FILES];
for (int i = 1; i <= NUMBER_OF_OUTPUT_FILES; i++) {
files[i - 1] = new File(INPUT_FILE + "_Splits\\split." + i + FILE_SUFFIX);
}
joinFiles(files);
}