在Java中拆分和合并大文件(大小以GB为单位)

在Java中拆分和合并大文件(大小以GB为单位),java,file,io,merge,split,Java,File,Io,Merge,Split,假设 我正在将2590400 KB(约2.5 GB)的文件拆分为30个部分 它将生成30个大小为86347KB的文件 这似乎是正确的,2590400/30=86346.666667 现在,如果我再次合并所有部分(30),它将生成3453873kb文件,该文件的大小应为2590410 KB 有谁能帮我解释为什么会有这种差异?我使用下面的代码合并和分割文件 SplitFile.java import java.io.BufferedOutputStream; import java.io.Buf

假设

  • 我正在将2590400 KB(约2.5 GB)的文件拆分为30个部分

  • 它将生成30个大小为86347KB的文件 这似乎是正确的,2590400/30=86346.666667

  • 现在,如果我再次合并所有部分(30),它将生成3453873kb文件,该文件的大小应为2590410 KB

有谁能帮我解释为什么会有这种差异?我使用下面的代码合并和分割文件

SplitFile.java

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;

/**
 * @author vishal.zanzrukia
 * 
 */
public class SplitFile {

    public static final String INPUT_FILE = "D:\\me\\projects\\input\\file\\path.txt";
    public static final int NUMBER_OF_OUTPUT_FILES = 30;
    public static final String FILE_SUFFIX = ".txt";

    /**
     * split file
     * 
     * @throws Exception
     */
    static void splitFile() throws Exception{

        File inputFile = new File(INPUT_FILE + "_Splits");
        inputFile.mkdir();

        RandomAccessFile raf = new RandomAccessFile(INPUT_FILE, "r");

        long sourceSize = raf.length();
        long bytesPerSplit = sourceSize / NUMBER_OF_OUTPUT_FILES;
        long remainingBytes = sourceSize % NUMBER_OF_OUTPUT_FILES;

        int maxReadBufferSize = 8 * 1024; // 8KB
        for (int destIx = 1; destIx <= NUMBER_OF_OUTPUT_FILES; destIx++) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\split." + destIx + FILE_SUFFIX));
            if (bytesPerSplit > maxReadBufferSize) {
                long numReads = bytesPerSplit / maxReadBufferSize;
                long numRemainingRead = bytesPerSplit % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
            } else {
                readWrite(raf, bw, bytesPerSplit);
            }
            bw.close();
        }
        if (remainingBytes > 0) {
            BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream("split." + NUMBER_OF_OUTPUT_FILES + 1));
            readWrite(raf, bw, remainingBytes);
            bw.close();
        }
        raf.close();
    }

    /**
     * join file
     * 
     * @throws Exception
     */
    static void joinFiles() throws Exception{
        int maxReadBufferSize = 8 * 1024; 

        BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX));
        File inputFileDir = new File(INPUT_FILE + "_Splits");
        RandomAccessFile raf = null;
        if(inputFileDir.isDirectory()){
            for(File file : inputFileDir.listFiles()){
                raf = new RandomAccessFile(file, "r");
                long numReads = raf.length() / maxReadBufferSize;
                long numRemainingRead = raf.length()  % maxReadBufferSize;
                for (int i = 0; i < numReads; i++) {
                    readWrite(raf, bw, maxReadBufferSize);
                }
                if (numRemainingRead > 0) {
                    readWrite(raf, bw, numRemainingRead);
                }
                raf.close();
            }
        }
        bw.close();
    }

    public static void mergeFiles() {

        File[] files = new File[NUMBER_OF_OUTPUT_FILES];
        for(int i=1;i<=NUMBER_OF_OUTPUT_FILES;i++){
            files[i-1] = new File(INPUT_FILE + "_Splits\\split."+i+FILE_SUFFIX);
        }

        String mergedFilePath = INPUT_FILE + "_Splits\\fullJoin" + FILE_SUFFIX;


        File mergedFile = new File(mergedFilePath);

        mergeFiles(files, mergedFile);
    }

    public static void mergeFiles(File[] files, File mergedFile) {

        FileWriter fstream = null;
        BufferedWriter out = null;
        try {
            fstream = new FileWriter(mergedFile, true);
             out = new BufferedWriter(fstream);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        for (File f : files) {
            System.out.println("merging: " + f.getName());
            FileInputStream fis;
            try {
                fis = new FileInputStream(f);
                BufferedReader in = new BufferedReader(new InputStreamReader(fis));

                String aLine;
                while ((aLine = in.readLine()) != null) {
                    out.write(aLine);
                    out.newLine();
                }

                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        try {
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    public static void main(String[] args) throws Exception {
//      splitFile();
        mergeFiles();
    }

    static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
        byte[] buf = new byte[(int) numBytes];
        int val = raf.read(buf);
        if (val != -1) {
            bw.write(buf);
        }
    }
}
import java.io.BufferedOutputStream;
导入java.io.BufferedReader;
导入java.io.BufferedWriter;
导入java.io.File;
导入java.io.FileInputStream;
导入java.io.FileOutputStream;
导入java.io.FileWriter;
导入java.io.IOException;
导入java.io.InputStreamReader;
导入java.io.RandomAccessFile;
/**
*@作者vishal.zanzrukia
* 
*/
公共类拆分文件{
公共静态最终字符串输入\u FILE=“D:\\me\\projects\\INPUT\\FILE\\path.txt”;
输出文件的公共静态最终整数=30;
公共静态最终字符串文件_SUFFIX=“.txt”;
/**
*分割文件
* 
*@抛出异常
*/
静态void splitFile()引发异常{
文件输入文件=新文件(输入文件+“\u拆分”);
inputFile.mkdir();
RandomAccessFile raf=新的RandomAccessFile(输入_文件,“r”);
long sourceSize=raf.length();
long bytesPerSplit=源大小/输出文件的数量;
long remainingBytes=sourceSize%输出文件的数量;
int maxReadBufferSize=8*1024;//8KB
对于(int destIx=1;destIx maxReadBufferSize){
long numReads=字节过分割/maxReadBufferSize;
long numRemainingRead=bytesPerSplit%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
}否则{
读写(raf、bw、bytesperplit);
}
bw.close();
}
如果(剩余字节数>0){
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流(“拆分”)+输出文件数+1);
读写(raf、bw、剩余字节);
bw.close();
}
raf.close();
}
/**
*连接文件
* 
*@抛出异常
*/
静态void joinFiles()引发异常{
int maxReadBufferSize=8*1024;
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流(输入文件+“\u拆分\\fullJoin”+文件后缀));
File inputFileDir=新文件(输入文件+“\u拆分”);
RandomAccessFile raf=null;
if(inputFileDir.isDirectory()){
对于(文件:inputFileDir.listFiles()){
raf=新随机访问文件(文件“r”);
long numReads=raf.length()/maxReadBufferSize;
long numRemainingRead=raf.length()%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
raf.close();
}
}
bw.close();
}
公共静态无效合并文件(){
File[]files=新文件[输出文件的数量];

对于(int i=1;i,问题是最后一行代码:

static void readWrite(RandomAccessFile raf, BufferedOutputStream bw, long numBytes) throws IOException {
    byte[] buf = new byte[(int) numBytes];
    int val = raf.read(buf);
    if (val != -1) {
        bw.write(buf);
    }
}
写入时,您会回写数据的
numBytes
,但
read
函数具有:

读取到缓冲区的总字节数,如果由于已到达此文件的结尾而没有更多数据,则为-1

因此,您的解决方案是:


使用您的
joinFiles
方法:如果您想让文件保持原样,请不要尝试使用
Reader
逐行读取文件,因为行尾可能因平台而异

而是使用
InputStream
RandomAccessFile
将它们作为二进制文件读取,并使用
OutputStream
写入

joinFiles
方法中唯一的问题是它使用了
File.listFiles()
,这无法保证文件返回的顺序

我将您的
mergeFiles()
代码与
joinFiles()
组合在一起以实现此功能(请记住从
main
方法调用
joinFiles()
而不是
mergeFiles()

static void joinFiles(文件[]文件)引发异常{
int maxReadBufferSize=8*1024;
BufferedOutputStream bw=新的BufferedOutputStream(新文件输出流)(输入文件+“\u拆分\\fullJoin”
+文件(后缀);
RandomAccessFile raf=null;
用于(文件:文件){
raf=新随机访问文件(文件“r”);
long numReads=raf.length()/maxReadBufferSize;
long numRemainingRead=raf.length()%maxReadBufferSize;
对于(int i=0;i0){
读写(raf、bw、numRemainingRead);
}
raf.close();
}
bw.close();
}
public static void joinFiles()引发异常{
File[]files=新文件[输出文件的数量];

对于(inti=1;很遗憾,我不可能用我的任何编辑器打开这么大的文件:(试试Glogg()或010Editor())…010Notepad打开所有内容…但需要付费,但试用版您应该尝试一个较小的测试用例-尝试具有类似内容的较小文件,或者只查看大文件的前10K。然后您可以自己检查输出,看看问题出在哪里。为什么突然读到b
bw.write(buf, 0 val);
static void joinFiles(File[] files) throws Exception {
    int maxReadBufferSize = 8 * 1024;

    BufferedOutputStream bw = new BufferedOutputStream(new FileOutputStream(INPUT_FILE + "_Splits\\fullJoin"
            + FILE_SUFFIX));

    RandomAccessFile raf = null;
    for (File file : files) {
        raf = new RandomAccessFile(file, "r");
        long numReads = raf.length() / maxReadBufferSize;
        long numRemainingRead = raf.length() % maxReadBufferSize;
        for (int i = 0; i < numReads; i++) {
            readWrite(raf, bw, maxReadBufferSize);
        }
        if (numRemainingRead > 0) {
            readWrite(raf, bw, numRemainingRead);
        }
        raf.close();

    }
    bw.close();
}

public static void joinFiles() throws Exception {

    File[] files = new File[NUMBER_OF_OUTPUT_FILES];
    for (int i = 1; i <= NUMBER_OF_OUTPUT_FILES; i++) {
        files[i - 1] = new File(INPUT_FILE + "_Splits\\split." + i + FILE_SUFFIX);
    }

    joinFiles(files);
}