java将大文件拆分为小文件，同时拆分多行记录，而不会在不完整状态下破坏记录_Java_Spring Batch

java将大文件拆分为小文件，同时拆分多行记录，而不会在不完整状态下破坏记录

java spring-batch

java将大文件拆分为小文件，同时拆分多行记录，而不会在不完整状态下破坏记录,java,spring-batch,Java,Spring Batch,我在一个文件中将一条记录拆分为多行。识别记录结尾的唯一方法是当新记录以ABC开头时。下面是示例。文件大小可能是5-10GB，我正在寻找一种高效的java逻辑，它只用于分割文件（不需要读取每一行），但是分割逻辑应该检查以新记录开始一个新文件，在这种情况下，它应该以“ABC”开头添加了更多的细节，我只是在寻找分割文件，而分割最后一条记录应该在文件中正确结束有人能推荐一下吗 HDR ABCline1goesonforrecord1 //first record line2goesonForR

我的文件中，每条记录都跨越多行。识别一条记录结束的唯一方法，是看到下一条以 ABC 开头的新记录。下面是示例。文件大小可能为 5–10 GB，我正在寻找一种高效的 Java 逻辑，只用于拆分文件（最好不必逐行读取），但拆分逻辑必须保证每个新文件都从一条新记录的起始处开始，也就是以“ABC”开头的那一行

补充细节：我只需要拆分文件，并且每个拆分文件中的最后一条记录都必须完整结束，不能被截断

有人能推荐一下吗

HDR
ABCline1goesonforrecord1   //first record 
line2goesonForRecord1      
line3goesonForRecord1          
line4goesonForRecord1
ABCline2goesOnForRecord2  //second record
line2goesonForRecord2
line3goesonForRecord2
line4goesonForRecord2
line5goesonForRecord2
ABCline2goesOnForRecord3     //third record
line2goesonForRecord3
line3goesonForRecord3
line4goesonForRecord3
TRL

我没有测试过这段代码，但类似下面的做法应该可行：它不会把整个文件读入内存，而是一次只读取一行，所以性能应该不会太差

/**
 * Splits a multi-line-record file into one output file per record.
 *
 * <p>A record starts at a line beginning with {@code "ABC"} and runs until the next such
 * line. The input is streamed line by line in a single pass, so it is never held in memory
 * as a whole. Each output file is named {@code <filename>.part<N>} (N starting at 1) and
 * contains an {@code HDR} line, the record's lines, and a {@code TRL} line.
 *
 * <p>Lines that appear before the first record (for example the source file's own
 * {@code HDR}) are skipped, and a final source line equal to {@code "TRL"} is treated as
 * the source trailer rather than record content, so the last part does not end up with two
 * trailers.
 *
 * @param filename path of the file to split; also used as the prefix of the part files
 * @throws java.io.UncheckedIOException if the input cannot be read or a part cannot be written
 */
public void spiltRecords(String filename) {
        BufferedWriter writer = null;
        int partNumber = 0;
        // Read as UTF-8 so the input decoding matches the UTF-8 encoding of the output.
        try (Scanner scanFile = new Scanner(new File(filename), StandardCharsets.UTF_8.name())) {
            while (scanFile.hasNextLine()) {
                String line = scanFile.nextLine();
                if (line.startsWith("ABC")) {
                    // A new record begins: finish the previous part before opening the next.
                    if (writer != null) {
                        writer.write("TRL");
                        writer.newLine();
                        writer.close();
                        writer = null;
                    }
                    partNumber++;
                    writer = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(filename + ".part" + partNumber), StandardCharsets.UTF_8));
                    writer.write("HDR");
                    writer.newLine();
                } else if (writer == null) {
                    // Nothing is open yet: this line precedes the first record (e.g. HDR).
                    continue;
                } else if (line.equals("TRL") && !scanFile.hasNextLine()) {
                    // The source file's own trailer; each part gets its own TRL instead.
                    continue;
                }
                writer.write(line);
                writer.newLine();
            }

            // Scanner hides read errors; surface them instead of producing truncated output.
            IOException readFailure = scanFile.ioException();
            if (readFailure != null) {
                throw readFailure;
            }

            // Close the last record
            if (writer != null) {
                writer.write("TRL");
                writer.newLine();
                writer.close();
                writer = null;
            }
        } catch (IOException e) {
            throw new java.io.UncheckedIOException("Failed to split " + filename, e);
        } finally {
            // Only reached with an open writer when something above failed.
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ignored) {
                    // The original failure is already propagating; do not mask it.
                }
            }
        }
    }

public void spiltRecords（字符串文件名）{
/*
HDR
ABCline1goesonforrecord1//第一条记录
第2行记录的OESON1
第3行进入记录1
行4goeson for record1
ABCline2goesOnForRecord2//第二条记录
第2行记录的OESON2
第3行进入记录2
第4行用于记录的OESON2
行5GOESONFORRECORD 2
ABCline2goesOnForRecord3//第三条记录
第2行记录的OESON3
第3行进入记录3
第4行用于记录的OESON3
TRL
*/
试一试{
扫描仪扫描文件=新扫描仪（新文件（文件名））；
//现在，您不想编辑现有文件以防出错。一种方法是获取索引列表
//新记录开始的地方。
LinkedList startOfRecordIndexes=新建LinkedList（）；
长指数=0；
while（scanFile.hasNext（））{
if（scanFile.nextLine（）.startsWith（“ABC”））{
开始记录索引。添加（索引）；
}
索引++；
}
//一旦拥有了所有记录的起始索引，就可以遍历列表并创建新记录
scanFile=scanFile.reset（）；
指数=0；
BufferedWriter=null；
while（scanFile.hasNext（））{
如果（！startOfRecordIndexes.isEmpty（）&&index==startOfRecordIndexes.peek（））{
if（writer！=null）{
writer.write（“TRL”）；
writer.close（）；
}
writer=new BufferedWriter（new OutputStreamWriter(
新的FileOutputStream（“给出唯一的文件名”），StandardCharsets.UTF_8）；
writer.write（“HDR”）；
writer.write（scanFile.nextLine（））；
startOfRecordIndexes.remove（）；
}否则{
writer.write（scanFile.nextLine（））；
}
}
//关闭最后一条记录
if（writer！=null）{
writer.write（“TRL”）；
writer.close（）；
}
}捕获（IOE异常）{
//处理例外
}
}

下面就是您需要的代码。我在一个 10 GB 的文件上做了测试，拆分该文件耗时 64 秒

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.TimeUnit;

/**
 * Splits a large text file into part files, one per multi-line record.
 *
 * <p>A record starts at a line beginning with {@code "ABC"}; every following line belongs to
 * that record until the next {@code "ABC"} line. The input is streamed line by line, so the
 * file is never loaded into memory as a whole. Lines that precede the first record (such as
 * a leading {@code HDR} line) are dropped because no part file is open yet; lines after the
 * last record start (such as a trailing {@code TRL} line) end up in the last part.
 *
 * <p>Part files are named after the input file with {@code _part<N>} inserted before the
 * extension ({@code bigfile.txt} becomes {@code bigfile_part1.txt}), or appended when the
 * name has no extension. Instances are stateful and not thread-safe.
 */
public class FileSplitter {

    /** Prefix that marks the first line of a record. */
    private static final String RECORD_PREFIX = "ABC";

    /** Output directory used by the single-argument constructor. */
    private static final Path DEFAULT_OUTPUT_DIR = Path.of("/tmp/split");

    private final Path filePath;
    private final Path outputDir;
    private BufferedWriter writer;
    private int fileCounter = 1;

    public static void main(String[] args) throws Exception {
        long startTime = System.nanoTime();
        new FileSplitter(Path.of("/tmp/bigfile.txt")).split();
        System.out.println("Time to split " + TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - startTime));
    }

    /**
     * Writes a sample input file of 100,000 records with 10,001 lines each to
     * {@code /tmp/bigfile.txt}, for trying the splitter out.
     */
    private static void generateBigFile() throws Exception {
        // try-with-resources closes (and therefore flushes) the writer even if a write fails.
        try (var writer = Files.newBufferedWriter(Path.of("/tmp/bigfile.txt"),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
            for (int i = 0; i < 100_000; i++) {
                writer.write(String.format("ABCline1goesonforrecord%d\n", i + 1));
                for (int j = 0; j < 10_000; j++) {
                    writer.write(String.format("line%dgoesonForRecord%d\n", j + 2, i + 1));
                }
            }
        }
    }

    /**
     * Creates a splitter that writes its part files to {@code /tmp/split}.
     *
     * @param filePath the file to split
     */
    public FileSplitter(Path filePath) {
        this(filePath, DEFAULT_OUTPUT_DIR);
    }

    /**
     * Creates a splitter that writes its part files to the given directory.
     *
     * @param filePath the file to split
     * @param outputDir directory for the part files; created by {@link #split()} if missing
     */
    public FileSplitter(Path filePath, Path outputDir) {
        this.filePath = filePath;
        this.outputDir = outputDir;
    }

    /**
     * Streams the input file and writes each record to its own part file.
     *
     * @throws IOException if the output directory cannot be created or the input cannot be opened
     * @throws UncheckedIOException if reading a line, or opening, writing or closing a part fails
     */
    void split() throws IOException {
        // The part files cannot be opened unless their directory exists.
        Files.createDirectories(outputDir);
        try (var stream = Files.lines(filePath, StandardCharsets.UTF_8)) {
            stream.forEach(this::processLine);
        } finally {
            // Runs on failure too, so the currently open part is never leaked.
            closeWriter();
        }
    }

    /** Starts a new part when the line opens a record, then writes the line to the current part. */
    private void processLine(String line) {
        if (line.startsWith(RECORD_PREFIX)) {
            closeWriter();
            openWriter();
        }
        writeLine(line);
    }

    /** Writes one line plus a newline to the current part; a no-op while no part is open. */
    private void writeLine(String line) {
        if (writer != null) {
            try {
                writer.write(line);
                writer.write("\n");
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to write line to file part", e);
            }
        }
    }

    /** Opens the next part file, unless a part is already open. */
    private void openWriter() {
        if (this.writer == null) {
            var filePartName = nextPartName();
            try {
                writer = Files.newBufferedWriter(outputDir.resolve(filePartName),
                        StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to open file part " + filePartName, e);
            }
            fileCounter++;
        }
    }

    /**
     * Builds the name of the next part file. The part suffix goes before the last extension
     * only; when there is no extension it is appended, so every part still gets a distinct
     * name instead of overwriting the previous one.
     */
    private String nextPartName() {
        String name = filePath.getFileName().toString();
        String suffix = "_part" + fileCounter;
        int dot = name.lastIndexOf('.');
        if (dot > 0) {
            return name.substring(0, dot) + suffix + name.substring(dot);
        }
        return name + suffix;
    }

    /** Closes the current part, if any; the writer reference is cleared even if closing fails. */
    private void closeWriter() {
        if (writer != null) {
            try {
                // close() flushes the buffer before releasing the file.
                writer.close();
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to close writer", e);
            } finally {
                writer = null;
            }
        }
    }
}

导入java.io.BufferedWriter；
导入java.io.IOException；
导入java.io.UncheckedIOException；
导入java.nio.charset.StandardCharset；
导入java.nio.file.Files；
导入java.nio.file.Path；
导入java.nio.file.StandardOpenOption；
导入java.util.concurrent.TimeUnit；
公共类文件剥离器{
私有最终路径文件路径；
私有缓冲写入程序；
私有int fileCounter=1；
公共静态void main（字符串[]args）引发异常{
long startTime=System.nanoTime（）；
新的filespliter（Path.of（“/tmp/bigfile.txt”）.split（）；
System.out.println（“分割时间”+时间单位.NANOSECONDS.toSeconds（System.nanoTime（）-startTime））；
}
私有静态void generateBigFile（）引发异常{
var writer=Files.newBufferedWriter（Path.of（“/tmp/bigfile.txt”）、StandardOpenOption.CREATE、StandardOpenOption.TRUNCATE\u EXISTING）；
对于（int i=0；i<100_000；i++）{
writer.write（String.format（“ABCline1goesonforrecord%d\n”，i+1））；
对于（int j=0；j<10_000；j++）{
writer.write（String.format（“记录%d\n的行%dgoeson”，j+2，i+1））；
}
}
writer.flush（）；
writer.close（）；
}
公共filespliter（路径filePath）{
this.filePath=filePath；
}
void split（）引发异常{
try（var stream=Files.lines（filePath，StandardCharsets.UTF_8））{
stream.forEach（行->{
if（行起始带（“ABC”））{
closeWriter（）；
openWriter（）；
}
写线（行）；
});
}
closeWriter（）；
}
专用void writeLine（字符串行）{
if（writer！=null）{
试一试{
作者：写（行）；
writer.write（“\n”）；
}捕获（IOE异常）{
抛出新的未选中异常（“未能将行写入文件部分”，e）；
}
}
}
私有void openWriter（）{
if（this.writer==null）{
var filePartName=filePath.getFileName（）.toString（）.replace（“.”，“_part”+fileCounter+”）；
试一试{
writer=Files.newBufferedWriter（Path.of（“/tmp/split”，filePartName），StandardOpenOption.CREATE，StandardOpenOption.TRUNCATE_-EXISTING）；
}捕获（IOE异常）{
抛出新的未选中异常（“未能将行写入文件”，e）；
}
fileCounter++；
}
}
私有编写器（）{
if（writer！=null）{
试一试{
writer.flush（）；
writer.close（）；