Java: Is there a better way to quickly generate 5 million CSV files?

Tags: java, amazon-web-services, amazon-s3

I want to create 5 million CSV files. I have already waited almost 3 hours, but the program is still running. Can anyone give me some advice on how to speed up the file generation?

After these 5 million files are generated, I have to upload them to an S3 bucket.

It would be even better if someone knows how to generate these files on AWS directly, so that we could move the files straight into the S3 bucket and ignore the network-speed problem. (I have just started learning AWS, and there is a lot I still need to know.)
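
For reference, here is a minimal sketch (not part of the original question) of what the upload step could look like with the AWS SDK for Java v2; the bucket name "my-bucket" and the region are hypothetical placeholders, and in practice 5 million single PUTs would need to be parallelized or batched:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.stream.Stream;

import software.amazon.awssdk.core.sync.RequestBody;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.PutObjectRequest;

public class S3UploadSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical bucket name and region; the directory matches the question.
        String bucket = "my-bucket";
        Path dir = Paths.get("C:\\5millionfiles\\");

        try (S3Client s3 = S3Client.builder().region(Region.US_EAST_1).build();
             Stream<Path> files = Files.list(dir)) {
            files.forEach(file -> {
                PutObjectRequest request = PutObjectRequest.builder()
                        .bucket(bucket)
                        .key(file.getFileName().toString())
                        .build();
                // One PUT per file; for millions of files this loop should be
                // parallelized, or the files combined into larger archives.
                s3.putObject(request, RequestBody.fromFile(file));
            });
        }
    }
}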

Below is my code:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicLong;

public class ParallelCsvGenerate implements Runnable {
    private static AtomicLong baseID = new AtomicLong(8160123456L);
    private static ThreadLocalRandom random = ThreadLocalRandom.current();
    private static ThreadLocalRandom random2 = ThreadLocalRandom.current();
    private static String filePath = "C:\\5millionfiles\\";
    private static List<String> headList = null;
    private static String csvHeader = null;
    public ParallelCsvGenerate() {
        headList = generateHeadList();
        csvHeader = String.join(",", headList);
    }


    @Override
    public void run() {
        for(int i = 0; i < 1000000; i++) {
            generateCSV();
        }
    }


    private void generateCSV() {
        StringBuilder builder = new StringBuilder();
        builder.append(csvHeader).append(System.lineSeparator());
        for (int i = 0; i < headList.size(); i++) {
            if(i < headList.size() - 1) {
                builder.append(i % 2 == 0 ? generateRandomInteger() : generateRandomStr()).append(",");
            } else {
                builder.append(i % 2 == 0 ? generateRandomInteger() : generateRandomStr());
            }
        }


        String fileName = String.valueOf(baseID.addAndGet(1));
        File csvFile = new File(filePath + fileName + ".csv");
        FileWriter fileWriter = null;
        try {
            fileWriter = new FileWriter(csvFile);
            fileWriter.write(builder.toString());
            fileWriter.flush();
        } catch (Exception e) {
            System.err.println(e);
        } finally {
            try {
                if(fileWriter != null) {
                    fileWriter.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }




    private static List<String> generateHeadList() {
        List<String> headList = new ArrayList<>(20);
        String baseFieldName = "Field";
        for(int i = 1; i <= 20; i++) {
            headList.add(baseFieldName + i);
        }
        return headList;
    }




    /**
     * generate a number in the range 0 (inclusive) to 50000 (exclusive)
     * @return
     */
    private Integer generateRandomInteger() {
        return random.nextInt(0,50000);
    }




    /**
     * generate a string of length 5 - 8
     * @return
     */
    private String generateRandomStr() {
        int strLength = random2.nextInt(5, 9); // upper bound is exclusive, so lengths are 5-8
        String str="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        int length = str.length();
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < strLength; i++) {
            builder.append(str.charAt(random.nextInt(length)));
        }
        return builder.toString();
    }
}

Thank you all for the suggestions. Just by refactoring the code, it now generates 3.8 million files in 2.8 hours, which is better. Refactored code:

import java.io.FileWriter;
import java.io.IOException;
import java.util.concurrent.Callable;

import com.opencsv.CSVWriter;

public class ParallelCsvGenerate implements Callable<Integer> {
    private static String filePath = "C:\\5millionfiles\\";
    private static String[] header = new String[]{
            "FIELD1","FIELD2","FIELD3","FIELD4","FIELD5",
            "FIELD6","FIELD7","FIELD8","FIELD9","FIELD10",
            "FIELD11","FIELD12","FIELD13","FIELD14","FIELD15",
            "FIELD16","FIELD17","FIELD18","FIELD19","FIELD20",
    };
    private String fileName;
    public ParallelCsvGenerate(String fileName) {
        this.fileName = fileName;
    }

    @Override
    public Integer call() throws Exception {
        try {
            generateCSV();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return 0;
    }

    private void generateCSV() throws IOException {

        CSVWriter writer = new CSVWriter(new FileWriter(filePath + fileName + ".csv"), CSVWriter.DEFAULT_SEPARATOR, CSVWriter.NO_QUOTE_CHARACTER);
        String[] content = new String[]{
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr(),
                RandomGenerator.generateRandomInteger(),
                RandomGenerator.generateRandomStr()
        };
        writer.writeNext(header);
        writer.writeNext(content);
        writer.close();
    }

}
Main:

public static void main(String[] args) {
    System.out.println("Start generating");
    long start = System.currentTimeMillis();
    ThreadPoolExecutor threadPoolExecutor = new ThreadPoolExecutor(8, 8,
            0L, TimeUnit.MILLISECONDS,
            new LinkedBlockingQueue<>());
    List<ParallelCsvGenerate> taskList = new ArrayList<>(3800000);
    for (int i = 0; i < 3800000; i++) {
        taskList.add(new ParallelCsvGenerate(i + ""));
    }
    try {
        List<Future<Integer>> futures = threadPoolExecutor.invokeAll(taskList);
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
    System.out.println("Success");
    long end = System.currentTimeMillis();
    System.out.println("Time used: " + (end - start));
}
  • You can write directly to the file (there is no need to build the entire file in one StringBuilder). I think this is the biggest time + memory bottleneck: builder.toString() (see the sketch after this list)

  • You can generate each file in parallel

  • (Minor tweak:) drop the inner if of the loop

    If you don't need the if (i < headList.size() - 1) check, do a smarter loop with one extra iteration outside it

    i % 2 == 0 can be eliminated with better iteration (i += 2) and a bit more work inside the loop body (i -> int, i + 1 -> string)

  • Where applicable, use append(char) instead of append(String): append(',') is better than append(",")
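
    The following is a hedged sketch (mine, not from the answer) of how the question's generateCSV() could look with those suggestions applied; it assumes the same filePath, headList, csvHeader, baseID and random-value helpers as the question's class and additionally needs java.io.BufferedWriter imported:

    private void generateCSV() {
        String fileName = String.valueOf(baseID.addAndGet(1));
        // Stream straight into a BufferedWriter instead of assembling the whole
        // file in a StringBuilder first.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath + fileName + ".csv"))) {
            writer.write(csvHeader);
            writer.newLine();
            // headList has 20 entries; each pass writes one integer and one string,
            // so the i % 2 == 0 test disappears, and writing the last pair outside
            // the loop removes the trailing-comma if.
            for (int i = 0; i < headList.size() - 2; i += 2) {
                writer.append(String.valueOf(generateRandomInteger()))
                      .append(',')
                      .append(generateRandomStr())
                      .append(',');
            }
            writer.append(String.valueOf(generateRandomInteger()))
                  .append(',')
                  .append(generateRandomStr());
        } catch (IOException e) {
            e.printStackTrace();
        }
    }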


  • You can use the Fork/Join framework (Java 7 and later) to make your processing parallel and use multiple CPU cores. Here is an example:

    import java.util.concurrent.ForkJoinPool;
    import java.util.concurrent.ForkJoinTask;
    import java.util.concurrent.RecursiveTask;
    import java.util.stream.LongStream;
    
    public class ForkJoinAdd extends RecursiveTask<Long> {
    
        private final long[] numbers;
        private final int start;
        private final int end;
        public static final long threshold = 10_000;
    
        public ForkJoinAdd(long[] numbers) {
            this(numbers, 0, numbers.length);
        }
    
        private ForkJoinAdd(long[] numbers, int start, int end) {
            this.numbers = numbers;
            this.start = start;
            this.end = end;
        }
    
        @Override
        protected Long compute() {
    
            int length = end - start;
            if (length <= threshold) {
                return add();
            }
    
            ForkJoinAdd firstTask = new ForkJoinAdd(numbers, start, start + length / 2);
            firstTask.fork(); //start asynchronously
    
            ForkJoinAdd secondTask = new ForkJoinAdd(numbers, start + length / 2, end);
    
            Long secondTaskResult = secondTask.compute();
            Long firstTaskResult = firstTask.join();
    
            return firstTaskResult + secondTaskResult;
    
        }
    
        private long add() {
            long result = 0;
            for (int i = start; i < end; i++) {
                result += numbers[i];
            }
            return result;
        }
    
        public static long startForkJoinSum(long n) {
            long[] numbers = LongStream.rangeClosed(1, n).toArray();
            ForkJoinTask<Long> task = new ForkJoinAdd(numbers);
            return new ForkJoinPool().invoke(task);
        }
    
    }
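
    The ForkJoinAdd example above sums an array rather than writing files. Purely as an illustration (not from the answer), here is a hedged sketch of applying the same divide-and-conquer pattern to the file-generation task; ForkJoinCsvGenerate, THRESHOLD and generateOneCsv() are hypothetical names, and the leaf body stands in for the question's generateCSV():

    import java.io.FileWriter;
    import java.io.IOException;
    import java.util.concurrent.ForkJoinPool;
    import java.util.concurrent.RecursiveAction;

    public class ForkJoinCsvGenerate extends RecursiveAction {

        private static final int THRESHOLD = 10_000;                    // files per leaf task (assumed tuning value)
        private static final String FILE_PATH = "C:\\5millionfiles\\";  // same output directory as the question

        private final int start;
        private final int end;

        public ForkJoinCsvGenerate(int start, int end) {
            this.start = start;
            this.end = end;
        }

        @Override
        protected void compute() {
            int length = end - start;
            if (length <= THRESHOLD) {
                // Leaf task: generate the files in this index range sequentially.
                for (int i = start; i < end; i++) {
                    generateOneCsv(i);
                }
                return;
            }
            // Split the index range in half and process both halves in parallel.
            int mid = start + length / 2;
            invokeAll(new ForkJoinCsvGenerate(start, mid),
                      new ForkJoinCsvGenerate(mid, end));
        }

        // Stand-in for the generateCSV() method from the question; writes one tiny file.
        private static void generateOneCsv(int i) {
            try (FileWriter writer = new FileWriter(FILE_PATH + i + ".csv")) {
                writer.write("FIELD1,FIELD2" + System.lineSeparator() + "1,abc");
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        public static void main(String[] args) {
            new ForkJoinPool().invoke(new ForkJoinCsvGenerate(0, 5_000_000));
        }
    }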
    
    • Remove the for (int i = 0; i < 1000000; i++) loop from the run method (keep a single generateCSV() call)
    • Create 5 million ParallelCsvGenerate objects
    • Submit them to the ThreadPoolExecutor

    Converted main:
    public static void main(String[] args) {    
        ThreadPoolExecutor ex = (ThreadPoolExecutor) Executors.newFixedThreadPool(8);
        for(int i = 0; i < 5000000; i++) {
            ParallelCsvGenerate generate = new ParallelCsvGenerate();
            ex.submit(generate);
        }
        ex.shutdown();
    }
    
    This should give about a 30% speedup.


    I think the main bottleneck is the hard drive and the file system itself. There is not much more to be gained here.

    Use multiple processes on multiple machines.

    Show us your main. Did you create multiple <
    
    Another suggestion: write each file asynchronously with NIO's AsynchronousFileChannel (builder, filePath and fileName are the same variables as in the question's generateCSV()):

        Path file = Paths.get(filePath + fileName + ".csv");
        try(AsynchronousFileChannel asyncFile = AsynchronousFileChannel.open(file,
                            StandardOpenOption.WRITE,
                            StandardOpenOption.CREATE)) {

            // write() returns a Future<Integer> that could be awaited before the channel is closed
            asyncFile.write(ByteBuffer.wrap(builder.toString().getBytes()), 0);
        } catch (IOException e) {
            e.printStackTrace();
        }