Java 有没有更好的方法快速生成500万个csv文件
我想创建500万个csv文件,我已经等了将近3个小时,但程序仍在运行。谁能给我一些建议,如何加快文件生成?在这500万个文件生成完成后,我必须将它们上传到 S3 bucket。如果有人知道如何直接在 AWS 上生成这些文件就更好了——这样我们可以直接把文件放进 S3 存储桶,从而绕开网络速度的限制。(我刚开始学习 AWS,还有很多知识需要了解。)下面是我的代码:
/**
 * Generates CSV files in parallel. Each {@code run()} invocation produces
 * 1,000,000 files, named from an atomically increasing counter so that
 * several threads running this task never collide on a file name.
 */
public class ParallelCsvGenerate implements Runnable {

    /** Atomically incremented counter used to derive unique file names across threads. */
    private static final AtomicLong baseID = new AtomicLong(8160123456L);
    /** Output directory; assumed to exist before the task runs — TODO confirm. */
    private static final String filePath = "C:\\5millionfiles\\";
    /** Column names Field1..Field20, built once. The original rebuilt these
     *  (and reassigned the static fields) in every constructor call. */
    private static final List<String> headList = generateHeadList();
    private static final String csvHeader = String.join(",", headList);

    public ParallelCsvGenerate() {
        // All shared state is initialized statically; nothing per-instance to set up.
    }

    @Override
    public void run() {
        // Fixed: the original had a stray "}s" token here that broke compilation.
        for (int i = 0; i < 1000000; i++) {
            generateCSV();
        }
    }

    /**
     * Writes one CSV file containing the header row plus a single data row
     * that alternates random integers (even columns) and random strings
     * (odd columns).
     */
    private void generateCSV() {
        StringBuilder builder = new StringBuilder();
        builder.append(csvHeader).append(System.lineSeparator());
        for (int i = 0; i < headList.size(); i++) {
            builder.append(i % 2 == 0 ? generateRandomInteger() : generateRandomStr());
            if (i < headList.size() - 1) {
                builder.append(',');  // char append; no separator after the last column
            }
        }
        String fileName = String.valueOf(baseID.incrementAndGet());
        File csvFile = new File(filePath + fileName + ".csv");
        // try-with-resources replaces the manual try/finally and cannot leak the writer.
        try (FileWriter fileWriter = new FileWriter(csvFile)) {
            fileWriter.write(builder.toString());
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /** Returns the 20 column names Field1..Field20. */
    private static List<String> generateHeadList() {
        List<String> list = new ArrayList<>(20);
        String baseFieldName = "Field";
        for (int i = 1; i <= 20; i++) {
            list.add(baseFieldName + i);
        }
        return list;
    }

    /**
     * Returns a random number in [0, 50000).
     * Fixed: ThreadLocalRandom.current() must be called at the point of use —
     * the original cached it in static fields, which breaks its per-thread
     * contract when run() executes on threads other than the initializing one.
     */
    private int generateRandomInteger() {
        return ThreadLocalRandom.current().nextInt(0, 50000);
    }

    /** Returns a random alphabetic string of length 5-7 (nextInt upper bound is exclusive). */
    private String generateRandomStr() {
        ThreadLocalRandom rnd = ThreadLocalRandom.current();
        int strLength = rnd.nextInt(5, 8);
        String alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        StringBuilder sb = new StringBuilder(strLength);
        for (int i = 0; i < strLength; i++) {
            sb.append(alphabet.charAt(rnd.nextInt(alphabet.length())));
        }
        return sb.toString();
    }
}
谢谢你们的建议,只要重构代码,用2.8h生成380万个文件,这会更好。
重构代码:
/**
 * Callable task that writes exactly one CSV file: a fixed 20-column header
 * plus a single data row alternating random integers (even columns) and
 * random strings (odd columns). One instance per target file.
 */
public class ParallelCsvGenerate implements Callable<Integer> {

    /** Output directory; assumed to exist before the task runs — TODO confirm. */
    private static final String filePath = "C:\\5millionfiles\\";
    private static final String[] header = new String[]{
            "FIELD1","FIELD2","FIELD3","FIELD4","FIELD5",
            "FIELD6","FIELD7","FIELD8","FIELD9","FIELD10",
            "FIELD11","FIELD12","FIELD13","FIELD14","FIELD15",
            "FIELD16","FIELD17","FIELD18","FIELD19","FIELD20",
    };

    /** Base name (no extension) of the file this task will create. */
    private final String fileName;

    public ParallelCsvGenerate(String fileName) {
        this.fileName = fileName;
    }

    /** Runs the generation; IO failures are logged, and 0 is always returned. */
    @Override
    public Integer call() throws Exception {
        try {
            generateCSV();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return 0;
    }

    private void generateCSV() throws IOException {
        // Fixed: the original called writer.close() outside any finally block,
        // leaking the writer (and its file handle) whenever writeNext threw.
        try (CSVWriter writer = new CSVWriter(new FileWriter(filePath + fileName + ".csv"),
                CSVWriter.DEFAULT_SEPARATOR, CSVWriter.NO_QUOTE_CHARACTER)) {
            // Replaces 20 hand-written calls: even index -> integer, odd -> string.
            String[] content = new String[header.length];
            for (int i = 0; i < content.length; i++) {
                content[i] = (i % 2 == 0)
                        ? RandomGenerator.generateRandomInteger()
                        : RandomGenerator.generateRandomStr();
            }
            writer.writeNext(header);
            writer.writeNext(content);
        }
    }
}
public类ParallelCsvGenerate实现了可调用{
私有静态字符串filePath=“C:\\5millionfiles\\”;
私有静态字符串[]头=新字符串[]{
“字段1”、“字段2”、“字段3”、“字段4”、“字段5”,
“字段6”、“字段7”、“字段8”、“字段9”、“字段10”,
“字段11”、“字段12”、“字段13”、“字段14”、“字段15”,
“字段16”、“字段17”、“字段18”、“字段19”、“字段20”,
};
私有字符串文件名;
公共并行CSVgenerate(字符串文件名){
this.fileName=文件名;
}
@凌驾
公共整数调用()引发异常{
试一试{
generateCSV();
}捕获(IOE异常){
e、 printStackTrace();
}
返回0;
}
私有void generateCSV()引发IOException{
CSVWriter writer=new CSVWriter(new FileWriter(filePath+fileName+“.csv”)、CSVWriter.DEFAULT_分隔符、CSVWriter.NO_QUOTE_字符);
字符串[]内容=新字符串[]{
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr(),
RandomGenerator.GeneratorDomainInteger(),
RandomGenerator.GeneratorAndomStr()
};
writer.writeNext(标题);
writer.writeNext(内容);
writer.close();
}
}
主要
publicstaticvoidmain(字符串[]args){
System.out.println(“开始生成”);
长启动=System.currentTimeMillis();
ThreadPoolExecutor ThreadPoolExecutor=新的ThreadPoolExecutor(8,8,
0L,时间单位为毫秒,
新建LinkedBlockingQueue());
List taskList=new ArrayList(3800000);
对于(int i=0;i<3800000;i++){
添加(新的ParallelCsvGenerate(i+“”));
}
试一试{
List futures=threadPoolExecutor.invokeAll(任务列表);
}捕捉(中断异常e){
e、 printStackTrace();
}
System.out.println(“成功”);
long end=System.currentTimeMillis();
System.out.println(“使用时间:+(结束-开始));
}
builder.toString()
)
可以用更聪明的循环消除 `i % 2 == 0` 的判断:以 `i += 2` 步进,在循环体内同时处理两列(位置 i 生成 int,位置 i+1 生成 string),必要时在循环外补一次额外迭代。另外,把 `append(String)` 改为 `append(char)`:`append(',')` 比 `append(",")` 更好。
…您可以使用Fork/Join框架(Java7及更高版本)使您的进程并行并使用多核Cpu。 我给你举个例子
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
import java.util.stream.LongStream;
/**
 * Fork/join demonstration: recursively splits a long[] and sums the two
 * halves in parallel, falling back to a sequential loop once a sub-range
 * is no larger than {@link #threshold} elements.
 */
public class ForkJoinAdd extends RecursiveTask<Long> {

    private final long[] numbers;
    private final int start;
    private final int end;

    /** Below this sub-range size the task sums sequentially instead of forking. */
    public static final long threshold = 10_000;

    public ForkJoinAdd(long[] numbers) {
        this(numbers, 0, numbers.length);
    }

    private ForkJoinAdd(long[] numbers, int start, int end) {
        this.numbers = numbers;
        this.start = start;
        this.end = end;
    }

    @Override
    protected Long compute() {
        int length = end - start;
        if (length <= threshold) {
            return add();
        }
        ForkJoinAdd firstTask = new ForkJoinAdd(numbers, start, start + length / 2);
        firstTask.fork(); // left half runs asynchronously on the pool
        ForkJoinAdd secondTask = new ForkJoinAdd(numbers, start + length / 2, end);
        Long secondTaskResult = secondTask.compute(); // right half on this thread
        Long firstTaskResult = firstTask.join();
        return firstTaskResult + secondTaskResult;
    }

    /** Sequential sum of numbers[start, end). */
    private long add() {
        long result = 0;
        for (int i = start; i < end; i++) {
            result += numbers[i];
        }
        return result;
    }

    /**
     * Sums 1..n in parallel.
     * Fixed: the original created a new ForkJoinPool on every call and never
     * shut it down, leaking its worker threads; the shared common pool is
     * managed by the JVM and needs no cleanup.
     */
    public static long startForkJoinSum(long n) {
        long[] numbers = LongStream.rangeClosed(1, n).toArray();
        ForkJoinTask<Long> task = new ForkJoinAdd(numbers);
        return ForkJoinPool.commonPool().invoke(task);
    }
}
import java.util.concurrent.ForkJoinPool;
导入java.util.concurrent.ForkJoinTask;
导入java.util.concurrent.RecursiveTask;
导入java.util.stream.LongStream;
公共类ForkJoinAdd扩展递归任务{
私人最终长[]号;
私人最终启动;
私人终端;
公共静态最终长阈值=10_000;
公共ForkJoinAdd(长[]个数字){
这(数字,0,数字,长度);
}
专用ForkJoinAdd(长[]个数字,整数开始,整数结束){
这个。数字=数字;
this.start=start;
this.end=end;
}
@凌驾
受保护的长计算(){
int长度=结束-开始;
如果(长度
- 从
run
方法中删除(int i=0;i<1000000;i++)的循环(保留一个generateCSV()
调用)
- 创建500万个
ParallelCsvGenerate
对象
- 将它们提交给
ThreadPoolExecutor
已转换的main
:
/**
 * Submits 5,000,000 one-shot CSV-generation tasks to a fixed pool of 8 workers.
 * Fixed: the original returned right after shutdown(), so the caller had no
 * way of knowing when the queued tasks had actually finished (and any timing
 * code around this method would measure only submission, not completion).
 */
public static void main(String[] args) {
    ThreadPoolExecutor ex = (ThreadPoolExecutor) Executors.newFixedThreadPool(8);
    for (int i = 0; i < 5000000; i++) {
        ParallelCsvGenerate generate = new ParallelCsvGenerate();
        ex.submit(generate);
    }
    ex.shutdown(); // stop accepting new tasks; queued ones keep running
    try {
        // Block until every task has completed (generous upper bound).
        ex.awaitTermination(1, java.util.concurrent.TimeUnit.DAYS);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // preserve interrupt status
    }
}
以实现30%的加速
我认为主要的瓶颈是硬盘驱动器和文件系统本身,在单台机器上很难再有更大提升;可以考虑在多台机器上用多个进程并行生成。请向我们展示您的 main 方法——您是否创建了多个线程?
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
import java.util.stream.LongStream;
/**
 * Fork/join example: sums a long[] by recursively splitting the range in two
 * and summing the halves in parallel; ranges at or below the threshold are
 * summed with a plain sequential loop.
 */
public class ForkJoinAdd extends RecursiveTask<Long> {

    /** Ranges no larger than this are summed directly instead of forked. */
    public static final long threshold = 10_000;

    private final long[] values;
    private final int lo;
    private final int hi;

    public ForkJoinAdd(long[] numbers) {
        this(numbers, 0, numbers.length);
    }

    private ForkJoinAdd(long[] values, int lo, int hi) {
        this.values = values;
        this.lo = lo;
        this.hi = hi;
    }

    @Override
    protected Long compute() {
        if (hi - lo <= threshold) {
            return sumDirectly();
        }
        int mid = lo + (hi - lo) / 2;
        ForkJoinAdd left = new ForkJoinAdd(values, lo, mid);
        left.fork();                        // left half runs asynchronously
        ForkJoinAdd right = new ForkJoinAdd(values, mid, hi);
        long rightResult = right.compute(); // right half on the current thread
        return left.join() + rightResult;
    }

    /** Sequential sum of values[lo, hi). */
    private long sumDirectly() {
        long total = 0;
        for (int idx = lo; idx < hi; idx++) {
            total += values[idx];
        }
        return total;
    }

    /** Builds the array 1..n and sums it with a fork/join pool. */
    public static long startForkJoinSum(long n) {
        long[] values = LongStream.rangeClosed(1, n).toArray();
        ForkJoinTask<Long> task = new ForkJoinAdd(values);
        return new ForkJoinPool().invoke(task);
    }
}
/**
 * Entry point: queues 5,000,000 CSV-generation tasks on an 8-thread fixed pool,
 * then initiates an orderly shutdown (already-queued tasks still run to completion).
 */
public static void main(String[] args) {
    ThreadPoolExecutor pool = (ThreadPoolExecutor) Executors.newFixedThreadPool(8);
    final int taskCount = 5000000;
    for (int submitted = 0; submitted < taskCount; submitted++) {
        pool.submit(new ParallelCsvGenerate());
    }
    pool.shutdown(); // no new tasks accepted; queued ones drain before the pool dies
}
// Snippet: write the CSV content with NIO2 instead of FileWriter.
// NOTE(review): relies on filePath, fileName and builder from the enclosing scope.
Path file = Paths.get(filePath + fileName + ".csv");
try(AsynchronousFileChannel asyncFile = AsynchronousFileChannel.open(file,
StandardOpenOption.WRITE,
StandardOpenOption.CREATE)) {
// NOTE(review): the Future returned by write() is ignored and the channel is
// closed on scope exit, so the asynchronous write may not have completed by
// then; getBytes() also uses the platform default charset — confirm both are
// intended before adopting this snippet.
asyncFile.write(ByteBuffer.wrap(builder.toString().getBytes()), 0);
} catch (IOException e) {
e.printStackTrace();
}