Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/multithreading/4.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Java 通过自定义筛选器拆分巨大的CSV?_Java_Multithreading_Parsing_Batch File_File Io - Fatal编程技术网

Java 通过自定义筛选器拆分巨大的CSV?

Java 通过自定义筛选器拆分巨大的CSV?,java,multithreading,parsing,batch-file,file-io,Java,Multithreading,Parsing,Batch File,File Io,我有巨大的(>5GB)CSV文件,格式如下: 用户名,事务 我希望每个用户都有一个单独的CSV文件作为输出,只有他的所有事务都采用相同的格式。我脑子里没有什么想法,但我想听听关于有效(快速和内存高效)实现的其他想法 以下是我到目前为止所做的。第一个测试是单线程读/处理/写,第二个测试是多线程。表现不是很好,所以我觉得我做错了什么。请纠正我 public class BatchFileReader { private ICsvBeanReader beanReader; private dou

我有巨大的(>5GB)CSV文件,格式如下: 用户名,事务

我希望每个用户都有一个单独的CSV文件作为输出，只有他的所有事务都采用相同的格式。我脑子里没有什么想法，但我想听听关于有效（快速和内存高效）实现的其他想法。

以下是我到目前为止所做的。第一个测试是单线程读/处理/写，第二个测试是多线程。表现不是很好，所以我觉得我做错了什么，请纠正我。

public class BatchFileReader {


// Super CSV reader that maps each CSV row to a LoginDto bean.
private ICsvBeanReader beanReader;
// Accumulated batch-read time in seconds (printed by readFile()).
private double total;
// Column names taken from the CSV header row.
private String[] header;
// Super CSV cell processors, one per column.
private CellProcessor[] processors;
// Callback that receives each batch of logins, grouped by username.
private DataTransformer<HashMap<String, List<LoginDto>>> processor;
// Flipped to false once the reader returns null (end of file).
private boolean hasMoreRecords = true;

/**
 * Opens the given CSV file and prepares the Super CSV header/processors.
 *
 * @param file      path to the input CSV file
 * @param processor callback invoked with each batch of logins grouped by
 *                  username; may be null (batches are then discarded)
 * @throws RuntimeException if the file cannot be opened or its header read
 */
public BatchFileReader(String file, DataTransformer<HashMap<String, List<LoginDto>>> processor) {
    try {
        this.processor = processor;
        this.beanReader = new CsvBeanReader(new FileReader(file), CsvPreference.STANDARD_PREFERENCE);
        header = CSVUtils.getHeader(beanReader.getHeader(true));
        processors = CSVUtils.getProcessors();
    } catch (IOException e) {
        // Previously only printStackTrace(): the object was left half-built
        // (beanReader == null) and read() later died with an NPE. Fail fast.
        throw new RuntimeException("Failed to open CSV file: " + file, e);
    }
}

/**
 * Reads the whole file batch by batch, guaranteeing the underlying
 * reader is closed afterwards. I/O failures are reported to stderr.
 */
public void read() {
    try {
        readFile();
    } catch (IOException readFailure) {
        readFailure.printStackTrace();
    } finally {
        if (beanReader == null) {
            return;
        }
        try {
            beanReader.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
    }
}

/**
 * Main read loop: pulls batches until end of file, timing each one and
 * handing non-empty batches to the processor callback.
 *
 * @throws IOException if reading the CSV fails
 */
private void readFile() throws IOException {
    while (hasMoreRecords) {
        final long batchStart = System.currentTimeMillis();
        HashMap<String, List<LoginDto>> batch = readBatch();
        final long batchEnd = System.currentTimeMillis();

        float elapsedSeconds = (batchEnd - batchStart) / 1000f;
        System.out.println("Reading batch for " + elapsedSeconds + " seconds.");
        total += elapsedSeconds;

        boolean hasConsumer = processor != null;
        if (hasConsumer && !batch.isEmpty()) {
            processor.transform(batch);
        }
    }
    System.out.println("total = " + total);
}

/**
 * Reads up to CONFIG.READ_BATCH_SIZE rows and groups them by username.
 * Sets hasMoreRecords to false when the reader reaches end of file.
 *
 * @return map from username to that user's logins; may be empty at EOF
 * @throws IOException if reading the CSV fails
 */
private HashMap<String, List<LoginDto>> readBatch() throws IOException {
    HashMap<String, List<LoginDto>> users = new HashMap<String, List<LoginDto>>();
    int readLoginCount = 0;
    while (readLoginCount < CONFIG.READ_BATCH_SIZE) {
        LoginDto login = beanReader.read(LoginDto.class, header, processors);
        if (login == null) {
            // End of file: finish this (possibly partial) batch and stop the outer loop.
            hasMoreRecords = false;
            break;
        }
        // One map lookup instead of containsKey + put + get.
        List<LoginDto> logins = users.get(login.getUsername());
        if (logins == null) {
            logins = new LinkedList<LoginDto>();
            users.put(login.getUsername(), logins);
        }
        logins.add(login);
        readLoginCount++;
    }
    return users;
}
公共类BatchFileReader{
私有ICsvBeanReader beanReader;
私人双总;
私有字符串[]头;
专用蜂窝处理器[]处理器;
专用数据转换器处理器;
私有布尔hasMoreRecords=true;
公共BatchFileReader(字符串文件、DataTransformer处理器){
试一试{
this.processor=处理器;
this.beanReader=new CsvBeanReader(新文件读取器(文件),CsvPreference.STANDARD_首选项);
header=CSVUtils.getHeader(beanReader.getHeader(true));
processors=CSVUtils.getProcessors();
}捕获(IOE异常){
e、 printStackTrace();
}
}
公共无效读取(){
试一试{
readFile();
}捕获(IOE异常){
e、 printStackTrace();
}最后{
if(beanReader!=null){
试一试{
beanReader.close();
}捕获(IOE异常){
e、 printStackTrace();
}
}
}
}
私有void readFile()引发IOException{
while(hasMoreRecords){
长启动=System.currentTimeMillis();
HashMap usersBatch=readBatch();
long end=System.currentTimeMillis();
System.out.println(“读取批次为”+((结束-开始)/1000f)+“秒”);
总+=((结束-开始)/1000f);
if(processor!=null&&!usersBatch.isEmpty()){
处理器.transform(usersBatch);
}
}
System.out.println(“total=“+total”);
}
私有HashMap readBatch()引发IOException{
HashMap用户=新的HashMap();
int readLoginCount=0;
while(readLoginCount
}

公共类BatchFileWriter{

// Destination file path; writeFile() opens it in append mode.
private final String file;

// Records to write, one per line via toString(). T is presumably the
// enclosing class's type parameter — the class header is garbled in this
// page, so confirm against the original source.
private final List<T> processedData;

/**
 * @param file          path of the output file (appended to, not truncated)
 * @param processedData records that write() will append to the file
 */
public BatchFileWriter(final String file,  List<T> processedData) {
    this.processedData = processedData;
    this.file = file;
}

/**
 * Appends all records to the target file. I/O failures are reported to
 * stderr and otherwise swallowed (best-effort contract kept from the
 * original code).
 */
public void write() {
    try {
        writeFile(file, processedData);
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The original empty finally block was dead code and has been removed;
    // writeFile() itself is responsible for closing its writer.
}

/**
 * Appends every record (via toString(), one per line) to the given file
 * and prints how long the write took.
 *
 * @param file          destination path, opened in append mode so batches
 *                      for the same user accumulate into one file
 * @param processedData records to write
 * @throws IOException if the file cannot be opened or written
 */
private void writeFile(final String file, final List<T> processedData) throws IOException {
    System.out.println("START WRITE " + "  " + file);
    FileWriter writer = new FileWriter(file, true);

    long start = System.currentTimeMillis();

    try {
        for (T record : processedData) {
            writer.write(record.toString());
            writer.write("\n");
        }
        writer.flush();
    } finally {
        // BUG FIX: the writer previously leaked when a write threw;
        // always close it, even on failure.
        writer.close();
    }

    long end = System.currentTimeMillis();
    System.out.println("Writing in file " + file + " complete for " + ((end - start) / 1000f) + " seconds.");

}
私有最终字符串文件;
私有最终列表处理数据;
公共BatchFileWriter(最终字符串文件,列表处理数据){
this.file=文件;
this.processedData=processedData;
}
公共空写(){
试一试{
writeFile(文件、处理数据);
}捕获(IOE异常){
e、 printStackTrace();
}最后{
}
}
私有void writeFile(最终字符串文件,最终列表processedData)引发IOException{
System.out.println(“开始写入”+“”+文件);
FileWriter=newfilewriter(file,true);
长启动=System.currentTimeMillis();
对于(T记录:processedData){
writer.write(record.toString());
writer.write(“\n”);
}
writer.flush();
writer.close();
long end=System.currentTimeMillis();
System.out.println(“写入文件”+文件+”完成“+((结束-开始)/1000f)+“秒”);
}
}

公共类登录测试{

// Single thread that runs the CSV reading loop in the multi-threaded test.
private static final ExecutorService executor = Executors.newSingleThreadExecutor();
// Worker pool for grouping and writing batches: one thread per core, plus one.
private static final ExecutorService procExec = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);

/**
 * Baseline test: read, group, and write everything on the calling thread,
 * then print the total elapsed time.
 */
@Test
public void testSingleThreadCSVtoCSVSplit() throws InterruptedException, ExecutionException {
    long startMillis = System.currentTimeMillis();

    // Writes each user's logins straight to "<username>.csv" as they arrive.
    DataTransformer<HashMap<String, List<LoginDto>>> simpleSplitProcessor =  new DataTransformer<HashMap<String, List<LoginDto>>>() {
        @Override
        public void transform(HashMap<String, List<LoginDto>> data) {
            for (String username : data.keySet()) {
                new BatchFileWriter<LoginDto>(username + ".csv", data.get(username)).write();
            }
        }

    };

    new BatchFileReader("loadData.csv", simpleSplitProcessor).read();

    long endMillis = System.currentTimeMillis();
    System.out.println("TOTAL " + ((endMillis - startMillis)/ 1000f) + " seconds.");
}

/**
 * Multi-threaded variant: one thread reads batches, a worker pool groups
 * and writes them. Waits for BOTH pools to finish before timing, so the
 * printed total covers the writes, not just the read.
 */
@Test
public void testMultiThreadCSVtoCSVSplit() throws InterruptedException, ExecutionException {

    long start = System.currentTimeMillis();
    System.out.println(start);

    final DataTransformer<HashMap<String, List<LoginDto>>> simpleSplitProcessor =  new DataTransformer<HashMap<String, List<LoginDto>>>() {
        @Override
        public void transform(HashMap<String, List<LoginDto>> data) {
            System.out.println("transform");
            processAsync(data);
        }
    };
    final CountDownLatch readLatch = new CountDownLatch(1);
    executor.execute(new Runnable() {
    @Override
    public void run() {
        BatchFileReader reader = new BatchFileReader("loadData.csv", simpleSplitProcessor);
        reader.read();
        System.out.println("read latch count down");
        readLatch.countDown();
    }});
    System.out.println("read latch before await");
    readLatch.await();
    System.out.println("read latch after await");
    procExec.shutdown();
    executor.shutdown();
    // BUG FIX: shutdown() only stops new submissions — queued write tasks may
    // still be running, so the original code printed TOTAL (and ended the test)
    // while files were still being written. Wait for both pools to drain.
    // NOTE(review): processAsync tasks that run after shutdown() still submit
    // writeASync jobs to procExec and will be rejected; the fan-out should
    // really be restructured so all submissions happen before shutdown.
    procExec.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
    executor.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
    long end = System.currentTimeMillis();
    System.out.println("TOTAL " + ((end - start)/ 1000f) + " seconds.");

}


/**
 * Queues a single fan-out task that submits one per-user write job for
 * every username in the batch.
 */
private void processAsync(final HashMap<String, List<LoginDto>> data) {
    Runnable fanOut = new Runnable() {
        @Override
        public void run() {
            for (String username : data.keySet()) {
                writeASync(username, data.get(username));
            }
        }

    };
    procExec.execute(fanOut);
}

/**
 * Queues a task that appends the given user's logins to "<user>.csv".
 */
private void writeASync(final String field, final List<LoginDto> data) {
    Runnable writeJob = new Runnable() {
        @Override
        public void run() {
            new BatchFileWriter<LoginDto>(field + ".csv", data).write();
        }
    };
    procExec.execute(writeJob);
}
private static final executor service executor=Executors.newSingleThreadExecutor();
private static final ExecutorService procExec=Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()+1);
@试验
public void testSingleThreadCsvTocsSplit()引发InterruptedException,ExecutionException{
长启动=System.currentTimeMillis();
DataTransformer simpleSplitProcessor=新DataTransformer(){
@凌驾
公共void转换(HashMap数据){
for(字符串字段:data.keySet()){
新的BatchFileWriter(字段+“.csv”,data.get(字段)).write();
}
}
};
BatchFileReader=新的BatchFileReader(“loadData.csv”,simpleSplitProcessor);
reader.read();
long end=System.currentTimeMillis();
System.out.println(“总计”+((结束-开始)/1000f)+“秒”);
}
@试验
public void testmulti-threadcsvtocssplit()引发InterruptedException、ExecutionException{
长启动=System.currentTimeMillis();
系统输出打印项次(开始);
最终DataTransformer simpleSplitProcessor=新DataTransformer(){
@凌驾
公共void转换(HashMap数据){
System.out.println(“转换”);
processAsync(数据);
}
};
最终倒计时闩锁读取闩锁=新倒计时闩锁(1);
executor.execute(新的Runnable(){
@凌驾
公开募捐{
BatchFileReader=新的BatchFileReader(“loadData.csv”,simpleSplitProcessor);
reader.read();
System.out.println(“读取锁存倒计时”);
readlack.countDown();
}});
System.out.println(“等待前读取闩锁”);
readlack.await();
System.out.println(“等待后读取闩锁”);
procExec.shutdown();
executor.shutdown();
long end=System.currentTimeMillis();
System.out.println(“总计”+((结束-开始)/1000f)+“秒”);
}
私有void processAsync(最终哈希映射数据){
execute(新的Runnable(){
@凌驾
公开募捐{
for(字符串字段:data.keySet()){
writeASync(字段,data.get(字段));
}
}
});     
}
私有void writeASync(最终字符串字段,最终列表数据){
execute(新的Runnable(){
@凌驾
公开募捐{
// Simple single-threaded split: stream the input line by line and append each
// transaction to a per-user file. Opening/closing the output on every line is
// slow for a 5 GB input, but memory usage stays constant.
Scanner s = new Scanner(new File("/my/dir/users-and-transactions.txt"));
try {
    while (s.hasNextLine()) {
        String line = s.nextLine();
        // Limit 2: everything after the first comma is the transaction,
        // so transactions that themselves contain commas stay intact.
        String[] tokens = line.split(",", 2);
        if (tokens.length < 2) {
            // BUG FIX: a blank or malformed line previously threw
            // ArrayIndexOutOfBoundsException on tokens[1]; skip it instead.
            continue;
        }
        String user = tokens[0];
        String transaction = tokens[1];
        PrintStream out = new PrintStream(new FileOutputStream("/my/dir/" + user, true));
        try {
            out.println(transaction);
        } finally {
            // BUG FIX: close even if println fails, so file handles don't leak.
            out.close();
        }
    }
} finally {
    s.close();
}
// Apache Camel alternative: poll the CSV file, parse each message with the
// "lineParser" bean, and hand the results to an in-memory SEDA queue.
from("file:/myfile.csv")
.beanRef("lineParser")
.to("seda:internal-queue");

// Drain the queue with five parallel consumers feeding the writer endpoint —
// parallel writes without hand-rolled thread management.
// NOTE(review): concurrentConsumers is normally a SEDA endpoint query option
// ("seda:internal-queue?concurrentConsumers=5"), not a route-DSL method —
// verify this fluent form compiles against the Camel version in use.
from("seda:internal-queue")
.concurrentConsumers(5)
.to("fileWriter");