Java: splitting a huge CSV with a custom filter?
I have a huge (> 5 GB) CSV file in the format:

username,transaction

I want a separate CSV file per user as output, containing all of that user's transactions in the same format. I don't have many ideas of my own, and I would like to hear suggestions for an efficient (fast and memory-friendly) implementation.

Here is what I have so far. The first test is single-threaded read/process/write, the second is multithreaded. Performance is not great, so I suspect I am doing something wrong. Please correct me.
public class BatchFileReader {

    private ICsvBeanReader beanReader;
    private double total;
    private String[] header;
    private CellProcessor[] processors;
    private DataTransformer<HashMap<String, List<LoginDto>>> processor;
    private boolean hasMoreRecords = true;

    public BatchFileReader(String file, DataTransformer<HashMap<String, List<LoginDto>>> processor) {
        try {
            this.processor = processor;
            this.beanReader = new CsvBeanReader(new FileReader(file), CsvPreference.STANDARD_PREFERENCE);
            header = CSVUtils.getHeader(beanReader.getHeader(true));
            processors = CSVUtils.getProcessors();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void read() {
        try {
            readFile();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (beanReader != null) {
                try {
                    beanReader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    // Reads the file in batches and hands each non-empty batch to the transformer.
    private void readFile() throws IOException {
        while (hasMoreRecords) {
            long start = System.currentTimeMillis();
            HashMap<String, List<LoginDto>> usersBatch = readBatch();
            long end = System.currentTimeMillis();
            System.out.println("Reading batch for " + ((end - start) / 1000f) + " seconds.");
            total += ((end - start) / 1000f);
            if (processor != null && !usersBatch.isEmpty()) {
                processor.transform(usersBatch);
            }
        }
        System.out.println("total = " + total);
    }

    // Groups up to READ_BATCH_SIZE records by username.
    private HashMap<String, List<LoginDto>> readBatch() throws IOException {
        HashMap<String, List<LoginDto>> users = new HashMap<String, List<LoginDto>>();
        int readLoginCount = 0;
        while (readLoginCount < CONFIG.READ_BATCH_SIZE) {
            LoginDto login = beanReader.read(LoginDto.class, header, processors);
            if (login != null) {
                if (!users.containsKey(login.getUsername())) {
                    List<LoginDto> logins = new LinkedList<LoginDto>();
                    users.put(login.getUsername(), logins);
                }
                users.get(login.getUsername()).add(login);
                readLoginCount++;
            } else {
                hasMoreRecords = false;
                break;
            }
        }
        return users;
    }
}
public class BatchFileWriter<T> {

    private final String file;
    private final List<T> processedData;

    public BatchFileWriter(final String file, List<T> processedData) {
        this.file = file;
        this.processedData = processedData;
    }

    public void write() {
        try {
            writeFile(file, processedData);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Appends all records to the target file, one per line.
    private void writeFile(final String file, final List<T> processedData) throws IOException {
        System.out.println("START WRITE " + " " + file);
        FileWriter writer = new FileWriter(file, true); // true = append mode
        long start = System.currentTimeMillis();
        for (T record : processedData) {
            writer.write(record.toString());
            writer.write("\n");
        }
        writer.flush();
        writer.close();
        long end = System.currentTimeMillis();
        System.out.println("Writing in file " + file + " complete for " + ((end - start) / 1000f) + " seconds.");
    }
}
public class LoginsTest {

    private static final ExecutorService executor = Executors.newSingleThreadExecutor();
    private static final ExecutorService procExec = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() + 1);

    @Test
    public void testSingleThreadCSVtoCSVSplit() throws InterruptedException, ExecutionException {
        long start = System.currentTimeMillis();
        DataTransformer<HashMap<String, List<LoginDto>>> simpleSplitProcessor = new DataTransformer<HashMap<String, List<LoginDto>>>() {
            @Override
            public void transform(HashMap<String, List<LoginDto>> data) {
                for (String field : data.keySet()) {
                    new BatchFileWriter<LoginDto>(field + ".csv", data.get(field)).write();
                }
            }
        };
        BatchFileReader reader = new BatchFileReader("loadData.csv", simpleSplitProcessor);
        reader.read();
        long end = System.currentTimeMillis();
        System.out.println("TOTAL " + ((end - start) / 1000f) + " seconds.");
    }

    @Test
    public void testMultiThreadCSVtoCSVSplit() throws InterruptedException, ExecutionException {
        long start = System.currentTimeMillis();
        System.out.println(start);
        final DataTransformer<HashMap<String, List<LoginDto>>> simpleSplitProcessor = new DataTransformer<HashMap<String, List<LoginDto>>>() {
            @Override
            public void transform(HashMap<String, List<LoginDto>> data) {
                System.out.println("transform");
                processAsync(data);
            }
        };
        final CountDownLatch readLatch = new CountDownLatch(1);
        executor.execute(new Runnable() {
            @Override
            public void run() {
                BatchFileReader reader = new BatchFileReader("loadData.csv", simpleSplitProcessor);
                reader.read();
                System.out.println("read latch count down");
                readLatch.countDown();
            }
        });
        System.out.println("read latch before await");
        readLatch.await();
        System.out.println("read latch after await");
        procExec.shutdown();
        executor.shutdown();
        long end = System.currentTimeMillis();
        System.out.println("TOTAL " + ((end - start) / 1000f) + " seconds.");
    }

    private void processAsync(final HashMap<String, List<LoginDto>> data) {
        procExec.execute(new Runnable() {
            @Override
            public void run() {
                for (String field : data.keySet()) {
                    writeASync(field, data.get(field));
                }
            }
        });
    }

    private void writeASync(final String field, final List<LoginDto> data) {
        procExec.execute(new Runnable() {
            @Override
            public void run() {
                new BatchFileWriter<LoginDto>(field + ".csv", data).write();
            }
        });
    }
}
One answer suggests a straightforward single-threaded pass: read each line, split on the comma, and append the transaction to that user's file.

Scanner s = new Scanner(new File("/my/dir/users-and-transactions.txt"));
while (s.hasNextLine()) {
    String line = s.nextLine();
    String[] tokens = line.split(",");
    String user = tokens[0];
    String transaction = tokens[1];
    // true = append mode, so repeated users accumulate in the same file
    PrintStream out = new PrintStream(new FileOutputStream("/my/dir/" + user, true));
    out.println(transaction);
    out.close();
}
s.close();
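Opening and closing a FileOutputStream for every input line is costly on a 5 GB file. Below is a minimal sketch of the same idea with a cache of open, buffered writers; the class and method names (CsvSplitter, splitByUser) are made up for illustration, not from the original post.

import java.io.*;
import java.util.HashMap;
import java.util.Map;

public class CsvSplitter {

    // Streams the input once and appends each transaction to <user>.csv,
    // keeping one buffered writer open per user instead of reopening
    // the target file on every line.
    public static void splitByUser(File input, File outputDir) throws IOException {
        Map<String, BufferedWriter> writers = new HashMap<String, BufferedWriter>();
        try (BufferedReader reader = new BufferedReader(new FileReader(input))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int comma = line.indexOf(',');
                if (comma < 0) {
                    continue; // skip malformed lines
                }
                String user = line.substring(0, comma);
                BufferedWriter out = writers.get(user);
                if (out == null) {
                    // true = append mode, so the splitter can be re-run in batches
                    out = new BufferedWriter(new FileWriter(new File(outputDir, user + ".csv"), true));
                    writers.put(user, out);
                }
                out.write(line.substring(comma + 1));
                out.newLine();
            }
        } finally {
            for (BufferedWriter out : writers.values()) {
                out.close();
            }
        }
    }
}

One caveat: with a very large number of distinct users this would hit the operating system's open-file limit, so the writer map would need an upper bound (for example, closing the least-recently-used writer once the cache grows past a few hundred entries).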
from("file:/myfile.csv")
.beanRef("lineParser")
.to("seda:internal-queue");
from("seda:internal-queue")
.concurrentConsumers(5)
.to("fileWriter");