Javascript NodeJS-内存不足:在大数据处理上终止进程错误
我有几个.csv文件需要与另一个大的.csv文件(超过300000行)进行比较,我在服务器上遇到了内存不足错误。我在一个4GB内存的服务器上运行这个程序,所以我不确定为什么会发生这种情况,但我的代码看起来是这样的。 我使用ya csv读取csv行:Javascript NodeJS-内存不足:在大数据处理上终止进程错误,javascript,node.js,csv,synchronization,Javascript,Node.js,Csv,Synchronization,我有几个.csv文件需要与另一个大的.csv文件(超过300000行)进行比较,我在服务器上遇到了内存不足错误。我在一个4GB内存的服务器上运行这个程序,所以我不确定为什么会发生这种情况,但我的代码看起来是这样的。 我使用ya csv读取csv行: var csv = require('ya-csv'); var fs = require('graceful-fs'); var async = require('async'); var first_silo = []; var second
// Load the key columns of the two smaller CSV files into memory, then stream
// the large file ONCE and append every matching row to results.csv.
var csv = require('ya-csv');
var fs = require('graceful-fs');
var async = require('async');
var first_silo = [];
var second_silo = [];
var combined = [];

async.series([
  // Step 1: read every key from first_data.csv into memory.
  // Creating the reader lazily inside the step (instead of at module load)
  // prevents all three streams from flowing at the same time.
  function (callback) {
    var first = csv.createCsvFileReader('first_data.csv', { columnsFromHeader: false, 'separator': ',' });
    first.addListener('data', function (data) {
      first_silo.push(data[0]);
    });
    first.addListener('end', function () {
      callback();
    });
  },
  // Step 2: read every key from second_data.csv into memory.
  function (callback) {
    var second = csv.createCsvFileReader('second_data.csv', { columnsFromHeader: false, 'separator': ',' });
    second.addListener('data', function (data) {
      second_silo.push(data[0]);
    });
    second.addListener('end', function () {
      callback();
    });
  },
  // Step 3: stream the big file once, comparing each row against BOTH silos.
  // BUG FIX: the original split this into two series steps that each attached
  // a 'data' listener to the same reader and never invoked their callback —
  // so the series stalled and the second listener was registered only after
  // the stream had already been consumed.
  function (callback) {
    // Sets give O(1) membership tests instead of the original O(n) scan per row.
    var firstSet = new Set(first_silo);
    var secondSet = new Set(second_silo);
    var reader = csv.createCsvFileReader('december_raw.csv', { columnsFromHeader: true, 'separator': ',' });
    reader.addListener('data', function (data) {
      // On a match the original appended data[0]+","+silo[i], and since the
      // match is strict equality both halves are the same value.
      if (firstSet.has(data[0])) {
        // Newline added so results.csv holds one match per line.
        fs.appendFileSync('results.csv', data[0] + ',' + data[0] + '\n');
      }
      if (secondSet.has(data[0])) {
        fs.appendFileSync('results.csv', data[0] + ',' + data[0] + '\n');
      }
    });
    reader.addListener('end', function () {
      callback(); // BUG FIX: the original never signalled completion.
    });
  }
]);
var csv=require('ya-csv');
var fs = require('graceful-fs');
var async=require('async');
var first_silo=[];
var second_Silo=[];
var combined = [];
var reader=csv.createCsvFileReader('december_raw.csv',{columnsFromHeader:true,'separator':','});
var first=csv.createCsvFileReader('first_data.csv',{columnsFromHeader:false,'separator':','});
var second=csv.createCsvFileReader('second_data.csv',{columnsFromHeader:false,'separator':','})
异步系列([
//将其他.csv文件中的数据推送到阵列中
函数(回调){
first.addListener('data',函数(data){
第一次推送(数据[0]);
})
first.addListener('end',function()){
回调();
})
},
函数(回调){
second.addListener('data',函数(data){
第二次推送(数据[0]);
});
second.addListener('end',函数(数据){
回调();
});
},
函数(回调){
reader.addListener('data',函数(data){
//将读卡器中的数据与第一个数组中的每个项目进行比较,并将获得匹配项的项目附加到.csv。
for(var i=0;i<first_silo.length;i++){ … }
由于一些原因，您的算法运行效率非常低。请原谅，我将在不使用您正在使用的 async.series 调用的情况下执行此操作。希望它仍然有用。
第一件事:我在做一个假设。我假设您的第一个文件december_raw.csv
的数据大小小于您的第二个和第三个文件。即使不是这样,只要文件内容不超过您的内存限制,这应该仍然可以工作,而不会耗尽内存
第二,您同时加载两个阵列,而不是一次加载一个。这基本上是内存使用率的两倍
第三,我的直觉是,当你运行csv.createCsvFileReader时,你会同时在所有文件上启动流。你可能不希望这样
因为您要将两个文件与december\u raw.csv
的内容进行比较,所以最好将该文件的内容完全加载到内存中,然后使用回调函数和通用比较函数将其他两个文件与该文件进行串流比较
var csv = require('ya-csv');
var fs = require('graceful-fs');
var reader_silo = []; // holds the key column (first field) of the main csv.
var reader = csv.createCsvFileReader('december_raw.csv', { columnsFromHeader: true, 'separator': ',' });
reader.addListener('data', function (data) {
  reader_silo.push(data[0]); // keep only the first column of each row
});
reader.addListener('end', function () {
  // Main file fully loaded: compare the two smaller files one after the
  // other, never both at once.
  compareRows('first_data.csv', function () {
    compareRows('second_data.csv');
  });
});

/**
 * Streams csvFileName and appends every row whose first column appears in
 * reader_silo to results.csv (one match per line). Invokes callBack, if
 * provided, once the stream ends.
 */
function compareRows(csvFileName, callBack) {
  // PERF FIX: a Set makes each membership test O(1); the original scanned the
  // entire reader_silo array (O(n)) for every single streamed row, which is
  // accidental O(n*m) over 300k+ rows.
  var siloSet = new Set(reader_silo);
  var csvStream = csv.createCsvFileReader(csvFileName, { columnsFromHeader: false, 'separator': ',' });
  csvStream.addListener('data', function (data) {
    if (siloSet.has(data[0])) {
      // The match is strict equality, so both appended halves are the same
      // value. BUG FIX: newline added so results.csv is not one endless line.
      fs.appendFileSync('results.csv', data[0] + ',' + data[0] + '\n');
    }
  });
  csvStream.addListener('end', function () {
    if (callBack && typeof callBack === 'function') callBack();
  });
}
这里有一个更节省内存的答案,没有任何假设。
在它中,确保将最小的CSV文件作为第一个参数传递给compareRows
函数
// Read the key column of the main csv fully into memory, then stream each of
// the two data files against it, strictly one at a time.
var csv = require('ya-csv');
var fs = require('graceful-fs');
var reader_silo = []; // first column of every row in december_raw.csv
var reader = csv.createCsvFileReader('december_raw.csv', { columnsFromHeader: true, 'separator': ',' });
reader.addListener('data', function (row) {
  reader_silo.push(row[0]);
});
reader.addListener('end', function () {
  // The main file is loaded; chain the two comparisons via the callback.
  compareRows('first_data.csv', function () {
    compareRows('second_data.csv');
  });
});

// Streams csvFileName row by row; any row whose first column matches an entry
// of reader_silo is appended to results.csv. callBack (optional) fires on end.
function compareRows(csvFileName, callBack) {
  var csvStream = csv.createCsvFileReader(csvFileName, { columnsFromHeader: false, 'separator': ',' });
  csvStream.addListener('data', function (row) {
    // indexOf returns the FIRST matching position, exactly like the
    // original for-loop with its break.
    var hit = reader_silo.indexOf(row[0]);
    if (hit !== -1) {
      fs.appendFileSync('results.csv', row[0] + "," + reader_silo[hit]);
    }
  });
  csvStream.addListener('end', function () {
    // typeof of a non-function (or undefined) is never "function", so the
    // extra truthiness guard of the original is redundant but equivalent.
    if (typeof callBack === "function") callBack();
  });
}
这确实确保了您的内存效率尽可能高,只在内存中存储尽可能小的集合
var csv = require('ya-csv');
var fs = require('graceful-fs');
var smallFileName = ""; // caches which small file is loaded, so repeat calls skip re-reading it.
var smaller_silo = [];

// Compare both large files against the same (cached) small file, then free the cache.
compareRows('smaller.csv', 'larger.csv', function () {
  compareRows('smaller.csv', 'anotherLarger.csv', function () {
    smaller_silo = [];
  }); // done
});

/**
 * Loads smallerFileName into memory (unless it is already the cached file),
 * then streams largerFileName against it. callBack runs when the comparison ends.
 */
function compareRows(smallerFileName, largerFileName, callBack) {
  if (smallerFileName !== smallFileName) {
    smallFileName = smallerFileName;
    // BUG FIX: reset the cache before loading a *different* small file.
    // The original only ever pushed, so a second distinct file would be
    // compared against the union of both files' rows.
    smaller_silo = [];
    var reader = csv.createCsvFileReader(smallerFileName, { columnsFromHeader: true, separator: ',' });
    reader.addListener('data', function (data) {
      smaller_silo.push(data[0]);
    });
    reader.addListener('end', function () {
      compareSmallerToLarger(largerFileName, callBack);
    });
  } else {
    compareSmallerToLarger(largerFileName, callBack);
  }
}

/**
 * Streams largerFileName; every row whose first column appears in smaller_silo
 * is appended to results.csv (one match per line).
 */
function compareSmallerToLarger(largerFileName, callBack) {
  // PERF FIX: Set lookup is O(1) per row instead of the original O(n) array scan.
  var smallerSet = new Set(smaller_silo);
  var csvStream = csv.createCsvFileReader(largerFileName, { columnsFromHeader: false, 'separator': ',' });
  csvStream.addListener('data', function (data) {
    if (smallerSet.has(data[0])) {
      // Strict-equality match means both halves are the same value.
      // BUG FIX: newline so results.csv contains one match per line.
      fs.appendFileSync('results.csv', data[0] + ',' + data[0] + '\n');
    }
  });
  csvStream.addListener('end', function () {
    if (callBack && typeof callBack === 'function') callBack();
  });
}
var csv=require('ya-csv');
var fs = require('graceful-fs');
var smallFileName=”“;//用于查看是否需要重新加载文件。
var_筒仓=[];
compareRows('small.csv','large.csv',function(){
compareRows('small.csv','anotherlarge.csv',function(){
较小的_思洛存储器=[];});//完成
});
函数compareRows(smallerFileName、largerFileName、回调){
变量读取器;
if(smallerFileName!==smallFileName){
smallFileName=smallerFileName;
reader=csv.createCsvFileReader(smallerFileName,{columnsFromHeader:true,分隔符:',});
reader.addListener('data',函数(data){
较小的_silo.push(数据[0]);
});
reader.addListener('end',function()){
CompareSmallerToLarge(大文件名,回调);
});
}
否则{
CompareSmallerToLarge(大文件名,回调);
}
}
函数CompareSmallerToLarge(大文件名,回调){
var csvStream=csv.createCsvFileReader(大文件名,{columnsFromHeader:false,'separator':','});
csvStream.addListener('data',函数(数据){
对于(变量i=0;i<较小的筒仓长度;i++){
如果(数据[0]==更小的_思洛存储器[i]){
fs.appendFileSync('results.csv', data[0] + "," + smaller_silo[i]);
打破
}
}
});
csvStream.addListener('end',函数(数据){
if(callBack&&typeof callBack==“function”)callBack();
});
}
无论如何,我不应该为这些事情而烦恼…它是64位服务器吗?如果是的话,使用64位节点可能会有所帮助。与上面的评论相同,但是,您提到您正在将它与另一个文件进行比较。看起来您实际上是在将它与其他两个文件进行比较?尝试使用array.forEach而不是for循环?