Javascript 在nodejs中读取和处理大文件(1-5gig)最有效的方法是什么?
我有一个使用 fs.createReadStream 逐行处理大型 XML 文件并将其保存到数组的代码。如果数组计数达到限制(例如:20),我暂停 createReadStream,然后使用 async.queue 处理该数组。
// code
var fs = require('fs');
var util = require('util');
var stream = require('stream');
var es = require("event-stream");
var async = require('async');
var request = require("request");
var neo4j = require('node-neo4j');
var exec = require('child_process').exec;
var util = require('util');
var dbPediaExtractor = require('./dbPediaExtractor');
var dbpExt = new dbPediaExtractor();
// Connection settings for the local Neo4j REST endpoint.
var host = 'localhost';
var port = 7474;
var username = "neo4j";
var password = "12345";
// Buffer.from() replaces the deprecated (and unsafe) `new Buffer(string)`.
var auth = "Basic " + Buffer.from(username + ":" + password).toString("base64");
var httpUrlForTransaction = 'http://' + host + ':' + port + '/db/data/transaction/commit';
// Batch size and queue concurrency; may be overridden by CLI args below.
var limit = 200;
var concurrency = 200;
// CLI: node script.js <file.bz2> [limit] [concurrency]
var action = process.argv[2];
var defLimit = process.argv[3];
var defConcurrency = process.argv[4];
// Guard: the original crashed with a TypeError here when no file argument
// was given, before the typeof(action) check near the bottom could run.
var _ntfile = action ? action.replace('.bz2', '') : undefined;
var queryArray = [];
/**
 * POSTs one Cypher statement to the Neo4j transactional commit endpoint.
 * @param {string} statements - a single Cypher statement string.
 * @param {function(Error, Object)} callback - invoked with (err, responseBody).
 */
function runCypherQuery(statements, callback) {
  // Wrap the raw statement in the JSON envelope the transaction API expects.
  // (The original redeclared `var statements`, shadowing the parameter.)
  var payload = { "statements": [{ "statement": statements }] };
  request.post({
    uri: httpUrlForTransaction,
    json: payload,
    headers: {
      "Authorization": auth
    }
  },
  function (err, res, body) {
    callback(err, body);
  });
}
/**
 * Pushes every element of `item` through an async.queue that runs
 * dbpExt.run, collecting the resulting query strings into the shared
 * module-level queryArray. Calls cb() once the queue drains.
 * Per-item extraction errors are skipped (best effort), but now logged.
 * @param {Array} item - batch of parsed URLs to process.
 * @param {function} cb - invoked when the whole batch has been processed.
 */
var carrier = function(item, cb) {
  // Robustness: async.queue never fires `drain` for an empty queue, so the
  // original silently hung (cb never called) when given an empty batch.
  if (item.length === 0) {
    cb();
    return;
  }
  var q = async.queue(function (task, callback) {
    dbpExt.run(task, function(err, querString) {
      if (err) {
        // Best-effort: a failed extraction is skipped, not fatal — but
        // the original swallowed the error without any trace.
        console.error("dbpExt.run failed:", err);
        callback();
      } else {
        queryArray.push(querString);
        callback();
      }
    });
  }, concurrency);
  q.drain = function() {
    var message = 'finished processing ' + item.length + " number of queries ";
    message = message + "with concurrency of " + concurrency + " processe(s)";
    console.log(message);
    cb();
  };
  // Plain indexed loop: the original `for (i in item)` leaked `i` as an
  // implicit global and would also iterate inherited/non-index keys.
  for (var i = 0; i < item.length; i++) {
    q.push(item[i]);
  }
};
/**
 * Callback for the bzip2 decompression child process. On success, streams
 * the decompressed N-Triples file line by line, batching up to `limit`
 * parsed URLs and flushing each batch through carrier()/runCypherQuery()
 * while the read stream is paused; deletes the decompressed file at the end.
 * @param {Error|null} error - exec error, if decompression failed.
 * @param {string} stdout - unused.
 * @param {string} stderr - unused.
 */
var exCallback = function(error, stdout, stderr) {
  if (error !== null) {
    console.log("error: \n", error);
    return;
  }
  var itemJson = [];
  // Flush the current batch: extract queries, POST them, then call done().
  // Extracted because the original duplicated this logic in the mapSync
  // handler and the 'close' handler.
  var flushBatch = function(done) {
    console.log(itemJson);
    carrier(itemJson, function(err) {
      if (err) {
        console.log("reply status: ", err);
      }
      var querString = queryArray.join(" ");
      runCypherQuery(querString, function(err, resp) {
        if (err) {
          // The original invoked an undefined `callback(err)` here
          // (ReferenceError) and never resumed the stream on error (hang).
          console.log("cypher query failed: ", err);
        } else {
          console.log(querString);
        }
        // Reset batch state in both branches so processing can continue.
        queryArray = [];
        itemJson = [];
        done();
      });
    });
  };
  var rstream = fs.createReadStream(_ntfile)
    .pipe(es.split())
    .pipe(es.mapSync(function(line) {
      // Pause while we parse (and possibly flush) this line.
      rstream.pause();
      line = line.toString();
      var urlArrays = line.split(" ");
      var res_urls = [];
      for (var i = 0; i < urlArrays.length; i++) {
        // Extract the URI between the angle brackets of each term.
        var urlSearch = urlArrays[i].match(/<(.*?)>/);
        if (urlSearch != null) { res_urls.push(urlSearch[1]); }
      }
      // Skip lines with no parseable URI — the original pushed `undefined`
      // into the batch for blank/malformed lines.
      if (res_urls.length > 0) {
        itemJson.push(res_urls[0]);
      }
      if (itemJson.length < limit) {
        rstream.resume();
      } else {
        // Batch full: flush, then resume. Note the original dropped the
        // current line's URL entirely whenever the batch was already full.
        flushBatch(function() {
          rstream.resume();
        });
      }
    })
    .on('error', function(err) {
      console.log('Error while reading file.');
      console.log(err);
    })
    .on('end', function() {
      console.log('end na!!! <3');
      // Remove the decompressed copy once fully consumed.
      fs.unlink(_ntfile, function(err) {
        if (err) {
          console.log('file was not deleted');
        } else {
          console.log('uploading of files is done');
        }
      });
    })
    .on('close', function() {
      // Flush the final, partial batch (if any).
      if (itemJson.length > 0) {
        flushBatch(function() {});
      }
    }));
};
// CLI bootstrap: decompress the given .bz2 file, then exCallback streams it.
if (typeof(action) != "undefined") {
  // Coerce optional numeric overrides to real numbers. The original assigned
  // the raw argv strings, leaving `limit`/`concurrency` as strings.
  // (isNaN('') is false, so the empty string must be excluded explicitly.)
  if (typeof(defLimit) != 'undefined' && defLimit !== '' && !isNaN(defLimit)) {
    limit = parseInt(defLimit, 10);
  }
  if (typeof(defConcurrency) != 'undefined' && defConcurrency !== '' && !isNaN(defConcurrency)) {
    concurrency = parseInt(defConcurrency, 10);
  }
  // SECURITY: `action` comes straight from argv and is interpolated into a
  // shell command — a crafted filename allows command injection. Prefer
  // execFile('bzip2', ['-dk', action], exCallback).
  var cli_code = "bzip2 -dk " + action;
  var child = exec(cli_code, exCallback); // was an implicit global
}
//代码
var fs=需要('fs');
var util=require('util');
var stream=require(‘stream’);
var es=要求(“事件流”);
var async=require('async');
var请求=要求(“请求”);
var neo4j=require('node-neo4j');
var exec=require('child_process')。exec;
var util=require('util');
var dbPediaExtractor=require('./dbPediaExtractor');
var dbpExt=new dbPediaExtractor();
var host='localhost';
var端口=7474;
var username=“neo4j”;
var password=“12345”;
var auth=“Basic”+新缓冲区(用户名+:“+密码).toString(“base64”);
var httpUrlForTransaction='http://'+host+':'+port+'/db/data/transaction/commit';
var限值=200;
var并发=200;
var action=process.argv[2];
var defLimit=process.argv[3];
var defConcurrency=process.argv[4];
var _ntfile=action.replace('.bz2','');
变量queryArray=[];
函数runCypherQuery(语句、回调){
var语句={“语句”:[{
“声明”:声明
} ] };
请寄({
uri:httpUrlForTransaction,
json:语句,
标题:{
“授权”:授权
}
},
功能(错误、恢复、正文){
回调(err,body);
})
};
var载体=功能(项目,cb){
var q=async.queue(函数(任务、回调){
run(任务,函数(err,querystring){
如果(错误){
回调();
}否则{
queryArray.push(querystring);
回调();
}
});
},并发性);
q、 drain=函数(){
var message='finished processing'+item.length+“查询数”;
message=message+,并发性为“+concurrency+”进程;
控制台日志(消息);
cb();
};
第(i)项{
q、 推动(第[i]项);
}
};
var exCallback=函数(错误、标准输出、标准输出){
如果(错误!==null){
日志(“错误:\n”,错误);
}否则{
var计数器=0;
var itemJson=[];
rstream=fs.createReadStream(\u ntfile)
.pipe(es.split())
.pipe(es.mapSync(函数(行){
restream.pause();
line=line.toString();
urlArrays=line.split(“”);
var res_url=新数组();
如果(URL数组){
for(URL数组中的i){
var urlSearch=urlArrays[i]。匹配(//);
如果(urlSearch!=null){res_url.push(urlSearch[1]);};
}
};
回答:您正在每一行之后暂停流。您应该在 es.split() 之后再添加一个 es.mapSync(),在暂停流之前先堆叠一批 20 行(也可以堆叠更多)。
评论:出于好奇,处理您的一个文件需要多少时间?—— 通常我的文件有 100 万行,流每秒处理 1000–2000 行。—— 好的,您尝试过批处理吗?它加快代码速度了吗?—— 是的!:) 谢谢!