Warning: file_get_contents(/data/phpspider/zhask/data//catemap/1/hibernate/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Javascript 在nodejs中读取和处理大文件(1-5gig)最有效的方法是什么?_Javascript_Node.js_Neo4j - Fatal编程技术网

Javascript 在nodejs中读取和处理大文件(1-5gig)最有效的方法是什么?

Javascript 在nodejs中读取和处理大文件(1-5gig)最有效的方法是什么?,javascript,node.js,neo4j,Javascript,Node.js,Neo4j,我有一个使用fs.createreadstream逐行处理大型xml文件并将其保存到数组的代码。如果数组计数达到限制(例如:20),我暂停createreadstream,然后使用async.queue处理该数组 // code var fs = require('fs'); var util = require('util'); var stream = require('stream'); var es = require("event-stream"); var async

我有一个使用fs.createreadstream逐行处理大型xml文件并将其保存到数组的代码。如果数组计数达到限制(例如:20),我暂停createreadstream,然后使用async.queue处理该数组

// code
// --- module imports -------------------------------------------------------
var fs      = require('fs');
var util    = require('util');        // duplicate `require('util')` removed below
var stream  = require('stream');
var es      = require("event-stream");

var async   = require('async');
var request = require("request");
var neo4j   = require('node-neo4j');
var exec    = require('child_process').exec;

var dbPediaExtractor = require('./dbPediaExtractor');
var dbpExt = new dbPediaExtractor();


// --- Neo4j connection settings --------------------------------------------
var host = 'localhost';
var port = 7474;

var username = "neo4j";
var password = "12345";
// Buffer.from() replaces the deprecated/unsafe `new Buffer()` constructor.
var auth = "Basic " + Buffer.from(username + ":" + password).toString("base64");
var httpUrlForTransaction = 'http://' + host + ':' + port + '/db/data/transaction/commit';

// --- runtime tuning (may be overridden by CLI args, see entry point) ------
var limit = 200;          // lines buffered per batch before flushing to Neo4j
var concurrency = 200;    // async.queue worker count
var action = process.argv[2];         // path to the input .bz2 file
var defLimit = process.argv[3];       // optional override for `limit`
var defConcurrency = process.argv[4]; // optional override for `concurrency`

// Guarded: the original called action.replace() unconditionally, which threw
// a TypeError before the `typeof action` check when no CLI argument was given.
var _ntfile = (typeof action !== 'undefined') ? action.replace('.bz2', '') : undefined;

// Accumulates generated Cypher statements until the next batch commit.
var queryArray = [];

/**
 * Send a batch of Cypher statements to Neo4j's transactional HTTP endpoint
 * as a single auto-committed transaction.
 *
 * @param {string} statements - Cypher statement text (may be several joined)
 * @param {function(Error, Object)} callback - receives the request error
 *        (if any) and the parsed JSON response body from Neo4j
 */
function runCypherQuery(statements, callback) {

    // Renamed to `payload`: the original re-declared `var statements`,
    // shadowing the parameter of the same name.
    var payload = { "statements": [ {
        "statement" : statements
    } ] };

    request.post({
        uri: httpUrlForTransaction,
        json: payload,
        headers : {
            "Authorization" : auth
        }
    },
    function (err, res, body) {
        callback(err, body);
    });
}

/**
 * Run a batch of extracted URLs through `dbpExt` with a bounded worker
 * queue, pushing each generated Cypher statement onto the shared
 * `queryArray`. Calls `cb` once the whole batch has been processed.
 *
 * @param {Array<string>} item - batch of resource URLs
 * @param {function(Error=)} cb - completion callback (never receives an error)
 */
var carrier = function(item, cb) {

    // BUG FIX: async.queue never fires `drain` if nothing is pushed, so an
    // empty batch left `cb` uncalled forever. Short-circuit instead.
    if (!item || item.length === 0) {
        cb();
        return;
    }

    var q = async.queue(function (task, callback) {

        dbpExt.run(task, function(err, querString) {
            if (err) {
                // Failed items are skipped, but surface the error instead
                // of silently discarding it as the original did.
                console.log("dbpExt error: ", err);
                callback();
            } else {
                queryArray.push(querString);
                callback();
            }
        });
    }, concurrency);

    q.drain = function() {
        var message = 'finished processing '+item.length+" number of queries ";
        message = message + "with concurrency of "+concurrency+" processe(s)";
        console.log(message);
        cb();
    };

    // `var` keeps the loop index local; the original leaked a global `i`.
    for (var i in item) {
        q.push(item[i]);
    }
};


/**
 * Callback for the bzip2 decompression child process. On success, streams
 * the decompressed .nt file line by line, extracts <...> URIs, batches them
 * (`limit` per batch), runs each batch through `carrier`, and commits the
 * accumulated Cypher statements to Neo4j between batches.
 *
 * @param {Error|null} error - exec error from the bzip2 child process
 * @param {string} stdout
 * @param {string} stderr
 */
var exCallback = function(error, stdout, stderr){
    if (error !== null) {
        console.log("error: \n",error);
    } else {
        var itemJson = [];
        var rstream;  // declared with var (was an implicit global)

        // Flush the current batch: generate Cypher via `carrier`, commit it
        // to Neo4j, clear the shared buffers, then call `done`. This helper
        // replaces the near-identical logic duplicated in the mapSync and
        // 'close' handlers of the original.
        var flushBatch = function(done) {
            console.log(itemJson);
            carrier(itemJson, function(err) {
                if (err) {
                    console.log("reply status: ", err);
                }

                var querString = queryArray.join(" ");
                runCypherQuery(querString, function (err, resp) {
                    if (err) {
                        // BUG FIX: the original invoked an undefined
                        // `callback(err)` here, throwing a ReferenceError on
                        // any Neo4j failure; report and continue instead.
                        console.log("cypher error: ", err);
                    } else {
                        console.log(querString);
                    }
                    queryArray = [];
                    itemJson = [];
                    done();
                });
            });
        };

        rstream = fs.createReadStream(_ntfile)
        .pipe(es.split())
        .pipe(es.mapSync(function(line) {
            // Pause until this line (and, on a full batch, the batch commit)
            // has been handled, to bound memory usage on 1-5 GB inputs.
            rstream.pause();
            line = line.toString();

            var urlArrays = line.split(" ");
            var res_urls = [];
            for (var i in urlArrays) {
                var urlSearch = urlArrays[i].match(/<(.*?)>/);
                if (urlSearch != null) { res_urls.push(urlSearch[1]); }
            }

            // Lines without any <...> URI (e.g. the trailing empty line
            // emitted by es.split) are skipped; the original pushed
            // `undefined` into the batch for them.
            if (res_urls.length === 0) {
                rstream.resume();
                return;
            }

            // BUG FIX: the original dropped the line that arrived while the
            // batch was already full. Push first, then flush at the limit.
            itemJson.push(res_urls[0]);
            if (itemJson.length < limit) {
                rstream.resume();
            } else {
                flushBatch(function() {
                    rstream.resume();
                });
            }
        })
        .on('error', function(err){
            console.log('Error while reading file.');
            console.log(err);
        })
        .on('end', function() {
            console.log('end na!!! <3');
            // Remove the decompressed copy once streaming has finished
            // (the original .bz2 is kept because bzip2 ran with -k).
            fs.unlink(_ntfile, function(err){
                if(err){
                    console.log('file was not deleted');
                }else{
                    console.log('uploading of files is done');
                }
            });
        })
        .on('close', function() {
            // Commit whatever remains in the final, partial batch.
            flushBatch(function() {});
        }));
    }
};

// --- CLI entry point ------------------------------------------------------
// Usage: node <script> <file.bz2> [limit] [concurrency]
if(typeof(action) != "undefined") {

    if(typeof(defLimit) != 'undefined') {
        // parseInt: process.argv values are strings; the original assigned
        // the raw string to `limit`, so later length comparisons coerced types.
        limit = (!isNaN(defLimit)) ? parseInt(defLimit, 10) : limit;
    }

    if(typeof(defConcurrency) != 'undefined') {
        concurrency = (!isNaN(defConcurrency)) ? parseInt(defConcurrency, 10) : concurrency;
    }

    // -d decompress, -k keep the original archive; exCallback streams the
    // resulting .nt file once decompression completes.
    var cli_code = "bzip2 -dk " + action;
    var child = exec(cli_code, exCallback);  // `var` added: was an implicit global
}
//代码
var fs=需要('fs');
var util=require('util');
var stream=require(‘stream’);
var es=要求(“事件流”);
var async=require('async');
var请求=要求(“请求”);
var neo4j=require('node-neo4j');
var exec=require('child_process')。exec;
var util=require('util');
var dbPediaExtractor=require('./dbPediaExtractor');
var dbpExt=new dbPediaExtractor();
var host='localhost';
var端口=7474;
var username=“neo4j”;
var password=“12345”;
var auth=“Basic”+新缓冲区(用户名+:“+密码).toString(“base64”);
var httpUrlForTransaction='http://'+host+':'+port+'/db/data/transaction/commit';
var限值=200;
var并发=200;
var action=process.argv[2];
var defLimit=process.argv[3];
var defConcurrency=process.argv[4];
var _ntfile=action.replace('.bz2','');
变量queryArray=[];
函数runCypherQuery(语句、回调){
var语句={“语句”:[{
“声明”:声明
} ] };
请寄({
uri:httpUrlForTransaction,
json:语句,
标题:{
“授权”:授权
}
},
功能(错误、恢复、正文){
回调(err,body);
})
};
var载体=功能(项目,cb){
var q=async.queue(函数(任务、回调){
run(任务,函数(err,querystring){
如果(错误){
回调();
}否则{
queryArray.push(querystring);
回调();
}
});
},并发性);
q、 drain=函数(){
var message='finished processing'+item.length+“查询数”;
message=message+,并发性为“+concurrency+”进程;
控制台日志(消息);
cb();
};
第(i)项{
q、 推动(第[i]项);
}
};
var exCallback=函数(错误、标准输出、标准输出){
如果(错误!==null){
日志(“错误:\n”,错误);
}否则{
var计数器=0;
var itemJson=[];
rstream=fs.createReadStream(\u ntfile)
.pipe(es.split())
.pipe(es.mapSync(函数(行){
restream.pause();
line=line.toString();
urlArrays=line.split(“”);
var res_url=新数组();
如果(URL数组){
for(URL数组中的i){
var urlSearch=urlArrays[i]。匹配(//);
如果(urlSearch!=null){res_url.push(urlSearch[1]);};
}
};
if(itemJson.lengthconsole.log('end na!!!您正在每行之后暂停流。您应该在
es.split()
之后添加一个
es.mapSync()
,以堆叠一批20个(您可以堆叠更多)暂停流之前的行数:出于好奇,处理您的一个文件需要多少时间?嗨@姗姗,谢谢!通常我的文件有100万行,流每秒处理1000-2000行。好的,您尝试过处理批处理吗?它会加快代码速度吗?@姗姗,是的!:)谢谢@姗姗!