JavaScript Node.js readStream for large data processing

I'm having trouble creating a line-by-line Node.js method to process large Nessus XML files without high RAM usage. In its current form, it saves the data to MongoDB correctly, but RAM usage climbs steadily and the process errors out above ~1.5 GB.

I tried using .pause() on the readStream, but I must have implemented it incorrectly, because it never seemed to actually pause the stream.
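For reference, this is roughly the pause/resume pattern I was attempting (a minimal sketch, not my actual code, with a hypothetical doAsyncParseAndSave standing in for the parse-and-save work; note that readline can still emit 'line' events for data it has already buffered internally after pause() is called, which may be why it never seemed to take effect):

// Sketch of the pause/resume pattern. rl.pause() pauses the underlying
// input stream, but lines readline has already buffered can still fire.
var fs = require('fs');
var readline = require('readline');
var rl = readline.createInterface({ input: fs.createReadStream('test.nessus') });

rl.on('line', (line) => {
  if (/<\/ReportHost/i.test(line)) {
    rl.pause(); // stop reading while the async parse/save runs
    doAsyncParseAndSave(line, () => { // hypothetical async handler
      rl.resume(); // continue reading once the save completes
    });
  }
});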

Here's the code:

// LR.JS Imports
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream('test.nessus');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
var buff = false;
var parseString = require('xml2js').parseString;
var buffStream = '';
//Mongoose Imports
var mongoose = require('mongoose');
var ReportHostDoc = require('./schemas/report-host.model.js');
var ReportItemDoc = require('./schemas/report-item.model.js');
var PluginDetailDoc = require('./schemas/plugin-detail.model.js');
mongoose.Promise = require('bluebird');
// Mongoose Connect
mongoose.connect('mongodb://localhost/test');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', () => {
  // Create counters for _taskCheck
  var reportHostCounter = 0;
  var reportHostSaveCounter = 0;
  rl.on('line', (line) => {
    // process line here
    if (/[<]ReportHost/.test(line)) {
      buff = true;
      reportHostCounter++;
    }
    if (buff == true) {
      buffStream += line + '\n';
    }
    if (/[<][/]ReportHost/i.test(line)) {
      buff = false; // changed to = not == 9/6
      // XML2JS Parse ReportHost Buffstream
      parseString(buffStream, (err, result) => {
        // Loop through ReportHost properties to reliably find IP
        var reportHostIP = '';
        var reportHostOS = '';
        result.ReportHost.HostProperties[0].tag.forEach((entry) => {
          if (entry.$.name === 'host-ip') {
            reportHostIP = entry._;
          }
          if (entry.$.name === 'operating-system') {
            reportHostOS = entry._;
          }
        });
        // Save Report Host Document
        var host = new ReportHostDoc({
          hostname: result.ReportHost.$.name,
          ip: reportHostIP,
          os: reportHostOS,
          high: 0,
          critical: 0
        });
        // Process Each Report Item
        result.ReportHost.ReportItem.forEach((entry) => {
          var cvssScore = '';
          if (entry.cvss_base_score) {
            cvssScore = JSON.stringify(entry.cvss_base_score).slice(2, 5)
          } else {
            cvssScore = 0;
          }
          var item = new ReportItemDoc({
            itemName: entry.$.pluginName,
            pluginID: entry.$.pluginID,
            ipAddress: reportHostIP,
            exploitAvailable: entry.exploit_available,
            cvssBaseScore: cvssScore,
            pluginPublishedDate: entry.plugin_publication_date,
            pluginModifiedDate: entry.plugin_modification_date,
            description: entry.description
          })
          if (item.cvssBaseScore >= 7 && item.cvssBaseScore < 10) {
            host.high++;
          }
          if (item.cvssBaseScore == 10) {
            host.critical++;
          }
          item.save((err, item) => {
            if (err) return console.log(err);
          })
        });
        host.save((err, host) => {
          if (err) return console.log(err);
          reportHostSaveCounter++;
        });
      })
      buffStream = ''; // Empty buffer for next report host
    }
  });
  rl.on('close', () => { // Read Stream Finished
    console.log('Log Parse finished!');
    var _taskCheck = setInterval(() => { // Async loop waits for all tasks to finish
      if (reportHostCounter == reportHostSaveCounter) {
        clearInterval(_taskCheck);
        var pluginCounter = 0;
        var pluginSaveCounter = 0;
        ReportItemDoc.distinct('pluginID', (err, ids) => {
          ids.forEach((id) => {
            pluginCounter++;
            ReportItemDoc.findOne({
              'pluginID': id
            }, (err, plugin) => {
              ReportItemDoc.count({
                'pluginID': id
              }, (err, count) => {
                var pluginSeverity = '';
                var cvss = plugin.cvssBaseScore;
                if (cvss >= 7 && cvss < 10) {
                  pluginSeverity = 'High';
                }
                if (cvss == 10) {
                  pluginSeverity = 'Critical';
                }
                var item = new PluginDetailDoc({ // added var; was an implicit global
                  pluginName: plugin.itemName,
                  pluginID: id,
                  severity: pluginSeverity,
                  quantity: count,
                  description: plugin.description
                })
                item.save((err, host) => {
                  if (err) return console.log(err);
                  pluginSaveCounter++;
                });
              })
            });
          })
        })
        var _pluginTaskCheck = setInterval(() => { // Async loop waits for all tasks to finish
          if (pluginCounter == pluginSaveCounter) {
            clearInterval(_pluginTaskCheck);
            mongoose.connection.close();
          }
        }, 100);
      }
    }, 100);
  });
});

I'd suggest abandoning the current plan and checking out a streaming XML parser that does all of this for you. I can't recommend a specific module... there are several... but they are very good at exactly what you're doing: pulling individual objects out of an XML stream.

Thanks for the quick reply! I'm actually already using an XML parser. The problem is that I can't feed the whole file into it, because the file is too large. I'm parsing each element piece by piece, asynchronously. Evidently, though, I'm parsing in an inefficient way.

Not just an XML parser... a streaming XML parser.
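For anyone finding this later, here is a minimal sketch of the streaming-parser approach the comments suggest, assuming the sax npm package (any SAX-style streaming parser follows the same event pattern). Instead of buffering each ReportHost as a string and re-parsing it with xml2js, it reacts to open/close tag events, so only one host is ever held in memory:

// Sketch: SAX-style streaming parse of a Nessus file (assumes `sax` is installed).
var fs = require('fs');
var sax = require('sax');

var saxStream = sax.createStream(true); // strict mode: tag case is preserved
var currentHost = null;
var currentItem = null;

saxStream.on('opentag', (node) => {
  if (node.name === 'ReportHost') {
    currentHost = { name: node.attributes.name, items: [] };
  } else if (node.name === 'ReportItem' && currentHost) {
    currentItem = {
      pluginID: node.attributes.pluginID,
      pluginName: node.attributes.pluginName
    };
  }
});

saxStream.on('closetag', (name) => {
  if (name === 'ReportItem' && currentHost) {
    currentHost.items.push(currentItem);
    currentItem = null;
  } else if (name === 'ReportHost') {
    // Save currentHost to MongoDB here; memory stays flat because
    // the finished host is released before the next one is built.
    currentHost = null;
  }
});

saxStream.on('error', (err) => console.error(err));

fs.createReadStream('test.nessus').pipe(saxStream);

If the Mongo saves become the bottleneck, the source stream can be paused before each save and resumed in the save callback, as in the earlier pause/resume sketch, so pending writes can't accumulate.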