Javascript Node.js readStream用于大数据处理
我很难写出一个逐行处理大型 Nessus XML 文件的 Node.js 方法，同时避免过高的内存占用。目前的代码能够正确地把数据保存到 MongoDB，但内存（RAM）占用不断增长，超过约 1.5GB 时程序就会出错。我尝试在 readStream 上使用 .pause()，但我一定是实现得不对，因为它似乎从未真正暂停流。代码如下：
// LR.JS Imports
// Core modules for reading the .nessus file line by line.
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
// Read stream over the (potentially very large) Nessus XML export.
var instream = fs.createReadStream('test.nessus');
// Dummy output stream; readline requires one but nothing is written to it.
var outstream = new stream;
// Legacy two-argument createInterface signature (input, output).
var rl = readline.createInterface(instream, outstream);
// buff: true while we are inside a <ReportHost>...</ReportHost> element.
var buff = false;
var parseString = require('xml2js').parseString;
// buffStream: accumulates the lines of the current ReportHost fragment.
var buffStream = '';
//Mongoose Imports
var mongoose = require('mongoose');
// Project-local schema models (one collection per document type).
var ReportHostDoc = require('./schemas/report-host.model.js');
var ReportItemDoc = require('./schemas/report-item.model.js');
var PluginDetailDoc = require('./schemas/plugin-detail.model.js');
// Use bluebird promises instead of mongoose's deprecated mpromise.
mongoose.Promise = require('bluebird');
// Mongoose Connect
mongoose.connect('mongodb://localhost/test');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', () => {
  // Completion counters: the 'close' handler polls these to know when every
  // asynchronous save() kicked off by the 'line' handler has settled.
  var reportHostCounter = 0;     // <ReportHost> elements encountered
  var reportHostSaveCounter = 0; // host save() callbacks that have fired

  rl.on('line', (line) => {
    // Buffer every line between <ReportHost ...> and </ReportHost> so each
    // host can be parsed as one self-contained XML fragment.
    if (/[<]ReportHost/.test(line)) {
      buff = true;
      reportHostCounter++;
    }
    if (buff == true) {
      buffStream += line + '\n';
    }
    if (/[<][/]ReportHost/i.test(line)) {
      buff = false;
      // xml2js invokes this callback synchronously for an in-memory string,
      // so resetting buffStream immediately after the call is safe.
      parseString(buffStream, (err, result) => {
        if (err) return console.log(err); // skip unparseable fragments
        // HostProperties tags are unordered; scan them for IP and OS.
        var reportHostIP = '';
        var reportHostOS = '';
        result.ReportHost.HostProperties[0].tag.forEach((entry) => {
          if (entry.$.name === 'host-ip') {
            reportHostIP = entry._;
          }
          if (entry.$.name === 'operating-system') {
            reportHostOS = entry._;
          }
        });
        // One ReportHost document per host; severity tallies filled below.
        var host = new ReportHostDoc({
          hostname: result.ReportHost.$.name,
          ip: reportHostIP,
          os: reportHostOS,
          high: 0,
          critical: 0
        });
        // One ReportItem document per finding on this host.
        result.ReportHost.ReportItem.forEach((entry) => {
          // FIX: parse the score numerically. The original
          // JSON.stringify(...).slice(2, 5) truncated three-character
          // scores such as "10.0" to "10." and returned a string.
          var cvssScore = entry.cvss_base_score
            ? parseFloat(entry.cvss_base_score)
            : 0;
          // FIX: declared with var — `item` was an implicit global.
          var item = new ReportItemDoc({
            itemName: entry.$.pluginName,
            pluginID: entry.$.pluginID,
            ipAddress: reportHostIP,
            exploitAvailable: entry.exploit_available,
            cvssBaseScore: cvssScore,
            pluginPublishedDate: entry.plugin_publication_date,
            pluginModifiedDate: entry.plugin_modification_date,
            description: entry.description
          });
          if (item.cvssBaseScore >= 7 && item.cvssBaseScore < 10) {
            host.high++;
          }
          if (item.cvssBaseScore == 10) {
            host.critical++;
          }
          item.save((err) => {
            if (err) return console.log(err);
          });
        });
        host.save((err) => {
          if (err) console.log(err);
          // FIX: count errored saves too; otherwise a single failed save
          // left reportHostSaveCounter short and the polling loop in the
          // 'close' handler below never terminated.
          reportHostSaveCounter++;
        });
      });
      buffStream = ''; // Empty buffer for next report host
    }
  });

  rl.on('close', () => { // Read Stream Finished
    console.log('Log Parse finished!');
    var _taskCheck = setInterval(() => { // Async loop waits for all tasks to finish
      if (reportHostCounter == reportHostSaveCounter) {
        clearInterval(_taskCheck);
        var pluginCounter = 0;
        var pluginSaveCounter = 0;
        // Aggregate a PluginDetail document per distinct pluginID.
        ReportItemDoc.distinct('pluginID', (err, ids) => {
          if (err) return console.log(err);
          ids.forEach((id) => {
            pluginCounter++;
            ReportItemDoc.findOne({
              'pluginID': id
            }, (err, plugin) => {
              if (err || !plugin) {
                if (err) console.log(err);
                pluginSaveCounter++; // keep the completion count moving
                return;
              }
              ReportItemDoc.count({
                'pluginID': id
              }, (err, count) => {
                if (err) {
                  console.log(err);
                  pluginSaveCounter++;
                  return;
                }
                var pluginSeverity = '';
                var cvss = plugin.cvssBaseScore;
                if (cvss >= 7 && cvss < 10) {
                  pluginSeverity = 'High';
                }
                if (cvss == 10) {
                  pluginSeverity = 'Critical';
                }
                // FIX: declared with var — was an implicit global.
                var item = new PluginDetailDoc({
                  pluginName: plugin.itemName,
                  pluginID: id,
                  severity: pluginSeverity,
                  quantity: count,
                  description: plugin.description
                });
                item.save((err) => {
                  if (err) console.log(err);
                  // FIX: count errored saves too so the interval below
                  // can always terminate.
                  pluginSaveCounter++;
                });
              });
            });
          });
          // FIX: start this poll only after pluginCounter has reached its
          // final value. The original created the interval before the
          // async distinct() callback ran, so its first tick saw
          // 0 == 0 and closed the connection before any PluginDetail
          // document was saved.
          var _pluginTaskCheck = setInterval(() => { // Async loop waits for all tasks to finish
            if (pluginCounter == pluginSaveCounter) {
              clearInterval(_pluginTaskCheck);
              mongoose.connection.close();
            }
          }, 100);
        });
      }
    }, 100);
  });
});
// LR.JS 导入
var fs = require('fs');
var readline = require('readline');
var stream = require('stream');
var instream = fs.createReadStream('test.nessus');
var outstream = new stream;
var rl = readline.createInterface(instream, outstream);
var buff = false;
var parseString = require('xml2js').parseString;
var buffStream = '';
// Mongoose 导入
var mongoose = require('mongoose');
var ReportHostDoc = require('./schemas/report-host.model.js');
var ReportItemDoc = require('./schemas/report-item.model.js');
var PluginDetailDoc = require('./schemas/plugin-detail.model.js');
mongoose.Promise = require('bluebird');
// Mongoose 连接
mongoose.connect('mongodb://localhost/test');
var db = mongoose.connection;
db.on('error', console.error.bind(console, 'connection error:'));
db.once('open', () => {
  // 为 _taskCheck 创建计数器
  var reportHostCounter = 0;
  var reportHostSaveCounter = 0;
  rl.on('line', (line) => {
    // 在此处理每一行
    if (/[<]ReportHost/.test(line)) {
      buff = true;
      reportHostCounter++;
    }
    if (buff == true) {
      buffStream += line + '\n';
    }
    if (/[<][/]ReportHost/i.test(line)) {
      buff = false;
      // 用 XML2JS 解析 ReportHost 的 buffStream
      parseString(buffStream, (err, result) => {
        // 遍历 ReportHost 的属性以可靠地找到 IP
        var reportHostIP = '';
        var reportHostOS = '';
        result.ReportHost.HostProperties[0].tag.forEach((entry) => {
          if (entry.$.name === 'host-ip') {
            reportHostIP = entry._;
          }
          if (entry.$.name === 'operating-system') {
            reportHostOS = entry._;
          }
        });
        // 保存 ReportHost 文档
        var host = new ReportHostDoc({
          hostname: result.ReportHost.$.name,
          ip: reportHostIP,
          os: reportHostOS,
          high: 0,
          critical: 0
        });
        // 处理每个 ReportItem
        result.ReportHost.ReportItem.forEach((entry) => {
          var cvssScore = '';
          if (entry.cvss_base_score) {
            cvssScore = JSON.stringify(entry.cvss_base_score).slice(2, 5)
          } else {
            cvssScore = 0;
          }
          var item = new ReportItemDoc({
            itemName: entry.$.pluginName,
            pluginID: entry.$.pluginID,
            ipAddress: reportHostIP,
            exploitAvailable: entry.exploit_available,
            cvssBaseScore: cvssScore,
            pluginPublishedDate: entry.plugin_publication_date,
            pluginModifiedDate: entry.plugin_modification_date,
            description: entry.description
          })
          if (item.cvssBaseScore >= 7 && item.cvssBaseScore < 10) {
            host.high++;
          }
          if (item.cvssBaseScore == 10) {
            host.critical++;
          }
          item.save((err, item) => {
            if (err) return console.log(err);
          })
        });
        host.save((err, host) => {
          if (err) return console.log(err);
          reportHostSaveCounter++;
        });
      })
      buffStream = ''; // 清空缓冲区，准备下一个 ReportHost
    }
  });
  rl.on('close', () => { // 读取流已完成
    console.log('Log Parse finished!');
    var _taskCheck = setInterval(() => { // 异步循环，等待所有任务完成
      if (reportHostCounter == reportHostSaveCounter) {
        clearInterval(_taskCheck);
        var pluginCounter = 0;
        var pluginSaveCounter = 0;
        ReportItemDoc.distinct('pluginID', (err, ids) => {
          ids.forEach((id) => {
            pluginCounter++;
            ReportItemDoc.findOne({
              'pluginID': id
            }, (err, plugin) => {
              ReportItemDoc.count({
                'pluginID': id
              }, (err, count) => {
                var pluginSeverity = '';
                var cvss = plugin.cvssBaseScore;
                if (cvss >= 7 && cvss < 10) {
                  pluginSeverity = 'High';
                }
                if (cvss == 10) {
                  pluginSeverity = 'Critical';
                }
                item = new PluginDetailDoc({
                  pluginName: plugin.itemName,
                  pluginID: id,
                  severity: pluginSeverity,
                  quantity: count,
                  description: plugin.description
                })
                item.save((err, host) => {
                  if (err) return console.log(err);
                  pluginSaveCounter++;
                });
              })
            });
          })
        })
        var _pluginTaskCheck = setInterval(() => { // 异步循环，等待所有任务完成
          if (pluginCounter == pluginSaveCounter) {
            clearInterval(_pluginTaskCheck);
            mongoose.connection.close();
          }
        }, 100);
      }
    }, 100);
  });
});
（回答）我建议放弃当前的方案，改用一个流式 XML 解析器，它能替您完成所有这些工作。我无法推荐某个特定的模块——可选的有好几个——但它们非常擅长您要做的事情：从 XML 流中逐个提取对象。（提问者）感谢您的快速回复！我实际上已经在使用 XML 解析器了。问题是文件太大，无法整体交给解析器解析，所以我在逐段异步解析每个元素；但显然我目前的做法效率不高。（回答）不只是一个普通的 XML 解析器……而是一个**流式** XML 解析器。