Javascript: Writing a large number of strings from memory to a file using streams in node.js causes high RAM usage

Tags: javascript, node.js

I'm trying to write millions of strings to a file using Node.js streams, but the RAM usage goes up to 800MB in the process:

const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

paths.on('path', function(path, stat) {
  wstream.write(`"${path}",`)
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})
I also tried writing the file like this:

...
paths.on('path', function(path, stat) {
  fs.writeFileSync('C:/test/file.txt', path)
})
...
I also tried the sync version:

walkdir.sync(dir, options, callback)

function callback(path) {
  let res = wstream.write(`"${path}",`)
  if (!res) {
    wstream.once('drain', callback)
  }
  else {
    callback()
  }
}
But both of these approaches produce the same result: RAM usage climbs to 500-800MB.

I also tried the following approach. The RAM usage stays at ~100MB the whole time, but it doesn't really work: it writes 412KB into the file and then keeps using CPU while nothing else happens (the other methods finish writing the file within 1-2 minutes).

Questions

  • How do I make sure the streams work as intended (low memory usage)?

  • How can I compress (gzip) the file while it is being written? Or can that only be done after the writing is finished?


This is because you are doing things asynchronously without any limits. Each path creates a new event for paths.on('path', ...), so all the paths are loaded into the event loop much faster than they can be processed, which is what causes the spike in memory. You need to limit how many paths are written at a time.
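
In its simplest form, limiting means honouring the write stream's backpressure signal. Below is a minimal sketch of that idea, assuming the walkdir emitter's pause()/resume() methods that the full example further down also relies on:

paths.on('path', function (path) {
  // write() returns false once the stream's internal buffer is full
  if (!wstream.write(`"${path}",`)) {
    paths.pause()                                  // stop emitting new paths
    wstream.once('drain', () => paths.resume())    // continue once the buffer has flushed
  }
})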

You could limit it by using walkdir.sync, but that means you can only process one path at a time. Also, depending on how you implement it, you might still end up discovering paths faster than you can write them to the stream.

A more flexible solution is to keep track of how many paths are being processed concurrently and pause the stream once the limit is reached:

const fs = require('fs')
const walkdir = require('walkdir')

let options = {
  "max_depth": 0,
  "track_inodes": true
}

let dir = "C:/"
let paths = walkdir(dir, options)
var wstream = fs.createWriteStream('C:/test/file.txt')
wstream.write('[')

const maxPaths = 20; // Maximum amount of concurrent paths allowed to process
let currentPaths = 0; // Current amount of concurrent paths being processed
let deferredPaths = []; // If we somehow exceed the limit, store the excess paths here for later processing. This might not be necessary, depending on how walkdir implements their pause function

const finishPathFlush = () => {
  if (deferredPaths.length > 0) {
    // Process any paths in the deferred queue
    wstream.write('"' + deferredPaths.pop() + '",', finishPathFlush);
  } else {
    // No more work to do, resume walkdir
    --currentPaths;
    paths.resume();
  }
}

paths.on('path', function(path, stat) {
  if (currentPaths < maxPaths) {
    // We have room to process this path
    if (++currentPaths === maxPaths) {
      // If we reach the limit pause walkdir
      paths.pause();
    }
    wstream.write(`"${path}",`, finishPathFlush)
  } else {
    // Got too many paths, defer this path
    deferredPaths.push(path);
  }
})

paths.on('end', function(path, stat) {
  wstream.write(']')
  wstream.end()

  // Compressing the file after it's written:
  const gzip = require('zlib').createGzip()
  const inp = fs.createReadStream('C:/test/file.txt')
  const out = fs.createWriteStream('C:/test/file.txt.gz')
  inp.pipe(gzip).pipe(out)
})
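
If compressing while writing is what you want (the second question), the file does not have to be re-read afterwards: write into a gzip Transform that is piped into the destination file, so the data is compressed as it is written. A minimal sketch under the same assumptions as above (same walkdir options and output path):

const fs = require('fs')
const zlib = require('zlib')
const walkdir = require('walkdir')

const paths = walkdir('C:/', { "max_depth": 0, "track_inodes": true })
const gzip = zlib.createGzip()
gzip.pipe(fs.createWriteStream('C:/test/file.txt.gz'))  // compressed bytes go straight to disk

gzip.write('[')
paths.on('path', (path) => {
  gzip.write(`"${path}",`)  // compressed on the fly; backpressure handling omitted for brevity
})
paths.on('end', () => {
  gzip.write(']')
  gzip.end()  // flushes the remaining compressed data and ends the destination stream
})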

You can implement the whole logic without any external dependencies, so you can see exactly where to optimize. Here is a minimal implementation you can tweak:

const fs = require('fs');
const path = require('path');
const zlib = require('zlib');
const stream = require('stream');

// Recursive walk file system
function walk(dir, str, busy) {
    busy.inc();
    fs.readdir(dir, (e, c) => {
        if (!e) {
            c.forEach(f => {
                const p = path.join(dir, f);
                busy.inc();
                fs.stat(p, (e, s) => {
                    if (!e && s.isDirectory()) {
                        walk(p, str, busy);
                    }
                    str.write(p + "\n");
                    busy.dec();
                });
            });
        }
        busy.dec();
    });
}

// Scan FS and write to file
async function scan(dir, dest) {
    return new Promise((resolve) => {
        const gzStr = zlib.createGzip();
        const destStr = fs.createWriteStream(dest);

        let count = 0;
        const busy = {
            inc: () => count++,
            dec: () => {
                count--;
                if (count < 1) {
                    process.nextTick(() => {
                        gzStr.end();
                        gzStr.once('finish', resolve);
                    });
                }
            }
        };

        walk(dir, gzStr, busy);
        gzStr.pipe(destStr);
    });
}

// Test above code
(async () => {
    // Save gzipped
    await scan(__dirname, './files.txt.gz');

    // Gunzip to verify
    const unzipped = fs.createWriteStream('./files.txt');
    fs.createReadStream('./files.txt.gz').pipe(zlib.createGunzip()).pipe(unzipped);

    // End 
    unzipped.on('close', () => console.log('done'));
})();


Comments:

traktor53: Node does not automatically flush output streams when they are written to. One potential workaround, a Transform stream that supports a flush method, is lacking documented examples :-(

OP: @traktor53 Yes, the documentation on this is lacking and I'm not sure how to create the transform. I tried creating a readable stream with const readable = require('stream').Readable, emitting the paths from inside that emitter, and then writing like readable.on('data', (path) => { /* write here */ }), hoping that once it is readable/writable it would drain automatically. Still no luck; I guess I'm doing something wrong in Node.

S.D.: Directory traversal isn't really a "stream". So far, every walkdir module on npm has to keep an internal buffer (see link).

OP: @S.D. Thanks for the info.

OP: Thanks for the answer. Unfortunately, neither of the suggested solutions worked, which is strange. Even if I use walkdir.sync it still uses 800MB.
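
For reference, the Transform-with-flush idea mentioned in the comments could look roughly like the sketch below (a hypothetical example, not code from this thread). _flush() runs once after the last chunk has been written, which is a natural place to emit the closing bracket:

const { Transform } = require('stream')

// Hypothetical ArrayWrapper: turns incoming path strings into a JSON-like array
class ArrayWrapper extends Transform {
  constructor() {
    super()
    this.first = true
  }
  _transform(chunk, encoding, callback) {
    const prefix = this.first ? '[' : ','
    this.first = false
    callback(null, prefix + '"' + chunk.toString() + '"')
  }
  _flush(callback) {
    // Called once before the stream ends; emit the closing bracket here
    callback(null, ']')
  }
}

// Usage sketch (assumes the same walkdir emitter and output path as above):
// const wrapper = new ArrayWrapper()
// wrapper.pipe(require('zlib').createGzip()).pipe(require('fs').createWriteStream('C:/test/file.txt.gz'))
// paths.on('path', (path) => wrapper.write(path))
// paths.on('end', () => wrapper.end())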