为什么在Node.js中阻塞文件I/O时异步性能较差?
从 Stack Overflow 上一些关于同步与异步的文章来看,异步调用应该具有较小的开销,或者在阻塞 I/O 操作方面比同步调用更快。我编写了一个小的基准测试,它会生成 4 个 256MB 到 1GB 的文件,以查看 fs.readFile 的性能。输出如下:
> makeFiles();
512+0 records in
512+0 records out
536870912 bytes (537 MB, 512 MiB) copied, 4.28077 s, 125 MB/s
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB, 1.0 GiB) copied, 8.45918 s, 127 MB/s
256+0 records in
256+0 records out
268435456 bytes (268 MB, 256 MiB) copied, 1.96678 s, 136 MB/s
512+0 records in
512+0 records out
536870912 bytes (537 MB, 512 MiB) copied, 4.32488 s, 124 MB/s
undefined
> syncTest();
Sync version took 1055.9131410121918ms
undefined
> asyncTest();
Promise { <pending> }
> Async version took 6991.523499011993ms
因此,异步版本似乎比同步版本慢约 7 倍。如何解释这种减速?什么时候又应该使用同步版本呢?
Repl.it链接:
系统:Arch Linux 5.5.4-arch1-1 上的 Node 13.9.0。有关更快的版本,请参见下面对版本 2 的编辑。版本 1:仅供参考,除了上面我的所有评论之外,以下是我让异步版本达到最快速度的方法:
// Fastest async variant from the answer: stream every file concurrently,
// buffering chunks per file and concatenating them only once at the end.
async function asyncTestStreamParallel(files) {
  const startTime = performance.now();
  const pending = files.map((filename) => new Promise((resolve, reject) => {
    // 640 KB highWaterMark: fewer, larger reads per stream.
    const stream = fs.createReadStream(filename, { highWaterMark: 64 * 1024 * 10 });
    const chunks = [];
    stream
      .on('data', (chunk) => chunks.push(chunk))
      .on('end', () => resolve(Buffer.concat(chunks)))
      .on('error', reject);
  }));
  // All streams run concurrently; wait for every file to finish.
  await Promise.all(pending);
  console.log(`Async stream parallel version took ${performance.now() - startTime}ms`);
}
结果如下:
下面是我在 Windows 10、Node v12.13.1 上的结果:
node --expose_gc temp
Sync version took 1175.2680000066757ms
Async version took 2315.0439999699593ms
Async stream version took 1600.0085990428925ms
Async stream parallel version took 1111.310200035572ms
Async serial version took 4387.053400993347ms
注意,我稍微修改了方案,将文件名数组传递到每个测试中,而不是每次都创建文件名,以便集中创建文件
帮助我加快速度的是:
使用较大的highWaterMark(可能是流缓冲区大小)
在一个数组中收集数据,然后在最后连接它,这大大减少了峰值内存消耗和GC工作。
允许循环中的不同文件彼此并行运行
通过这些更改,它的速度与同步版本大致相同,有时稍慢,有时大致相同
我还在每个测试运行之间设置了2秒的延迟,并强制运行垃圾收集器,以确保GC运行不会干扰我的结果
这是我的完整脚本,可以在任何平台上运行。请注意,必须使用 --expose_gc 命令行参数,例如:node --expose_gc temp.js:
以及与这些匹配的代码:
// Run this with the --expose_gc command line option
// (delay() calls global.gc() between benchmark runs).
const {performance} = require('perf_hooks');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path')
// Sizes of the generated test files; four files totalling ~2.25 GB.
const sizes = [512, 1024, 256, 512]; // file sizes in MB
const data = "0123456789\n";
// All generated files live under ./bigfile next to this script.
const testDir = path.join(__dirname, "bigfile");
// Generate one test file per entry in `sizes` (size in MB) under testDir
// and return the array of file paths created.  Exits the process on any
// write failure.
function makeFiles() {
  // make a bigger string to make fewer disk writes
  const bData = [];
  for (let i = 0; i < 1000; i++) {
    bData.push(data);
  }
  const biggerData = bData.join("");
  try {
    fs.mkdirSync(testDir);
  } catch (e) {
    // Only "directory already exists" is expected here; anything else
    // (permissions, invalid path) must not be silently swallowed.
    if (e.code !== 'EEXIST') throw e;
  }
  const files = [];
  for (let i = 0; i < sizes.length; i++) {
    let targetLen = sizes[i] * 1024 * 1024;
    let f;
    try {
      let fname = `${path.join(testDir, "test")}-${i}.txt`;
      f = fs.openSync(fname, 'w');
      files.push(fname);
      let len = 0;
      // `data` is pure ASCII, so string length equals byte length here.
      while (len < targetLen) {
        fs.writeSync(f, biggerData);
        len += biggerData.length;
      }
    } catch (e) {
      console.log(e);
      process.exit(1);
    } finally {
      // Always release the file descriptor, even after a write error.
      if (f) fs.closeSync(f);
    }
  }
  return files;
}
// Delete every generated test file, then remove the (now empty) directory.
function clearFiles(files) {
  files.forEach((filename) => fs.unlinkSync(filename));
  fs.rmdirSync(testDir);
}
// Read an entire file with a single low-level fs.readSync call into an
// exact-size preallocated buffer, and return that buffer.
// Throws if the read comes up short of the stat'd size.
function readFileSync(filename) {
  let handle = fs.openSync(filename, "r");
  try {
    let stats = fs.fstatSync(handle);
    let buffer = Buffer.allocUnsafe(stats.size);
    let bytesRead = fs.readSync(handle, buffer, 0, stats.size, 0);
    if (bytesRead !== stats.size) {
      throw new Error("bytesRead not full file size");
    }
    // Previously the buffer was read and discarded; returning it keeps
    // the benchmark identical while letting callers use the data.
    return buffer;
  } finally {
    // Guarantee the descriptor is released even if the read throws.
    fs.closeSync(handle);
  }
}
// Read a file in one single read — async counterpart of readFileSync:
// open, stat, allocate an exact-size buffer, one read() call.
// Returns the buffer; throws if the read comes up short.
async function readFile(filename) {
  let handle = await fsp.open(filename, "r");
  try {
    let stats = await handle.stat();
    let buffer = Buffer.allocUnsafe(stats.size);
    let {bytesRead} = await handle.read(buffer, 0, stats.size, 0);
    if (bytesRead !== stats.size) {
      throw new Error("bytesRead not full file size");
    }
    return buffer;
  } finally {
    // Await the close: the original fire-and-forget close() left a
    // floating promise whose rejection would be unhandled.
    await handle.close();
  }
}
// Baseline benchmark: blocking fs.readFileSync on each file in turn.
function syncTest(files) {
  const startTime = performance.now();
  // Keep every buffer referenced until the timing line, as the original did.
  const results = files.map((filename) => fs.readFileSync(filename));
  console.log(`Sync version took ${performance.now() - startTime}ms`);
}
// Same benchmark as syncTest, but via the hand-rolled single-read helper.
function syncTestSingleRead(files) {
  const startTime = performance.now();
  for (const filename of files) {
    readFileSync(filename);
  }
  console.log(`Sync single read version took ${performance.now() - startTime}ms`);
}
// Launch every fs.promises.readFile at once, then await them together.
async function asyncTest(files) {
  const startTime = performance.now();
  const pending = files.map((filename) => fs.promises.readFile(filename));
  await Promise.all(pending);
  console.log(`Async version took ${performance.now() - startTime}ms`);
}
// Stream each file strictly one at a time, collecting chunks and
// concatenating once per file.
async function asyncTestStream(files) {
  const startTime = performance.now();
  for (const filename of files) {
    await new Promise((resolve, reject) => {
      const chunks = [];
      fs.createReadStream(filename, { highWaterMark: 64 * 1024 * 10 })
        .on('data', (chunk) => chunks.push(chunk))
        .on('close', () => resolve(Buffer.concat(chunks)))
        .on('error', reject);
    });
  }
  console.log(`Async stream version took ${performance.now() - startTime}ms`);
}
// Stream every file concurrently with a large (6.4 MB) highWaterMark;
// chunks are buffered per file and concatenated once at the end.
async function asyncTestStreamParallel(files) {
  const startTime = performance.now();
  const pending = files.map((filename) => new Promise((resolve, reject) => {
    const chunks = [];
    fs.createReadStream(filename, { highWaterMark: 64 * 1024 * 100 })
      .on('data', (chunk) => chunks.push(chunk))
      .on('end', () => resolve(Buffer.concat(chunks)))
      .on('error', reject);
  }));
  await Promise.all(pending);
  console.log(`Async stream parallel version took ${performance.now() - startTime}ms`);
}
// One open/stat/read/close cycle per file, serially, reusing a single
// buffer across files whenever it is already large enough.
async function asyncTestSingleReadSerial(files) {
  const startTime = performance.now();
  let buffer;
  for (let filename of files) {
    let handle = await fsp.open(filename, "r");
    try {
      let stats = await handle.stat();
      if (!buffer || buffer.length < stats.size) {
        buffer = Buffer.allocUnsafe(stats.size);
      }
      let {bytesRead} = await handle.read(buffer, 0, stats.size, 0);
      if (bytesRead !== stats.size) {
        throw new Error("bytesRead not full file size");
      }
    } finally {
      // Await the close: the original fire-and-forget close() left a
      // floating promise whose rejection would be unhandled.
      await handle.close();
    }
  }
  console.log(`Async single read serial version took ${performance.now() - startTime}ms`);
}
// Kick off one single-read per file and wait for all of them at once.
async function asyncTestSingleReadParallel(files) {
  const startTime = performance.now();
  const pending = files.map((filename) => readFile(filename));
  await Promise.all(pending);
  console.log(`Async single read parallel version took ${performance.now() - startTime}ms`);
}
// fs.promises.readFile, but awaited one file at a time — no I/O overlap.
async function asyncTestSerial(files) {
  const startTime = performance.now();
  const results = [];
  for (const filename of files) {
    const contents = await fs.promises.readFile(filename);
    results.push(contents);
  }
  console.log(`Async serial version took ${performance.now() - startTime}ms`);
}
// Pause for t ms between benchmarks, forcing a GC first so collector work
// from one test cannot bleed into the next.  Guarding the gc() call keeps
// delay() usable when the script is run without --expose_gc (the original
// threw a TypeError in that case).
function delay(t) {
  return new Promise((resolve) => {
    if (typeof global.gc === 'function') {
      global.gc();
    }
    setTimeout(resolve, t);
  });
}
// delay between each test to let any system stuff calm down
// Driver: build the fixture files once, run every benchmark variant with a
// 2 s cool-down pause before each, and always delete the files afterwards.
async function run() {
  const files = makeFiles();
  const tests = [
    syncTest,
    syncTestSingleRead,
    asyncTest,
    asyncTestSerial,
    asyncTestStream,
    asyncTestStreamParallel,
    asyncTestSingleReadSerial,
    asyncTestSingleReadParallel,
  ];
  try {
    for (const test of tests) {
      await delay(2000);
      await test(files);
    }
  } catch (e) {
    console.log(e);
  } finally {
    clearFiles(files);
  }
}
run();
有关更快的版本,请参见下面版本2的编辑 版本1 仅供参考,除了上面我的所有评论之外,以下是我最快获得异步版本的方法:
// Fastest async variant from the answer: stream all files in parallel,
// concatenating the collected chunks only once per file at the end.
async function asyncTestStreamParallel(files) {
const startTime = performance.now();
let results = [];
for (let filename of files) {
results.push(new Promise((resolve, reject) => {
// 640 KB highWaterMark: fewer, larger reads per stream.
const stream = fs.createReadStream(filename, {highWaterMark: 64 * 1024 * 10});
const data = [];
stream.on('data', chunk => {
data.push(chunk);
}).on('end', () => {
resolve(Buffer.concat(data));
}).on('error', reject);
}));
}
// All streams run concurrently; wait for every file to finish.
await Promise.all(results);
console.log(`Async stream parallel version took ${performance.now() - startTime}ms`);
}
结果如下:
下面是我在 Windows 10、Node v12.13.1 上的结果:
node --expose_gc temp
Sync version took 1175.2680000066757ms
Async version took 2315.0439999699593ms
Async stream version took 1600.0085990428925ms
Async stream parallel version took 1111.310200035572ms
Async serial version took 4387.053400993347ms
注意,我稍微修改了方案,将文件名数组传递到每个测试中,而不是每次都创建文件名,以便集中创建文件
帮助我加快速度的是:
使用较大的highWaterMark(可能是流缓冲区大小)
在一个数组中收集数据,然后在最后连接它,这大大减少了峰值内存消耗和GC工作。
允许循环中的不同文件彼此并行运行
通过这些更改,它的速度与同步版本大致相同,有时稍慢,有时大致相同
我还在每个测试运行之间设置了2秒的延迟,并强制运行垃圾收集器,以确保GC运行不会干扰我的结果
这是我的完整脚本,可以在任何平台上运行。请注意,必须使用 --expose_gc 命令行参数,例如:node --expose_gc temp.js:
以及与这些匹配的代码:
// Run this with the --expose_gc command line option
// (delay() calls global.gc() between benchmark runs).
const {performance} = require('perf_hooks');
const fs = require('fs');
const fsp = fs.promises;
const path = require('path')
// Sizes of the generated test files; four files totalling ~2.25 GB.
const sizes = [512, 1024, 256, 512]; // file sizes in MB
const data = "0123456789\n";
// All generated files live under ./bigfile next to this script.
const testDir = path.join(__dirname, "bigfile");
// Generate one test file per entry in `sizes` (size in MB) under testDir
// and return the list of file paths created.
function makeFiles() {
// make a bigger string to make fewer disk writes
const bData = [];
for (let i = 0; i < 1000; i++) {
bData.push(data);
}
const biggerData = bData.join("");
try {
fs.mkdirSync(testDir); // ignore errors if it already exists
} catch(e) {
// do nothing if it already exists
// NOTE(review): this also swallows unrelated failures (e.g. permissions);
// consider rethrowing when e.code !== 'EEXIST'.
}
const files = [];
for (let i = 0; i < sizes.length; i++) {
let targetLen = sizes[i] * 1024 * 1024;
let f;
try {
let fname = `${path.join(testDir, "test")}-${i}.txt`;
f = fs.openSync(fname, 'w');
files.push(fname);
let len = 0;
// `data` is pure ASCII, so string length equals byte length here.
while (len < targetLen) {
fs.writeSync(f, biggerData);
len += biggerData.length;
}
} catch(e) {
console.log(e);
process.exit(1);
} finally {
// Always release the file descriptor, even after a write error.
if (f) fs.closeSync(f);
}
}
return files;
}
// Remove the generated files and the (now empty) test directory.
function clearFiles(files) {
for (let filename of files) {
fs.unlinkSync(filename);
}
fs.rmdirSync(testDir);
}
// Synchronous single-read: stat the file, allocate an exact-size buffer,
// and read the whole file with one fs.readSync call.  The buffer is
// discarded — this helper exists purely to time the I/O.
function readFileSync(filename) {
let handle = fs.openSync(filename, "r");
try {
let stats = fs.fstatSync(handle);
let buffer = Buffer.allocUnsafe(stats.size);
let bytesRead = fs.readSync(handle, buffer, 0, stats.size, 0);
if (bytesRead !== stats.size) {
throw new Error("bytesRead not full file size")
}
} finally {
fs.closeSync(handle);
}
}
// read a file in one single read
// Async counterpart of readFileSync above, using fs.promises handles.
async function readFile(filename) {
let handle = await fsp.open(filename, "r");
try {
let stats = await handle.stat();
let buffer = Buffer.allocUnsafe(stats.size);
let {bytesRead} = await handle.read(buffer, 0, stats.size, 0);
if (bytesRead !== stats.size) {
throw new Error("bytesRead not full file size")
}
} finally {
// NOTE(review): close() returns a promise that is not awaited here,
// so a close failure would surface as an unhandled rejection.
handle.close()
}
}
// Baseline benchmark: blocking fs.readFileSync on each file in turn.
function syncTest(files) {
const startTime = performance.now();
const results = [];
for (let filename of files) {
results.push(fs.readFileSync(filename));
}
console.log(`Sync version took ${performance.now() - startTime}ms`);
}
// Same benchmark via the hand-rolled single-read helper above.
function syncTestSingleRead(files) {
const startTime = performance.now();
const results = [];
for (let filename of files) {
readFileSync(filename);
}
console.log(`Sync single read version took ${performance.now() - startTime}ms`);
}
// Launch every fs.promises.readFile at once, then await them together.
async function asyncTest(files) {
const startTime = performance.now();
const results = [];
for (let filename of files) {
results.push(fs.promises.readFile(filename));
}
await Promise.all(results);
console.log(`Async version took ${performance.now() - startTime}ms`);
}
// Stream each file strictly one at a time (serial), collecting chunks
// and concatenating once per file.
async function asyncTestStream(files) {
const startTime = performance.now();
for (let filename of files) {
await new Promise((resolve, reject) => {
let stream = fs.createReadStream(filename, {highWaterMark: 64 * 1024 * 10});
let data = [];
stream.on('data', chunk => {
data.push(chunk);
}).on('close', () => {
resolve(Buffer.concat(data));
}).on('error', reject);
});
}
console.log(`Async stream version took ${performance.now() - startTime}ms`);
}
// Stream every file concurrently; note the 10x larger highWaterMark
// (6.4 MB) compared with the serial stream version above.
async function asyncTestStreamParallel(files) {
const startTime = performance.now();
let results = [];
for (let filename of files) {
results.push(new Promise((resolve, reject) => {
const stream = fs.createReadStream(filename, {highWaterMark: 64 * 1024 * 100});
const data = [];
stream.on('data', chunk => {
data.push(chunk);
}).on('end', () => {
resolve(Buffer.concat(data));
}).on('error', reject);
}));
}
await Promise.all(results);
console.log(`Async stream parallel version took ${performance.now() - startTime}ms`);
}
// Serial open/stat/read/close per file, reusing one buffer when it is
// already big enough to hold the next file.
async function asyncTestSingleReadSerial(files) {
const startTime = performance.now();
let buffer;
for (let filename of files) {
let handle = await fsp.open(filename, "r");
try {
let stats = await handle.stat();
if (!buffer || buffer.length < stats.size) {
buffer = Buffer.allocUnsafe(stats.size);
}
let {bytesRead} = await handle.read(buffer, 0, stats.size, 0);
if (bytesRead !== stats.size) {
throw new Error("bytesRead not full file size")
}
} finally {
// NOTE(review): unawaited close() — see readFile above.
handle.close()
}
}
console.log(`Async single read serial version took ${performance.now() - startTime}ms`);
}
// All single-reads launched at once via readFile, awaited together.
async function asyncTestSingleReadParallel(files) {
const startTime = performance.now();
await Promise.all(files.map(readFile));
console.log(`Async single read parallel version took ${performance.now() - startTime}ms`);
}
// fs.promises.readFile, but awaited one at a time — the slowest variant.
async function asyncTestSerial(files) {
const startTime = performance.now();
const results = [];
for (let filename of files) {
results.push(await fs.promises.readFile(filename));
}
console.log(`Async serial version took ${performance.now() - startTime}ms`);
}
// Pause t ms between benchmarks, forcing a GC first so collector work
// from one test cannot bleed into the next.  Requires --expose_gc,
// otherwise global.gc is undefined and this throws.
function delay(t) {
return new Promise(resolve => {
global.gc();
setTimeout(resolve, t);
});
}
// delay between each test to let any system stuff calm down
// Driver: build the fixture files once, run every benchmark variant with
// a 2 s cool-down before each, and always delete the files afterwards.
async function run() {
const files = makeFiles();
try {
await delay(2000);
syncTest(files);
await delay(2000);
syncTestSingleRead(files);
await delay(2000);
await asyncTest(files)
await delay(2000);
await asyncTestSerial(files);
await delay(2000);
await asyncTestStream(files);
await delay(2000);
await asyncTestStreamParallel(files);
await delay(2000);
await asyncTestSingleReadSerial(files);
await delay(2000);
await asyncTestSingleReadParallel(files);
} catch(e) {
console.log(e);
} finally {
// Clean up the ~2.25 GB of generated files even if a test failed.
clearFiles(files);
}
}
run();
同步文件I/O会阻塞事件循环,因此在同步文件操作进行期间,Node.js 无法执行任何其他操作,这会破坏服务器的可伸缩性和性能。问题的关键不在于单个操作本身的性能差异,而在于允许 Node.js 在文件 I/O 进行的同时做其他事情,而不是阻塞整个事件循环。那么,除了初始化/启动阶段之外,同步 I/O 还有其他通用用例吗?在服务器中没有,因为它会破坏服务器的可伸缩性;但 Node.js 也有许多非服务器的用途,例如单用户的构建脚本和磁盘维护脚本,用同步文件 I/O 来编写和调试要简单得多。仅供参考,我把你的脚本移植到了 Windows(Windows 10,Node v12.13.1),得到的结果是:同步版本 1502.4960010051727ms,异步版本 2460.8494987198ms。同步仍然更快,但只快了约 63%,而不是 700%;不确定你运行时发生了什么。