在JavaScript中同时读取和匹配多个巨大的JSON/NDJSON文件
有两个文件都是按id排序的,分别是parent_data.json和child_data.json。孩子是父母的一个子集 我正在尝试将parent.id与child.id匹配。我正在尝试对每个.json文件执行一次过程。而不是像嵌套的forEach/map那样遍历每个子对象 假设这两个.json文件都包含很少的gig/数百万行,重新读取children.json需要很长时间,需要多次传递,并且将所有内容加载到内存中也是不可能的 每个文件可以是格式,也可以是格式。当前示例是NDJSON parent_data.json:在JavaScript中同时读取和匹配多个巨大的JSON/NDJSON文件,javascript,node.js,json,big-o,Javascript,Node.js,Json,Big O,有两个文件都是按id排序的,分别是parent_data.json和child_data.json。孩子是父母的一个子集 我正在尝试将parent.id与child.id匹配。我正在尝试对每个.json文件执行一次过程。而不是像嵌套的forEach/map那样遍历每个子对象 假设这两个.json文件都包含很少的gig/数百万行,重新读取children.json需要很长时间,需要多次传递,并且将所有内容加载到内存中也是不可能的 每个文件可以是格式,也可以是格式。当前示例是NDJSON paren
{"id":1, "name": "parent_1"}
{"id":2, "name": "parent_2"}
{"id":3, "name": "parent_3"}
{"id":4, "name": "parent_4"}
{"id":5, "name": "parent_5"}
{"id":6, "name": "parent_6"}
{"id":7, "name": "parent_7"}
{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":5, "name":"belongs_to_parent_5", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}
child_data.json:
{"id":1, "name": "parent_1"}
{"id":2, "name": "parent_2"}
{"id":3, "name": "parent_3"}
{"id":4, "name": "parent_4"}
{"id":5, "name": "parent_5"}
{"id":6, "name": "parent_6"}
{"id":7, "name": "parent_7"}
{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":4, "name":"belongs_to_parent_4", "guid": "${unique_id}"}
{"id":5, "name":"belongs_to_parent_5", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}
{"id":7, "name":"belongs_to_parent_7", "guid": "${unique_id}"}
预期结果:
[
{
"id": 4,
"name": "parent_4",
"children": [
{
"id": 4,
"name": "belongs_to_parent_4",
"guid": "${unique_id}"
},
{
"id": 4,
"name": "belongs_to_parent_4",
"guid": "${unique_id}"
}
]
},
{
"id": 5,
"name": "parent_5",
"children": [
{
"id": 5,
"name": "belongs_to_parent_5",
"guid": "${unique_id}"
}
]
},
{
"id": 7,
"name": "parent_7",
"children": [
{
"id": 7,
"name": "belongs_to_parent_7",
"guid": "${unique_id}"
}
]
}
]
我找不到一个合适的方法来迭代这两个流而不在最终结果中丢失一些子流
此代码忽略每个父级的前两个子级:
const fs = require('fs');
const es = require('event-stream');
const main = () => {
let currentParent = null;
let currentChild = null;
let prevChild = null;
let parentLines = 0;
let childLines = 0;
let tmpChildren = [];
const parentStream = fs
.createReadStream('parent_data.json') // Parent Stream
.pipe(es.split('\n')) // Delimit by \n
.pipe(
es.mapSync(line => { // Current line, without the delimiter \n
parentStream.pause(); // Pause stream until done processing this line
++parentLines; // Debug
currentParent = JSON.parse(line); // Now it's valid JSON object
console.log('currentParent', currentParent.id); // Debug
}),
)
.on('error', e => {
console.error(e);
})
.on('close', () => {
console.log('close reading parent stream');
})
.on('end', () => {
console.log('end reading parent stream');
});
const childStream = fs
.createReadStream('child_data.json') // Child Stream
.pipe(es.split('\n')) // Split by delimiter
.pipe(
es.mapSync(line => { // Current child line, without delimiter
++childLines; // Debug
childStream.pause(); // Pause child stream
currentChild = JSON.parse(line); // Valid JSON child now
if (prevChild && (currentParent.id === prevChild.id)) { // Check prevChild and currentParent
tmpChildren.push(prevChild);
prevChild = null;
}
if (currentChild && (currentParent.id == currentChild.id)) { // Check currentChild and currentParent
console.log('child', currentChild.id);
tmpChildren.push(currentChild);
// childStream.resume(); // Having this here will cause the stream to stop processing entirely
} else {
// We're here because currentParent does not match currentChild, move to next parent
prevChild = currentChild;
if (tmpChildren.length > 0) {
currentParent['children'] = tmpChildren;
tmpChildren = [];
}
parentStream.resume();
}
console.log('currentChild', currentChild.id);
childStream.resume(); // Having this here will cause the stream to skip children on new parent
}),
)
.on('error', e => {
console.error(e);
})
.on('close', () => {
console.log('close reading child stream');
})
.on('end', () => {
console.log('end reading child stream');
});
};
main();
TLDR:在child.id上匹配parent.id,如果不匹配,则保存child。迭代到下一个父级,检查parent.id上的prevChild.id和/或child.id。如果将append prevChild/child与parent[children]匹配,则迭代child。如果不匹配,则迭代到下一个父级…等等