Javascript 如何在nodejs上下载大量文件
我有一个 JSON 文件，里面有大约 20000 到 100000 个链接，看起来像这样：
[{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}]
有没有办法一次并行下载大约100个?
下载1000个链接时,我会收到任何警告或错误吗?
现在我使用的是顺序下载,但我不确定它是否适合如此大量的链接
以下是我目前下载的方式
/**
 * Downloads every file in the list sequentially, one at a time.
 * @param {Array<{file_name: string, url: string, downloadId: string}>} ultUrls
 *        Download descriptors parsed from the JSON file.
 * @returns {Promise<void>} resolves after every download has been attempted;
 *          individual failures are logged and do not stop the loop.
 */
async function downloadALL(ultUrls) {
  const len = ultUrls.length;
  let i = 1;
  // `ult` was previously assigned without a declaration, creating an
  // implicit global; declare it with `const`.
  for (const ult of ultUrls) {
    try {
      // Use try/catch instead of mixing `await` with .then()/.catch().
      // As before, `i` only advances on success.
      await downloadFile(ult, i, len);
      i += 1;
    } catch (err) {
      console.log(err);
    }
  }
}
/**
 * Downloads a single file over http/https, streaming it to disk and
 * reporting progress to the Electron renderer via `mainWindow`.
 * @param {{file_name: string, url: string, downloadId: string}} ult - download descriptor
 * @param {number} i - 1-based index of this file in the overall batch
 * @param {number} len - total number of files in the batch
 * @returns {Promise<void>} resolves once the file stream is closed;
 *          rejects on request or file-stream errors.
 *
 * NOTE(review): `mainWindow`, `utility`, `dir`, `http` and `https` are
 * assumed to be defined elsewhere in this file — confirm.
 */
function downloadFile(ult, i, len) {
  // A Promise executor only receives (resolve, reject); the previous third
  // `cb` parameter was always undefined.
  return new Promise((resolve, reject) => {
    console.log('Downloading File: () ', ult.file_name);
    const download = {
      file: {},
    };
    let percentage = 0;
    // Overall batch progress (this file's position within the batch).
    const percentage2 = ((i / len) * 100).toFixed(0);
    download.file.name = ult.file_name;
    download.file.percentage = percentage;
    download.file.downloadId = ult.downloadId;
    download.percentage = percentage2;
    (ult.url.substr(0, 5) === 'https' ? https : http)
      .get(ult.url, function(response) {
        const lent = parseInt(response.headers['content-length'], 10);
        let cur = 0;
        response.on('data', function(chunk) {
          // The old `body += chunk` accumulated the entire file in memory
          // and was never read; only the byte count is needed for progress.
          cur += chunk.length;
          percentage = ((100.0 * cur) / lent).toFixed(0);
          download.file.percentage = percentage;
          mainWindow.webContents.send('download-info', download);
        });
        const file = utility.writeFile(ult.file_name, dir);
        response.pipe(file);
        file.on('error', function(err) {
          console.log(`ERROR:${ err}`);
          // Previously called file.read() (write streams are not readable)
          // and never settled the promise; reject so callers see the failure.
          reject(err);
        });
        file.on('finish', function() {
          console.log('File downloaded');
          // close() is async; resolve only after the descriptor is released.
          file.close(() => resolve());
        });
      })
      .on('error', function(err) {
        // Request-level errors (DNS, connection reset, ...).
        reject(err);
      });
  });
}
既然您提到了并行,NodeJS中通常的方法是使用子进程,并基于大量可用的计算资源生成多个并行线程 下面是一段伪代码,您可以参考它来创建解决方案
// parent.js
// Fans the download list out across one forked child process per CPU core.
// NOTE(review): `_` (lodash) must be required elsewhere — confirm.
var child_process = require('child_process');
var numchild = require('os').cpus().length;
var done = 0;
var filesListJSON = [{
  "file_name": "Blessed_Titles.jpg",
  "url": "https://i.imgur.com/FRDibHa.jpg",
  "downloadId": "6r44r4k340rvvr"
}, {
  "file_name": "Blessed_Titles2.jpg",
  "url": "https://i.imgur.com/FRDibHa2.jpg",
  "downloadId": "6r44r4k340rvvr"
}, {
  "file_name": "Blessed_Titles3.jpg",
  "url": "https://i.imgur.com/FRDibHa3.jpg",
  "downloadId": "6r44r4k340rvvr"
}];
// _.chunk(list, size) takes a chunk SIZE, not a chunk COUNT: to get (at
// most) one chunk per child, size each chunk at ceil(length / numchild).
var chunks = _.chunk(filesListJSON, Math.ceil(filesListJSON.length / numchild));
// Iterate over the chunks actually produced (there may be fewer than
// `numchild` when the list is short), not over `numchild` blindly.
for (var i = 0; i < chunks.length; i++) {
  var child = child_process.fork('./child');
  // send the chunk of the list to the respective child process.
  child.send(chunks[i]);
  child.on('message', function (message) {
    console.log('[parent] received message from child:', message);
    done++;
    if (done === chunks.length) {
      console.log('[parent] received all results');
      // aggregate the per-child results here.
    }
  });
}

// child.js
process.on('message', function (list) {
  // The received payload is `list`; the old code logged an undefined
  // `message` variable here.
  console.log('[child] received message from server:', list);
  downloadFiles(list, function (done) {
    console.log("Done downloading files : " + list.length);
    process.send({
      child: process.pid,
      result: done
    });
    process.disconnect();
  });
});

// Placeholder: download every entry of `list`, then invoke `cb`.
function downloadFiles(list, cb) {
  // loop over list
  // logic to download files
  // cb(true)
}
//parent.js
var child_process=require('child_process');
var numchild=require('os').cpus().length;
var done=0;
var filesListJSON=[{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles2.jpg",
"url": "https://i.imgur.com/FRDibHa2.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles3.jpg",
"url": "https://i.imgur.com/FRDibHa3.jpg",
"downloadId": "6r44r4k340rvvr"
}];
//将数组拆分为可用的并行线程数
var chunks = _.chunk(filesListJSON, numchild);
for (var i = 0; i < numchild; i++) {
有关所用逻辑的更多详细信息,请参阅
此外,我还使用了lodash库中的
chunk
函数来分割阵列进行处理 既然您提到了并行,NodeJS中通常的方法是使用子进程并基于大量可用的计算资源生成多个并行线程
下面是一段伪代码,您可以参考它来创建解决方案
// NOTE(review): verbatim duplicate of the pseudo-code paste earlier in the
// page; kept as-is with review notes only.
// parent.js
var child_process = require('child_process');
var numchild = require('os').cpus().length;
var done = 0;
var filesListJSON = [{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles2.jpg",
"url": "https://i.imgur.com/FRDibHa2.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles3.jpg",
"url": "https://i.imgur.com/FRDibHa3.jpg",
"downloadId": "6r44r4k340rvvr"
}];
// split the array into number of parallel threads avalible
// NOTE(review): _.chunk(list, size) takes a chunk SIZE, not a chunk COUNT,
// so this does not produce `numchild` chunks — verify against lodash docs.
var chunks = _.chunk(filesListJSON, numchild);
for (var i = 0; i < numchild; i++) {
// NOTE(review): stray dangling `var` from the paste — not valid JavaScript.
var
var child = child_process.fork('./child');
//send the chunk of the list to respective thread.
child.send(chunks[i]);
//ps please check the count and logic for yourself I have not tested this.
child.on('message', function (message) {
console.log('[parent] received message from child:', message);
done++;
if (done === numchild) {
console.log('[parent] received all results');
// NOTE(review): `...` is a pseudo-code placeholder, not valid syntax.
...
}
});
}
// child.js
process.on('message', function (list) {
// NOTE(review): `message` is undefined here — the received payload is `list`.
console.log('[child] received message from server:', message);
downloadFiles(list, function (done) {
console.log("Done downloading files : " + list.length);
process.send({
child: process.pid,
result: done
});
process.disconnect();
});
});
// Placeholder: download every entry of `list`, then invoke `cb`.
function downloadFiles(list, cb) {
//loop over list
//logic to download files
//cb(true)
}
//parent.js
var child_process=require('child_process');
var numchild=require('os').cpus().length;
var done=0;
var filesListJSON=[{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles2.jpg",
"url": "https://i.imgur.com/FRDibHa2.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles3.jpg",
"url": "https://i.imgur.com/FRDibHa3.jpg",
"downloadId": "6r44r4k340rvvr"
}];
//将数组拆分为可用的并行线程数
var chunks = _.chunk(filesListJSON, numchild);
for (var i = 0; i < numchild; i++) {
有关所用逻辑的更多详细信息,请参阅
此外,我还使用了lodash库中的
chunk
函数来分割阵列进行处理 我建议使用蓝鸟。
此Promise库具有批处理promises并发解决方案
// NOTE(review): verbatim duplicate of the pseudo-code paste earlier in the
// page; kept as-is with review notes only.
// parent.js
var child_process = require('child_process');
var numchild = require('os').cpus().length;
var done = 0;
var filesListJSON = [{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles2.jpg",
"url": "https://i.imgur.com/FRDibHa2.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles3.jpg",
"url": "https://i.imgur.com/FRDibHa3.jpg",
"downloadId": "6r44r4k340rvvr"
}];
// split the array into number of parallel threads avalible
// NOTE(review): _.chunk(list, size) takes a chunk SIZE, not a chunk COUNT,
// so this does not produce `numchild` chunks — verify against lodash docs.
var chunks = _.chunk(filesListJSON, numchild);
for (var i = 0; i < numchild; i++) {
// NOTE(review): stray dangling `var` from the paste — not valid JavaScript.
var
var child = child_process.fork('./child');
//send the chunk of the list to respective thread.
child.send(chunks[i]);
//ps please check the count and logic for yourself I have not tested this.
child.on('message', function (message) {
console.log('[parent] received message from child:', message);
done++;
if (done === numchild) {
console.log('[parent] received all results');
// NOTE(review): `...` is a pseudo-code placeholder, not valid syntax.
...
}
});
}
// child.js
process.on('message', function (list) {
// NOTE(review): `message` is undefined here — the received payload is `list`.
console.log('[child] received message from server:', message);
downloadFiles(list, function (done) {
console.log("Done downloading files : " + list.length);
process.send({
child: process.pid,
result: done
});
process.disconnect();
});
});
// Placeholder: download every entry of `list`, then invoke `cb`.
function downloadFiles(list, cb) {
//loop over list
//logic to download files
//cb(true)
}
这是他们教程的链接:
以下是针对您的案例的蓝鸟代码解决方案:
// don't forget to run npm install bluebird first
const Promise = require('bluebird');
/**
 * Downloads every URL using bluebird's Promise.map, keeping at most
 * `concurrency` downloads in flight at any moment.
 * Surround the call with try/catch if failure handling is needed.
 */
async function downloadAll(ultUrls) {
  const options = { concurrency: 100 };
  await Promise.map(ultUrls, downloadFile, options);
}
// Here you no longer need thew i and len parameters
function downloadFile() {
// Code change needed here stop using the i and len parameters
}
我建议使用蓝鸟（bluebird）。此 Promise 库为批量处理 promise 提供了并发控制方案。
// NOTE(review): verbatim duplicate of the pseudo-code paste earlier in the
// page; kept as-is with review notes only.
// parent.js
var child_process = require('child_process');
var numchild = require('os').cpus().length;
var done = 0;
var filesListJSON = [{
"file_name": "Blessed_Titles.jpg",
"url": "https://i.imgur.com/FRDibHa.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles2.jpg",
"url": "https://i.imgur.com/FRDibHa2.jpg",
"downloadId": "6r44r4k340rvvr"
}, {
"file_name": "Blessed_Titles3.jpg",
"url": "https://i.imgur.com/FRDibHa3.jpg",
"downloadId": "6r44r4k340rvvr"
}];
// split the array into number of parallel threads avalible
// NOTE(review): _.chunk(list, size) takes a chunk SIZE, not a chunk COUNT,
// so this does not produce `numchild` chunks — verify against lodash docs.
var chunks = _.chunk(filesListJSON, numchild);
for (var i = 0; i < numchild; i++) {
// NOTE(review): stray dangling `var` from the paste — not valid JavaScript.
var
var child = child_process.fork('./child');
//send the chunk of the list to respective thread.
child.send(chunks[i]);
//ps please check the count and logic for yourself I have not tested this.
child.on('message', function (message) {
console.log('[parent] received message from child:', message);
done++;
if (done === numchild) {
console.log('[parent] received all results');
// NOTE(review): `...` is a pseudo-code placeholder, not valid syntax.
...
}
});
}
// child.js
process.on('message', function (list) {
// NOTE(review): `message` is undefined here — the received payload is `list`.
console.log('[child] received message from server:', message);
downloadFiles(list, function (done) {
console.log("Done downloading files : " + list.length);
process.send({
child: process.pid,
result: done
});
process.disconnect();
});
});
// Placeholder: download every entry of `list`, then invoke `cb`.
function downloadFiles(list, cb) {
//loop over list
//logic to download files
//cb(true)
}
这是他们教程的链接:
以下是针对您的案例的蓝鸟代码解决方案:
// don't forget to run npm install bluebird first
const Promise = require('bluebird');
// Downloads every URL with bluebird's Promise.map, at most 100 at a time.
async function downloadAll(ultUrls) {
// The concurrency property here represents the number of promises that will be allowed to run at the same time
// You can surround this line with try/catch scope if you want to
await Promise.map(ultUrls, downloadFile, {concurrency: 100});
}
// Here you no longer need thew i and len parameters
function downloadFile() {
// Code change needed here stop using the i and len parameters
}
（评论）您是如何下载这些文件的？请发布相关代码。——我已经更新了我的问题。
（评论）不过，我更倾向于使用 cluster，并且应该按 CPU 核心数创建相应数量的分叉，让所有内核同时工作（把工作分配到处理器的各个内核之间）。但您的答案似乎不是这样：当 CPU 只有 4 个内核时，不能直接分叉 1000 个进程，那最终会让系统变得非常慢。
（评论）好吧，我并不是在建议那样做。上面的逻辑已经考虑了可用内核的数量，把总工作量划分后委托给派生的进程。请仔细阅读这段逻辑，并告诉我您是否还有其他想法。