JavaScript:如何处理 30,000 个 URL 而不发生内存泄漏?
在.txt文件中有30000个url需要刮取,当我制作程序时,我用10个url测试它,一切都很好,当我制作了30k url文件.txt后,它在几分钟后崩溃,我想它开始读取.txt文件,然后由于内存问题崩溃,这是控制台输出和我的代码。处理此类文件的最佳方式是什么 致命错误:无效的标记压缩接近堆限制分配失败-JavaScript堆内存不足 1:0x100ba0c4a节点::中止()(.cold.1)[/usr/local/bin/node] 2:0x100084961节点::FatalError(字符常量*,字符常量*)[/usr/local/bin/node] 3:0x100084a89节点::OnFatalError(char const*,char const*)[/usr/local/bin/node] 4:0x10017fa4d v8::Utils::ReportOOMFailure(v8::internal::Isolate*,char const*,bool)[/usr/local/bin/node] 5:0x10017f9f7 v8::internal::v8::FatalProcessOutOfMemory(v8::internal::Isolate*,char const*,bool)[/usr/local/bin/node] 6:0x100299baf v8::internal::Heap::FatalProcessOutOfMemory(char const*)[/usr/local/bin/node] 7:0x10029af4c v8::内部::堆::MarkCompactPrologue()[/usr/local/bin/node] 8:0x100298b04 v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector,v8::GCCallbackFlags)[/usr/local/bin/node] 9:0x1002975ab v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace,v8::internal::GarbageCollectionReason,v8::GCCallbackFlags)[/usr/local/bin/node] 10:0x100296A v8::internal::Heap::HandleGCRequest()[/usr/local/bin/node] 11:0x10026d9a5 v8::internal::StackGuard::HandleInterrupts()[/usr/local/bin/node] 12:0x1004e1383 v8::internal::Runtime_StackGuard(int,unsigned long*,v8::internal::Isolate*)[/usr/local/bin/node] 13:0x1007502f9内置文件\u CEntry\u Return1\u DontSaveFPRegs\u ArgvOnStack\u NoBuiltinExit[/usr/local/bin/node] 14:0x10073c5fb内置\u StringPrototypeMatch[/usr/local/bin/node] 15:0x267b75f209cb zsh:abort node scrape.jsJavascript 如何处理30';1000个URL没有内存泄漏?,javascript,node.js,performance,fs,Javascript,Node.js,Performance,Fs,在.txt文件中有30000个url需要刮取,当我制作程序时,我用10个url测试它,一切都很好,当我制作了30k url文件.txt后,它在几分钟后崩溃,我想它开始读取.txt文件,然后由于内存问题崩溃,这是控制台输出和我的代码。处理此类文件的最佳方式是什么 致命错误:无效的标记压缩接近堆限制分配失败-JavaScript堆内存不足 1:0x100ba0c4a节点::中止()(.cold.1)[/usr/local/bin/node] 2:0x100084961节点::FatalError(字
让cheerio=require('cheerio');
let request=require('request');
让UserAgent=require('user-agent');
设axios=require('axios');
const fileUrlErrors=“UrlsWithErrors.txt”;
const async=require('async')
让承诺=要求(“蓝鸟”);
让userAgent=newuseragent({deviceCategory:'desktop'});
让选项={
标题:{userAgent}
};
让exec=require('child_process')。exec;
const mysql=require('mysql2/promise');
让con=mysql.createPool({
主持人:“xxx.xxx.xxx.xxx”,
用户:“xxx”,
密码:“xxxx”,
数据库:“xxx”
});
异步函数run(){
让file=fs.readFileSync('url.txt');
让URL=file.toString().split('\r\n');
日志(URL);
const numourl=url.length;
设urlsArray=[];
控制台日志(“numeroUrl:+numeroUrl”);
for(设i=1;i=201){
fs.appendFile(fileUrlErrors,'\n'+url+'-'+response.status,(error)=>{
如果(错误){
错误(`无法将url状态错误保存到文件:${error}`);
返回;
}
log('将Url错误保存到'+fileUrlErrors');
});
}否则如果(response.status==200){
让$=cheerio.load(response.data);
prodotti=$(“.item”);
let items=$(prodotti.get();
对于(让项目中的项目){
让title=$(“.title”,item).text();
如果(!标题){
title=$(“.title2”,item).text();
}
价格=$(“.price”,item).text();
如果(!价格){
价格=$(“.price2”,item).text();
}
如果(标题){
常数prodotto=[
[
标题
价格]
];
让结果=等待con.query(“插入项目(标题、价格)值?在重复键上更新价格=值(价格)”,[prodotto]);
log('Prodotto'+title+'inserito nel DB');
控制台日志(prodotto);
}
}
}
}捕获(错误){
//控制台错误(error);
if(error.response){
//请求已发出,服务器已响应
等待fs.appendFile(fileUrlErrors,'\n'+url+“-”+error.response.status,(error)=>{
如果(错误){
错误(`无法将url状态错误保存到文件:${error}`);
返回;
}
log('将Url错误保存到'+fileUrlErrors');
});
}
}
}
运行(),然后(()=>{
console.log(“完成!”);
}).catch(错误=>{
控制台日志(err);
});
正如评论中所讨论的,您的parseUrl()
函数混用了 promise 和普通的异步回调,这会带来灾难性的后果。二者确实不能混用。最好的解决方案是用 promise 来完成所有异步控制流;如果有些异步回调不返回 promise,就应当把它们 promise 化:可以使用 util.promisify()
,也可以改用该 API 已经 promise 化的版本,或者换用内置 promise 支持的库版本。
一旦您将所有内容都转换为promise控制流,就可以使用async
和await
以及其他 promise 控制流工具;只有这样,您的 parseUrl()
函数返回的 promise 才会在所有底层异步操作完成之后才被解决(resolve),也只有这样才能实现正确的错误传播。
下面的示例修复了 parseUrl()
,使其在所有异步操作中都正确地使用 promise:
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");
// User-agent generator instance, constructed with the `deviceCategory: 'desktop'` filter.
let userAgent = new UserAgent({ deviceCategory: 'desktop' });
// NOTE(review): the property shorthand below yields a header literally named
// `userAgent` (not `User-Agent`), and `options` is not referenced by the code
// visible in this section — confirm whether it is still needed.
let options = {
headers: { userAgent }
};
// NOTE(review): `exec` is not used by the code visible in this section.
let exec = require('child_process').exec;
// Promise-based build of mysql2, so `con.query(...)` can be awaited.
const mysql = require('mysql2/promise');
// Shared connection pool used for the INSERT statements.
// The "xxx" values look like redacted placeholders — replace with real settings.
let con = mysql.createPool({
host: "xxx.xxx.xxx.xxx",
user: "xxx",
password: "xxxx",
database: "xxx"
});
/**
 * Reads the URL list from `urls.txt`, expands every URL into its result
 * pages 1-5 and hands each page to `parseUrl`, with at most 10 pages being
 * processed concurrently (`Promise` is Bluebird in this file).
 *
 * @returns {Promise<void>} Settles only after every page has been handled;
 *   rejects if the file cannot be read or if any `parseUrl` call rejects.
 */
async function run() {
  const file = fs.readFileSync('urls.txt');
  // Accept both CRLF and LF line endings and ignore blank lines, so a
  // trailing newline (or the lack of one) never adds or drops a URL.
  const urls = file
    .toString()
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== '');
  console.log(urls);
  const numeroUrl = urls.length;
  console.log("numeroUrl : " + numeroUrl);

  const urlsArray = [];
  for (const baseUrl of urls) {
    // Pages 1 to 5 of the *current* URL (the page counter must not be used
    // as the index into `urls`).
    for (let page = 1; page < 6; page++) {
      urlsArray.push(baseUrl + '&page=' + page);
    }
  }

  // Await the whole batch so callers see completion and failures.
  await Promise.map(urlsArray, parseUrl, { concurrency: 10 });
  // all done here
  console.log("Done!!!");
}
/**
 * Fetches one listing page, stores every product found on it in the `Items`
 * table and records URLs that answered with an error status in the
 * `fileUrlErrors` file.
 *
 * Every asynchronous step is awaited, so the returned promise settles only
 * after the request, all inserts and any error-log write have finished.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Always fulfils: request, parsing and database
 *   failures are logged rather than rethrown, so a single bad URL does not
 *   abort the rest of the batch.
 */
async function parseUrl(url) {
  // Appends "<url> - <status>" to the error log. A failed write is reported
  // on the console but never thrown.
  const recordFailure = async (status) => {
    try {
      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + status);
      console.log('Saved Url error to ' + fileUrlErrors);
    } catch (writeError) {
      console.error(`Could not save the url status error to a file: ${writeError}`);
    }
  };

  try {
    const response = await axios.get(url, {
      headers: {
        // Send the generated user-agent string rather than the object itself.
        'User-Agent': new UserAgent().toString()
      }
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {
      await recordFailure(response.status);
    } else if (response.status === 200) {
      const $ = cheerio.load(response.data);
      // Plain array of matched elements so the loop below can use `await`.
      const items = $(".item").get();
      for (const item of items) {
        // Fall back to the alternative selectors when the primary one is empty.
        const title = $(".title", item).text() || $(".title2", item).text();
        const price = $(".price", item).text() || $(".price2", item).text();
        if (title) {
          const prodotto = [[title, price]];
          await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }
      }
    }
  } catch (error) {
    if (error.response) {
      // Request made and server responded
      await recordFailure(error.response.status);
    } else {
      // No HTTP response (network, parsing or database failure): report it
      // instead of dropping it silently.
      console.error(`Failed to process ${url}: ${error.message}`);
    }
  }
}
// Entry point: start the scrape, then print either the completion message
// or whatever error made run() reject.
run()
  .then(() => console.log("Done!"))
  .catch((failure) => console.log(failure));
const fs = require('fs');
/**
 * Scrapes result pages 1-5 of every URL listed in `urls.txt`, strictly one
 * request at a time.
 *
 * @returns {Promise<void>} Fulfils once every page has been processed;
 *   rejects with the first error raised while reading the file, fetching a
 *   page or writing to the database.
 */
async function run() {
  const file = fs.readFileSync('urls.txt');
  // Accept both CRLF and LF line endings and ignore blank lines, so a
  // trailing newline (or the lack of one) never adds or drops a URL.
  const urls = file
    .toString()
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== '');
  // count the number of urls inside .txt file
  const numberOfUrls = urls.length;
  console.log("There are : " + numberOfUrls + " urls");
  // Add page to url and use the scrape function
  for (const baseUrl of urls) {
    // The page counter only builds the query string; it must not be used as
    // the index into `urls`.
    for (let page = 1; page < 6; page++) {
      await parseUrl(baseUrl + '&page=' + page);
    }
  }

  /**
   * Fetches a single page and inserts every product found on it.
   *
   * @param {string} url - Page address to scrape.
   * @returns {Promise<void>} Rejects with the original error after logging it.
   */
  async function parseUrl(url) {
    try {
      const response = await axios.get(url, {
        headers: {
          // Send the generated user-agent string rather than the object itself.
          'User-Agent': new UserAgent().toString()
        }
      });
      if (response.status >= 201) {
        await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
      } else if (response.status === 200) {
        const $ = cheerio.load(response.data);
        // get items into a normal array so we can use a normal for loop
        const items = $(".result").get();
        for (const item of items) {
          // NOTE(review): "title" is an element selector (<title>), not the
          // ".title" class — confirm this is what the target markup needs.
          const title = $("title", item).text();
          const code = $(".code", item).text();
          // Only rows with a code are stored (the previous condition read an
          // undefined `asin` variable and threw a ReferenceError).
          if (code) {
            const prodotto = [[title, code]];
            // promise support in your mysql database requires the mysql2 module
            await con.query("INSERT INTO Items (title, code) VALUES ? ON DUPLICATE KEY UPDATE code=VALUES(code)", [prodotto]);
            console.log('Prodotto ' + code + ' inserito nel DB.');
            console.log(prodotto);
          }
        }
      }
    } catch (error) {
      console.error(error);
      throw error; // propagate error back to caller
    }
  }
}
// Entry point: run the sequential scrape and print either the completion
// message or the error that made run() reject.
run()
  .then(() => console.log("all done"))
  .catch((failure) => console.log(failure));
const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");
// User-agent generator instance, constructed with the `deviceCategory: 'desktop'` filter.
let userAgent = new UserAgent({ deviceCategory: 'desktop' });
// NOTE(review): the property shorthand below yields a header literally named
// `userAgent` (not `User-Agent`), and `options` is not referenced by the code
// visible in this section — confirm whether it is still needed.
let options = {
headers: { userAgent }
};
// NOTE(review): `exec` is not used by the code visible in this section.
let exec = require('child_process').exec;
// Promise-based build of mysql2, so `con.query(...)` can be awaited.
const mysql = require('mysql2/promise');
// Shared connection pool used for the INSERT statements.
// The "xxx" values look like redacted placeholders — replace with real settings.
let con = mysql.createPool({
host: "xxx.xxx.xxx.xxx",
user: "xxx",
password: "xxxx",
database: "xxx"
});
/**
 * Reads the URL list from `urls.txt`, expands every URL into its result
 * pages 1-5 and hands each page to `parseUrl`, with at most 10 pages being
 * processed concurrently (`Promise` is Bluebird in this file).
 *
 * @returns {Promise<void>} Settles only after every page has been handled;
 *   rejects if the file cannot be read or if any `parseUrl` call rejects.
 */
async function run() {
  const file = fs.readFileSync('urls.txt');
  // Accept both CRLF and LF line endings and ignore blank lines, so a
  // trailing newline (or the lack of one) never adds or drops a URL.
  const urls = file
    .toString()
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== '');
  console.log(urls);
  const numeroUrl = urls.length;
  console.log("numeroUrl : " + numeroUrl);

  const urlsArray = [];
  for (const baseUrl of urls) {
    for (let page = 1; page < 6; page++) {
      urlsArray.push(baseUrl + '&page=' + page);
    }
  }

  // Await the batch: without this, run() resolved (and the caller printed
  // its completion message) while requests were still in flight, and any
  // rejection went unhandled.
  await Promise.map(urlsArray, parseUrl, { concurrency: 10 });
  // all done here
  console.log("Done!");
}
/**
 * Fetches one listing page, stores every product found on it in the `Items`
 * table and records URLs that answered with an error status in the
 * `fileUrlErrors` file.
 *
 * @param {string} url - Address of the page to scrape.
 * @returns {Promise<void>} Always fulfils: request, parsing and database
 *   failures are logged rather than rethrown, so a single bad URL does not
 *   abort the rest of the batch.
 */
async function parseUrl(url) {
  // Appends "<url> - <status>" to the error log. A failed write is reported
  // on the console but never thrown, so it cannot reject the whole batch.
  const recordFailure = async (status) => {
    try {
      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + status);
      console.log('Saved Url error to ' + fileUrlErrors);
    } catch (writeError) {
      console.error(`Could not save the url status error to a file: ${writeError}`);
    }
  };

  try {
    const response = await axios.get(url, {
      headers: {
        // Send the generated user-agent string rather than the object itself.
        'User-Agent': new UserAgent().toString()
      }
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {
      await recordFailure(response.status);
    } else if (response.status === 200) {
      const $ = cheerio.load(response.data);
      // Kept local: the selection used to leak into an implicit global that
      // all concurrent calls shared.
      const items = $(".item").get();
      for (const item of items) {
        // Fall back to the alternative selectors when the primary one is empty.
        const title = $(".title", item).text() || $(".title2", item).text();
        const price = $(".price", item).text() || $(".price2", item).text();
        if (title) {
          const prodotto = [[title, price]];
          await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }
      }
    }
  } catch (error) {
    if (error.response) {
      // Request made and server responded
      await recordFailure(error.response.status);
    } else {
      // No HTTP response (network, parsing or database failure): report it
      // instead of dropping it silently.
      console.error(`Failed to process ${url}: ${error.message}`);
    }
  }
}
// Entry point: start the concurrent scrape, then print either the completion
// message or the error that made run() reject.
run()
  .then(() => console.log("Done!"))
  .catch((failure) => console.log(failure));