JavaScript:如何处理 30,000 个 URL 而不发生内存泄漏?

Javascript 如何处理30';1000个URL没有内存泄漏?,javascript,node.js,performance,fs,Javascript,Node.js,Performance,Fs,在.txt文件中有30000个url需要刮取,当我制作程序时,我用10个url测试它,一切都很好,当我制作了30k url文件.txt后,它在几分钟后崩溃,我想它开始读取.txt文件,然后由于内存问题崩溃,这是控制台输出和我的代码。处理此类文件的最佳方式是什么 致命错误:无效的标记压缩接近堆限制分配失败-JavaScript堆内存不足 1:0x100ba0c4a节点::中止()(.cold.1)[/usr/local/bin/node] 2:0x100084961节点::FatalError(字

我有一个 .txt 文件,里面有 30000 个需要抓取的 URL。写程序时我用 10 个 URL 做测试,一切正常;但换成包含 3 万个 URL 的 .txt 文件后,程序运行几分钟就崩溃了。我猜是程序开始读取 .txt 文件后,由于内存问题而崩溃。下面是控制台输出和我的代码。处理这类文件的最佳方式是什么?

FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory 1: 0x100ba0c4a node::Abort() (.cold.1) [/usr/local/bin/node] 2: 0x100084961 node::FatalError(char const*, char const*) [/usr/local/bin/node] 3: 0x100084a89 node::OnFatalError(char const*, char const*) [/usr/local/bin/node] 4: 0x10017fa4d v8::Utils::ReportOOMFailure(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node] 5: 0x10017f9f7 v8::internal::V8::FatalProcessOutOfMemory(v8::internal::Isolate*, char const*, bool) [/usr/local/bin/node] 6: 0x100299baf v8::internal::Heap::FatalProcessOutOfMemory(char const*) [/usr/local/bin/node] 7: 0x10029af4c v8::internal::Heap::MarkCompactPrologue() [/usr/local/bin/node] 8: 0x100298b04 v8::internal::Heap::PerformGarbageCollection(v8::internal::GarbageCollector, v8::GCCallbackFlags) [/usr/local/bin/node] 9: 0x1002975ab v8::internal::Heap::CollectGarbage(v8::internal::AllocationSpace, v8::internal::GarbageCollectionReason, v8::GCCallbackFlags) [/usr/local/bin/node] 10: 0x100296A v8::internal::Heap::HandleGCRequest() [/usr/local/bin/node] 11: 0x10026d9a5 v8::internal::StackGuard::HandleInterrupts() [/usr/local/bin/node] 12: 0x1004e1383 v8::internal::Runtime_StackGuard(int, unsigned long*, v8::internal::Isolate*) [/usr/local/bin/node] 13: 0x1007502f9 Builtins_CEntry_Return1_DontSaveFPRegs_ArgvOnStack_NoBuiltinExit [/usr/local/bin/node] 14: 0x10073c5fb Builtins_StringPrototypeMatch [/usr/local/bin/node] 15: 0x267b75f209cb zsh: abort node scrape.js

let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");
let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;
const mysql = require('mysql2/promise');
let con = mysql.createPool({
  host: "xxx.xxx.xxx.xxx",
  user: "xxx",
  password: "xxxx",
  database: "xxx"
});
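async function run() {
  let file = fs.readFileSync('urls.txt');
  let urls = file.toString().split('\r\n');
  console.log(urls);
  const numeroUrl = urls.length;
  let urlsArray = [];
  console.log("numeroUrl : " + numeroUrl);
  for (let i = 1; i < numeroUrl; i++) {
    for (let y = 1; y < 6; y++) {
      urlsArray.push(urls[y-1] + '&page=' + y);
    }
  }
  Promise.map(urlsArray, parseUrl, {concurrency: 10}).then(function(data) {
    // all done here
    console.log("Done!!!");
  });
}
async function parseUrl(url) {
  try {
    let response = await axios.get(url, {
      headers: {
        'User-Agent': new UserAgent()
      }
    });
    console.log(url + " " + response.status);
    if (response.status >= 201) {
      fs.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status, (error) => {
        if (error) {
          console.error(`Could not save the url status error to a file: ${error}`);
          return;
        }
        console.log('Saved Url error to ' + fileUrlErrors);
      });
    } else if (response.status == 200) {
      let $ = cheerio.load(response.data);
      prodotti = $(".item");
      let items = $(prodotti).get();
      for (let item of items) {
        let title = $(".title", item).text();
        if (!title) {
          title = $(".title2", item).text();
        }
        let price = $(".price", item).text();
        if (!price) {
          price = $(".price2", item).text();
        }
        if (title) {
          const prodotto = [
            [
              title,
              price]
          ];
          let result = await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
          console.log('Prodotto ' + title + ' inserito nel DB.');
          console.log(prodotto);
        }
      }
    }
  } catch (error) {
    //console.error(error);
    if (error.response) {
      // Request made and server responded
      await fs.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status, (error) => {
        if (error) {
          console.error(`Could not save the url status error to a file: ${error}`);
          return;
        }
        console.log('Saved Url error to ' + fileUrlErrors);
      });
    }
  }
}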
run().then(() => {
  console.log("Done!");
}).catch(err => {
  console.log(err);
});

正如评论中所讨论的,您的 parseUrl() 函数把 Promise 和普通的异步回调混在了一起,这会带来灾难性的后果,两者确实不能混用。最好的做法是用 Promise 来完成所有异步控制流;如果某些异步 API 只接受回调、不返回 Promise,就先把它们 Promise 化:可以用 util.promisify() 手动转换,也可以改用该 API 已有的 Promise 版本,或者换用自带 Promise 支持的库版本。

一旦把所有内容都改成基于 Promise 的控制流,就可以使用 async、await 以及其他 Promise 控制流工具。只有这样,parseUrl() 函数返回的 Promise 才会在所有底层异步操作都完成之后才被解决(resolve),错误也才能正确地向上传播。

下面的示例修复了 parseUrl(),使其对所有异步操作都正确地使用 Promise:
// Modules used by the scraper. `fs` is required here because run() and
// parseUrl() in this script read `urls.txt` and append to the error log
// through it; without this line they fail with a ReferenceError.
const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
// File that collects the URLs whose request came back with an error status.
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
// Bluebird shadows the native Promise so that Promise.map() is available.
let Promise = require("bluebird");

// Desktop user-agent generated once at start-up.
let userAgent = new UserAgent({ deviceCategory: 'desktop' });
let options = {
  headers: { userAgent }
};
let exec = require('child_process').exec;

const mysql = require('mysql2/promise');
// Connection pool shared by every insert (placeholder credentials).
let con = mysql.createPool({
      host: "xxx.xxx.xxx.xxx",
      user: "xxx",
      password: "xxxx",
      database: "xxx"
    });


/**
 * Entry point: reads the URL list from `urls.txt`, expands every URL into its
 * first five result pages and hands them to parseUrl() with at most ten pages
 * in flight at a time.
 *
 * The returned promise settles only after every page has been handled, so the
 * caller's `.then()` / `.catch()` observe the real outcome of the crawl.
 *
 * @returns {Promise<void>} Resolves once every page URL has been processed.
 * @throws {Error} If `urls.txt` cannot be read.
 */
async function run() {
    const PAGES_PER_URL = 5;
    const MAX_CONCURRENT_REQUESTS = 10;

    // Accept CRLF as well as LF files and skip blank lines (e.g. the empty
    // entry produced by a trailing newline), so every real URL is kept.
    const urls = fs.readFileSync('urls.txt', 'utf8')
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line !== '');

    const numeroUrl = urls.length;
    console.log("numeroUrl : " + numeroUrl);

    // Pair each URL with each page number (the page counter must not be used
    // as the index into `urls`).
    const urlsArray = [];
    for (const url of urls) {
        for (let page = 1; page <= PAGES_PER_URL; page++) {
            urlsArray.push(url + '&page=' + page);
        }
    }

    // `Promise` is Bluebird in this script; awaiting the map keeps run()
    // pending until all pages are done and propagates any rejection.
    await Promise.map(urlsArray, parseUrl, { concurrency: MAX_CONCURRENT_REQUESTS });
    // all done here
    console.log("Done!!!");
}


/**
 * Scrapes a single listing page and stores every product found on it.
 *
 * Every asynchronous step (HTTP request, file append, database insert) is
 * awaited, so the returned promise settles only when the page is fully
 * handled. Failures are logged instead of re-thrown so that one bad page does
 * not abort the whole crawl.
 *
 * @param {string} url - Page URL to download.
 * @returns {Promise<void>} Resolves when the page has been processed.
 */
async function parseUrl(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        // Send the generated user-agent as a plain string header value.
        'User-Agent': new UserAgent().toString()
      }
    });
    console.log(url + " " + response.status);

    if (response.status >= 201) {
      // Promise-based append: the write has finished before we move on.
      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
      console.log('Saved Url error to ' + fileUrlErrors);
      return;
    }
    if (response.status !== 200) {
      return;
    }

    const $ = cheerio.load(response.data);
    // Plain array of elements so an ordinary for...of loop can await inside.
    const items = $(".item").get();

    for (const item of items) {
      // Fall back to the alternative class names when the primary one is empty.
      const title = $(".title", item).text() || $(".title2", item).text();
      const price = $(".price", item).text() || $(".price2", item).text();
      if (!title) {
        continue;
      }

      const prodotto = [[title, price]];
      await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
      console.log('Prodotto ' + title + ' inserito nel DB.');
      console.log(prodotto);
    }
  } catch (error) {
    if (error.response) {
      // Request made and server responded with an error status.
      try {
        await fs.promises.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status);
        console.log('Saved Url error to ' + fileUrlErrors);
      } catch (writeError) {
        console.error(`Could not save the url status error to a file: ${writeError}`);
      }
    } else {
      // Network, parsing or database failure: report it rather than hide it.
      console.error(`Failed to process ${url}: ${error}`);
    }
  }
}

// Start the crawl. A failure is reported on stderr and reflected in the exit
// code so that shells and schedulers can tell a failed run from a clean one.
run().then(() => {
    console.log("Done!");
}).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
const fs = require('fs');

/**
 * Entry point: reads the URL list from `urls.txt` and scrapes the first five
 * result pages of every URL, strictly one page at a time.
 *
 * @returns {Promise<void>} Resolves once every page has been processed.
 * @throws {Error} If `urls.txt` cannot be read, or if any page fails
 *   (the error raised by parseUrl() is propagated unchanged).
 */
async function run() {
    const PAGES_PER_URL = 5;

    // Accept CRLF as well as LF files and skip blank lines (e.g. the empty
    // entry produced by a trailing newline), so every real URL is kept.
    const urls = fs.readFileSync('urls.txt', 'utf8')
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line !== '');

    // count the number of urls inside .txt file
    const numberOfUrls = urls.length;
    console.log("There are : " + numberOfUrls + " urls");

    // Add page to url and use the scrape function. Each URL is paired with
    // each page number; the page counter must not index into `urls`.
    for (const baseUrl of urls) {
        for (let page = 1; page <= PAGES_PER_URL; page++) {
            await parseUrl(baseUrl + '&page=' + page);
        }
    }

    /**
     * Downloads one page and inserts every scraped (title, code) pair.
     *
     * @param {string} url - Page URL to download.
     * @returns {Promise<void>}
     * @throws {Error} Any request, file or database error, after logging it.
     */
    async function parseUrl(url) {
        try {
            const response = await axios.get(url, {
                headers: {
                    // Send the generated user-agent as a plain string.
                    'User-Agent': new UserAgent().toString()
                }
            });
            if (response.status >= 201) {
                await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
                return;
            }
            if (response.status !== 200) {
                return;
            }

            const $ = cheerio.load(response.data);
            // get items into a normal array so we can use a normal for loop
            const items = $(".result").get();
            for (const item of items) {
                // NOTE(review): "title" is a tag selector here; confirm it
                // should not be the ".title" class.
                const title = $("title", item).text();
                const code = $(".code", item).text();
                // NOTE(review): the guard used to be `asin[1]`, but no `asin`
                // is defined in this snippet, so it threw on the first item.
                // Gating on the scraped code is an assumption — confirm.
                if (!code) {
                    continue;
                }
                const prodotto = [
                    [title, code]
                ];
                // promise support in your mysql database requires the mysql2 module
                await con.query("INSERT INTO Items (title, code) VALUES ? ON DUPLICATE KEY UPDATE code=VALUES(code)", [prodotto]);
                console.log('Prodotto ' + code + ' inserito nel DB.');
                console.log(prodotto);
            }
        } catch (error) {
            console.error(error);
            throw error;          // propagate error back to caller
        }
    }
}

// Start the crawl. A failure is reported on stderr and reflected in the exit
// code so that shells and schedulers can tell a failed run from a clean one.
run().then(() => {
    console.log("all done");
}).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
const fs = require('fs');
let cheerio = require('cheerio');
let request = require('request');
let UserAgent = require('user-agents');
let axios = require('axios');
const fileUrlErrors = "UrlsWithErrors.txt";
const async = require('async')
let Promise = require("bluebird");

// Desktop user-agent generated once at start-up.
const userAgent = new UserAgent({ deviceCategory: 'desktop' });
// Request options carrying that user-agent under the `userAgent` header key.
const options = { headers: { userAgent: userAgent } };
const { exec } = require('child_process');

const mysql = require('mysql2/promise');
// Connection pool shared by every insert (placeholder credentials).
const con = mysql.createPool({
  host: "xxx.xxx.xxx.xxx",
  user: "xxx",
  password: "xxxx",
  database: "xxx",
});

/**
 * Entry point: reads the URL list from `urls.txt`, expands every URL into its
 * first five result pages and hands them to parseUrl() with at most ten pages
 * in flight at a time.
 *
 * The returned promise settles only after every page has been handled, so the
 * caller's `.then()` / `.catch()` observe the real outcome of the crawl.
 *
 * @returns {Promise<void>} Resolves once every page URL has been processed.
 * @throws {Error} If `urls.txt` cannot be read.
 */
async function run() {
    const PAGES_PER_URL = 5;
    const MAX_CONCURRENT_REQUESTS = 10;

    // Accept CRLF as well as LF files and skip blank lines (e.g. the empty
    // entry produced by a trailing newline), so every real URL is kept.
    const urls = fs.readFileSync('urls.txt', 'utf8')
        .split(/\r?\n/)
        .map((line) => line.trim())
        .filter((line) => line !== '');

    const numeroUrl = urls.length;
    console.log("numeroUrl : " + numeroUrl);

    // Pair each URL with each of its page numbers.
    const urlsArray = [];
    for (const url of urls) {
        for (let page = 1; page <= PAGES_PER_URL; page++) {
            urlsArray.push(url + '&page=' + page);
        }
    }

    // `Promise` is Bluebird in this script; awaiting the map keeps run()
    // pending until all pages are done and propagates any rejection.
    await Promise.map(urlsArray, parseUrl, { concurrency: MAX_CONCURRENT_REQUESTS });
    // all done here
    console.log("Done!");
}


/**
 * Scrapes a single listing page and stores every product found on it.
 *
 * Every asynchronous step (HTTP request, file append, database insert) is
 * awaited, so the returned promise settles only when the page is fully
 * handled. Failures are logged instead of re-thrown so that one bad page does
 * not abort the whole crawl.
 *
 * @param {string} url - Page URL to download.
 * @returns {Promise<void>} Resolves when the page has been processed.
 */
async function parseUrl(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        // Send the generated user-agent as a plain string header value.
        'User-Agent': new UserAgent().toString()
      }
    });
    console.log(url + " " + response.status);

    if (response.status >= 201) {
      await fs.promises.appendFile(fileUrlErrors, '\n' + url + ' - ' + response.status);
      console.log('Saved Url error to ' + fileUrlErrors);
      return;
    }
    if (response.status !== 200) {
      return;
    }

    const $ = cheerio.load(response.data);
    // Plain array of elements so an ordinary for...of loop can await inside.
    const items = $(".item").get();

    for (const item of items) {
      // Fall back to the alternative class names when the primary one is empty.
      const title = $(".title", item).text() || $(".title2", item).text();
      const price = $(".price", item).text() || $(".price2", item).text();
      if (!title) {
        continue;
      }

      const prodotto = [[title, price]];
      await con.query("INSERT INTO Items (title, price) VALUES ? ON DUPLICATE KEY UPDATE price=VALUES(price)", [prodotto]);
      console.log('Prodotto ' + title + ' inserito nel DB.');
      console.log(prodotto);
    }
  } catch (error) {
    if (error.response) {
      // Request made and server responded with an error status.
      try {
        await fs.promises.appendFile(fileUrlErrors, '\n' + url + " - " + error.response.status);
        console.log('Saved Url error to ' + fileUrlErrors);
      } catch (writeError) {
        console.error(`Could not save the url status error to a file: ${writeError}`);
      }
    } else {
      // Network, parsing or database failure: report it rather than hide it.
      console.error(`Failed to process ${url}: ${error}`);
    }
  }
}

// Start the crawl. A failure is reported on stderr and reflected in the exit
// code so that shells and schedulers can tell a failed run from a clean one.
run().then(() => {
    console.log("Done!");
}).catch((err) => {
    console.error(err);
    process.exitCode = 1;
});