Javascript 为什么我的for循环会弄乱所有参数?

Javascript 为什么我的for循环会弄乱所有参数?,javascript,parsing,for-loop,timeout,Javascript,Parsing,For Loop,Timeout,我正在尝试使用javascript解析来自多个网页的一些数据。为此,我编写了一个小型解析器。算法如下所示: URL 1 + data from URL 1 (correct line) URL 2 + data from URL 2 (correct line) URL 3 + data from URL 3 (correct line) URL 4 + data from URL 4 (correct line) URL 6(wrong URL) + data from another

我正在尝试使用javascript解析来自多个网页的一些数据。为此,我编写了一个小型解析器。算法如下所示:

URL 1 + data from URL 1  (correct line)
URL 2 + data from URL 2  (correct line)
URL 3 + data from URL 3  (correct line)
URL 4 + data from URL 4  (correct line)
URL 6(wrong URL) + data from another URL 
URL 5(wrong URL) + data from another URL
URL 7 + data from URL 7  (correct line)
URL 8 + data from URL 8  (correct line)
URL 9 + data from URL 9  (correct line)
  1. 首先从我的 .csv 文件中读取 URL
  2. 在页面上找到我需要的数据
  3. 将 URL 和数据保存到 json 文件

我的代码执行第 1、2 步非常完美,但有时会把第 3 步弄糟。输出如下所示:

    URL 1 + data from URL 1  (correct line)
    URL 2 + data from URL 2  (correct line)
    URL 3 + data from URL 3  (correct line)
    URL 4 + data from URL 4  (correct line)
    URL 6(wrong URL) + data from another URL 
    URL 5(wrong URL) + data from another URL
    URL 7 + data from URL 7  (correct line)
    URL 8 + data from URL 8  (correct line)
    URL 9 + data from URL 9  (correct line)
    
    我认为问题在于一些页面加载时间过长,这会打乱整个过程。但我仍然不明白为什么它有时会保存错误的数据。这是我的代码:

    var request = require('request');
    var cheerio = require('cheerio');
    var cloudscraper = require('cloudscraper');
    var fs = require('fs');
    var path = require('path');
    var csvjson = require('csvjson');
    
    //First, we read .csv file with our URL list
    /**
     * Reads data.csv from the script's directory, parses it with csvjson
     * (';' delimiter, '"' quote) and calls grabTheData once per parsed row index.
     *
     * NOTE(review): urlList, end and logs are assigned without var/let/const,
     * so in non-strict code they become implicit globals. grabTheData relies on
     * the global logs array created here.
     */
    function getTheList() {
        // First holds the raw CSV text, then is overwritten with the parsed rows below.
        urlList = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
        var options = {
          delimiter : ';', // optional
          quote     : '"' // optional
         };
        urlList = csvjson.toObject(urlList, options);
        end = urlList.length;
        // Accumulator that grabTheData pushes one entry into per successful request.
        logs = [];
    
    //here we start the loop reading and saving data from each url
      // The loop itself finishes immediately; grabTheData only schedules timers.
      for (let p = 0; p < end; p += 1){
        grabTheData(urlList, p)
      }
    } 
    
    //this code extracts the data from the page and saves it to a json file
    /**
     * Schedules, 2000 * p milliseconds from now, a cloudscraper.get request for
     * entry p. On success it pushes { url, dataINeed } onto logs and rewrites
     * ./logs.json with the whole logs array.
     *
     * @param urlList parsed CSV rows (passed in by getTheList)
     * @param p       index of the row to fetch; also scales the start delay
     */
    function grabTheData(urlList, p){
        setTimeout(function() { 
    
        // url is assigned without a declaration, so every timer callback writes
        // the same shared variable. A response callback that runs after a later
        // timer has fired reads that later value, not the URL it was requested with.
        // NOTE(review): this reads url[p], not urlList[p] — presumably a
        // transcription slip in the posted snippet; confirm against the real code.
        url = url[p].ItemLink;
        cloudscraper.get(url, function(err, res, body){
            if (err) { 
                // NOTE(review): other, colors and callback are not defined in the
                // visible snippet — presumably declared elsewhere; verify.
                console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                callback();
    
            } else {
    
                var $ = cheerio.load(body);
                /*
                here are the lines which extract the data I need
                dataIneed = ...;
                */
                // NOTE(review): the placeholder above spells dataIneed, while the
                // push below uses dataINeed — confirm the real variable name.
    
                logs.push({
                    url, dataINeed
                });
                // Rewrites the full file after every successful response.
                fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
            }
        });
    //here I set a 2 seconds delay between each URL
        }, 2000 * p);
      }
    
    // Entry point: read the URL list and schedule one fetch per row.
    getTheList();
    
    var request=require('request');
    var cheerio=require('cheerio');
    var cloudscraper=require('cloudscraper');
    var fs=require('fs');
    var path=require('path');
    var csvjson=require('csvjson');
    //首先,我们读取带有URL列表的.csv文件
    function getTheList(){
    urlList=fs.readFileSync(path.join(__dirname,'data.csv'),{encoding:'utf8'});
    var options={
    delimiter:';',//可选
    quote:'"'//可选
    };
    urlList=csvjson.toObject(urlList,options);
    end=urlList.length;
    logs=[];
    //在这里,我们开始从每个url读取和保存数据的循环
    for(let p=0;p<end;p+=1){
    grabTheData(urlList,p)
    }
    }
    发生这种情况的原因是:回调返回的结果与 grabTheData 中的 url 变量之间可能不匹配。url 没有用 let/var 声明,是所有回调共享的同一个变量;当某个页面响应较慢时,它的回调执行之前,url 可能已经被后面的定时器改写成了另一个 URL。

    有一个非常快速的解决方法:只需更改 url 变量的作用域,如下所示:

    /**
     * Schedules, 2000 * p milliseconds from now, a cloudscraper.get request for
     * entry p of urlList. On success it pushes { url, dataINeed } onto the
     * shared logs array and rewrites ./logs.json with the whole array.
     *
     * @param urlList parsed CSV rows; each row is read via its ItemLink property
     * @param p       index of the row to fetch; also scales the start delay
     */
    function grabTheData(urlList, p){
        setTimeout(function() { 
            // Block-scoped binding: each timer callback gets its own url, so a slow
            // response can no longer observe a URL assigned by a later timer.
            // It must be read from urlList: `let url = url[p]...` would reference the
            // new binding inside its own initializer and throw a ReferenceError.
            const url = urlList[p].ItemLink;
            cloudscraper.get(url, function(err, res, body){
                if (err) { 
                    // NOTE(review): other, colors and callback are not defined in the
                    // visible snippet — presumably declared elsewhere; verify.
                    console.log(other.Time() + colors.yellow('Warn: ') + '- something went wrong with item ' + url);
                    callback();
    
                } else {
    
                    var $ = cheerio.load(body);
                    /*
                    here are the lines which extract the data I need
                    dataINeed = ...;
                    */
    
                    logs.push({
                        url, dataINeed
                    });
                    // Rewrites the full file after every successful response.
                    fs.writeFileSync('./logs.json', JSON.stringify(logs, null, 4));
                }
            });
    //here I set a 2 seconds delay between each URL
        }, 2000 * p);
    }
    
    这会让你的结果井然有序

    下面是另一个(依我看更好的)方案:使用 Promise,并且不再用 setTimeout 来错开各次调用。这样可以避免潜在的竞态条件,因为 Promise.all 返回的结果会保持与输入相同的顺序:

    /**
     * Reads data.csv from the script's directory, parses it with csvjson
     * (';' delimiter, '"' quote), fetches every row's ItemLink through
     * grabTheDataUpdated and writes the collected results to ./new_logs.json.
     *
     * All requests are started at once (no delay between them). Promise.all
     * resolves to an array in the same order as the input rows, regardless of
     * which request finishes first.
     *
     * @returns {Promise<void>} settles once the JSON file has been written
     */
    async function getTheList() {
        // Local bindings only: nothing leaks to the global scope, and the raw
        // text and the parsed rows no longer share one variable.
        const csvText = fs.readFileSync(path.join(__dirname, 'data.csv'), { encoding : 'utf8'});
        const options = {
            delimiter : ';', // optional
            quote     : '"' // optional
        };
        const urlList = csvjson.toObject(csvText, options);
        const promiseList = urlList.map(urlEntry => grabTheDataUpdated(urlEntry.ItemLink));
        const logs = await Promise.all(promiseList);
        fs.writeFileSync('./new_logs.json', JSON.stringify(logs, null, 4));
    }
    
    // Promisified version of cloudscraper.get
    /**
     * Promise adapter around the callback-style cloudscraper.get.
     *
     * @param url address to request
     * @returns a Promise that fulfills with { url, res, body } or rejects with
     *          the error passed to the callback
     */
    function getCloudScraperData(url) {
        const executor = (resolve, reject) => {
            const onResponse = (err, res, body) => {
                if (!err) {
                    resolve({ url, res, body });
                    return;
                }
                reject(err);
            };
            cloudscraper.get(url, onResponse);
        };
        return new Promise(executor);
    }
    
    /**
     * Placeholder extractor: the real implementation should parse body with
     * cheerio. For now body is ignored and a mock string built from url is returned.
     */
    function getDataINeed(url, body) {
        const mockData = 'data from '.concat(url);
        return mockData;
    }
    
    /**
     * Fetches one URL via getCloudScraperData and extracts its data with
     * getDataINeed. Never rejects for a failed fetch: the failure is reported
     * in the dataINeed field instead.
     *
     * @param url address to fetch
     * @returns a Promise of { url, dataINeed }
     */
    async function grabTheDataUpdated(url) {
        let dataINeed;
        try {
            const response = await getCloudScraperData(url);
            dataINeed = getDataINeed(response.url, response.body);
        } catch (failure) {
            dataINeed = "Error occurred: " + failure.message;
        }
        return { url, dataINeed };
    }
    

    乍一看,这似乎是对 setTimeout() 的典型误用。
    我确实使用了 setTimeout(),但我想我是用错了吧?