Javascript 是不是承诺。都不工作的第二次通过?为什么不呢?

Javascript 是不是承诺。都不工作的第二次通过?为什么不呢?,javascript,node.js,web-scraping,promise,Javascript,Node.js,Web Scraping,Promise,我刚刚完成一个T恤网站的基本webscraper项目 它通过一个硬编码的url(主页)进入。它将搜索任何产品页面,并将它们添加到url。如果它找到另一个链接(余数),它将再次删除该链接并找到更多的产品页面。它将产品页面添加到urlSet,然后再次刮取这些页面,获取tshirt数据(价格、img、标题),然后转换,然后将它们写入CSV文件 出于某种原因,这不适用于带有“剩余”的刮削的第二次运行 如果我删除url的第二个片段,那么一切都会正常进行,文件也会正确写入。但如果我想获得其他产品页面,它似乎

我刚刚完成一个T恤网站的基本webscraper项目

它通过一个硬编码的url(主页)进入。它将搜索任何产品页面,并将它们添加到url。如果它找到另一个链接(
余数
),它将再次删除该链接并找到更多的产品页面。它将产品页面添加到
urlSet
,然后再次刮取这些页面,获取tshirt数据(价格、img、标题),然后转换,然后将它们写入CSV文件

出于某种原因,这不适用于带有“剩余”的刮削的第二次运行

如果我删除url的第二个片段,那么一切都会正常进行,文件也会正确写入。但如果我想获得其他产品页面,它似乎在某个地方失败了

这是我的代码,我很抱歉发布了这么多,但我不知道如果没有正确的上下文,如何正确理解它,希望它已经被注释好了:

//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
    //Save the scraped data in a spreadsheet (CSV format).

'use strict';

//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');

//harcoded url
var url = 'http://shirts4mike.com/';

//url for tshirt pages
var urlSet = new Set();

var remainder;
var tshirtArray = [];


const requestPromise = function(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(error, response, html) {

            if(error)return reject(error);

            if(!error && response.statusCode == 200){
                return resolve(html);   
            }       
        });
    });
}


// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
    console.log("Currently scraping " + url)
    return requestPromise(url)
        .then(function(html) {
            var $ = cheerio.load(html);

            var links = [];

            //get all the links
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //add into link array
                links.push(url + a);
            });
            // return array of links
            return links;
        });
}


function nextStep (arrayOfLinks) { 
    var promiseArray = [];
    console.log(arrayOfLinks);
    for(var i = 0; i < arrayOfLinks.length; i++){
        promiseArray.push(requestPromise(arrayOfLinks[i]));
    }
    //return both the html of pages and their urls
    return Promise.all(promiseArray)
        .then(function(arrayOfHtml){
        return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
    });                 
}


//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){ 
    for(var i = 0;  i < obj.arrayOfHtml.length; i++){
        var $ = cheerio.load(obj.arrayOfHtml[i]);

        //if page has a submit it must be a product page
        if($('[type=submit]').length !== 0){

            //add page to set
            urlSet.add(obj.arrayOfUrls[i]);
            console.log(obj.arrayOfUrls[i]);

        } else if(remainder == undefined) {
            //if not a product page, add it to remainder so it another scrape can be performed.
            remainder = obj.arrayOfUrls[i];
            console.log("The remainder is " + remainder)                                     
        }
    }
    //return remainder for second run-through of scrape 
    return remainder;
}


//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
    //call lastScraper so we can grab data from the set (product pages)
        //scrape set, product pages
        var promiseArray = [];

        for(var item of urlSet){
            var url = item;

            promiseArray.push(requestPromise(url));
        }
        return Promise.all(promiseArray)
            .then(function(arrayOfHtml){
                return arrayOfHtml;
            });    
}


//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
    for(var i = 0; i < html.length; i++){
        var $ = cheerio.load(html[i]);

        //grab data and store as variables
        var price = $('.price').text();
        var imgURL = $('.shirt-picture').find('img').attr('src');
        var title = $('body').find('.shirt-details > h1').text().slice(4);

        var tshirtObject = {};
        //add values into tshirt object
        tshirtObject.Title = title;
        tshirtObject.Price = price;
        tshirtObject.ImageURL = imgURL;
        tshirtObject.URL = url;
        tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');

        //add the object into the array of tshirts
        tshirtArray.push(tshirtObject);
    }
    convertJson2Csv();
}


//convert tshirt objects and save as CSV file
function convertJson2Csv(){
        //The scraper should generate a folder called `data` if it doesn’t exist.
        var dir ='./data';
        if(!fs.existsSync(dir)){
            fs.mkdirSync(dir);
        }

        var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];

        //convert tshirt data into CSV and pass in fields
        var csv = json2csv({ data: tshirtArray, fields: fields });

        //Name of file will be the date
        var fileDate = moment().format('MM-DD-YY');
        var fileName = dir + '/' + fileDate + '.csv';

        //Write file
        fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
            console.log('file saved');
            if (err) throw err;
        });
}

scrape(url) //scrape from original entry point
    .then(nextStep) 
    .then(lastStep)
    .then(scrape) //scrape again but with remainder url
    .then(nextStep)
    .then(lastStep)
    .then(lastScraperPt1)
    .then(lastScraperPt2)
    .catch(function(err) {
        // handle any error from any request here
        console.log(err);
     });
但是如果我选择只调用第一个scrap而不调用第二个scrap,就像这样:

scrape(url) //scrape from original entry point
    .then(nextStep) 
    .then(lastStep)
    .then(lastScraperPt1)
    .then(lastScraperPt2)
    .catch(function(err) {
        // handle any error from any request here
        console.log(err);
     });
。。。然后一切都开始了。我只是不能访问所有的URL


这里发生了什么事,我该如何解决?谢谢大家

问题是
tshirtaray
convertJson2Csv()
中没有定义。在
lastscrapert2
pass
tshirtaray
to
convertJsonCsv()

at
convertJson2Csv

convertJson2Csv(tshirtArray)
function convertJson2Csv(tshirtArray) {
  // do stuff
}

问题是
tshirtaray
未在
convertJson2Csv()
中定义。在
lastscrapert2
pass
tshirtaray
to
convertJsonCsv()

at
convertJson2Csv

convertJson2Csv(tshirtArray)
function convertJson2Csv(tshirtArray) {
  // do stuff
}

您的
最后一步中似乎存在一个问题。看起来您的意思是,
余数
是另一个URL数组。如果我错了,请纠正我。但是,发生的情况是,
如果($('[type=submit]').length!==0)
条件第一次失败,您将自动转到下一个块,因为
余数
未定义。无论当前url是什么,都可以将该url分配给
余数
。对于For循环的其余迭代,您将永远不会再次遇到
rements==undefined
的情况。因此,如果您最终只将一个url分配给
余数
,而您希望得到的任何url都将被忽略

您可能希望将
余数定义为
余数=[]。然后,如果(余数==未定义)
,您只需说

} else {
    remainder.push(obj.arrayOfUrls[i]);
}
但是,当
scrape
只需要一个url时,您将向
scrape
传递一个url数组。如果这是您想要的,并且我正确地假设您的意思是,
余数
是一个URL数组,那么您可以定义一个新函数,如下所示:

function scrapeRemainders(remainders) {
  var promises = [];

  remainder.forEach(function (url) {
    promises.push(requestPromise(url));
  });

  return Promise.all(promises).then(function (results) {
    _.flattenDeep(results);
  })
}
然后,您将用
scrapeRemainders
替换承诺链中的第二个
scrape
。另外,对于前面函数中的
,您需要
npm安装lodash
,然后
var=require('lodash')
。另一方面,lodash与承诺无关,但它是数据处理的一个伟大工具。有机会的时候你应该调查一下

此外,在
LastScrapert1
中,您可以更改

return Promise.all(promiseArray)
    .then(function(arrayOfHtml){
        return arrayOfHtml;
    });

它做同样的事情


希望这有帮助。如果这不能回答您的问题,请对我进行评论,我可以相应地更改我的答案。

您的
最后一步中似乎有一个问题。看起来您的意思是,
余数
是另一个URL数组。如果我错了,请纠正我。但是,发生的情况是,
如果($('[type=submit]').length!==0)
条件第一次失败,您将自动转到下一个块,因为
余数
未定义。无论当前url是什么,都可以将该url分配给
余数
。对于For循环的其余迭代,您将永远不会再次遇到
rements==undefined
的情况。因此,如果您最终只将一个url分配给
余数
,而您希望得到的任何url都将被忽略

您可能希望将
余数定义为
余数=[]。然后,如果(余数==未定义)
,您只需说

} else {
    remainder.push(obj.arrayOfUrls[i]);
}
但是,当
scrape
只需要一个url时,您将向
scrape
传递一个url数组。如果这是您想要的,并且我正确地假设您的意思是,
余数
是一个URL数组,那么您可以定义一个新函数,如下所示:

function scrapeRemainders(remainders) {
  var promises = [];

  remainder.forEach(function (url) {
    promises.push(requestPromise(url));
  });

  return Promise.all(promises).then(function (results) {
    _.flattenDeep(results);
  })
}
然后,您将用
scrapeRemainders
替换承诺链中的第二个
scrape
。另外,对于前面函数中的
,您需要
npm安装lodash
,然后
var=require('lodash')
。另一方面,lodash与承诺无关,但它是数据处理的一个伟大工具。有机会的时候你应该调查一下

此外,在
LastScrapert1
中,您可以更改

return Promise.all(promiseArray)
    .then(function(arrayOfHtml){
        return arrayOfHtml;
    });

它做同样的事情


希望这有帮助。如果这不能回答您的问题,请对我进行评论,我可以相应地更改我的答案。

全部修复,它在
scrape()
中抓取了错误的URL。虽然我只是在将状态代码记录到控制台后才知道这一点:

//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
    //Save the scraped data in a spreadsheet (CSV format).

'use strict';

//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');

//harcoded url
var urlHome = 'http://shirts4mike.com/';

//url for tshirt pages
var urlSet = [];

var tshirtArray = [];


const requestPromise = function(url) {
    return new Promise(function(resolve, reject) {
        request(url, function(error, response, html) {

            if(error) { 
                errorHandler(error);
                return reject(error);
            }

            if(!error && response.statusCode == 200){
                return resolve(html);   
            }

            if(response.statusCode !== 200){
                console.log("response code is " + response.statusCode);
            }

            return resolve("");      
        });
    });
}


// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
    console.log("Currently scraping " + url)
    return requestPromise(url)
        .then(function(html) {
            var $ = cheerio.load(html);

            var links = [];
            var URL = 'http://shirts4mike.com/';
            //get all the links
            $('a[href*=shirt]').each(function(){
                var a = $(this).attr('href');
                //add into link array
                links.push(URL + a);
            });
            // return array of links
            return links;
        });
}




function nextStep (arrayOfLinks) { 
    var promiseArray = [];
    console.log(arrayOfLinks);
    for(var i = 0; i < arrayOfLinks.length; i++){
        promiseArray.push(requestPromise(arrayOfLinks[i]));
    }
    //return both the html of pages and their urls
    return Promise.all(promiseArray)
        .then(function(arrayOfHtml){
        return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
    });                 
}


//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){ 
    for(var i = 0;  i < obj.arrayOfHtml.length; i++){
        var $ = cheerio.load(obj.arrayOfHtml[i]);

        //if page has a submit it must be a product page
        if($('[type=submit]').length !== 0){

            //add page to set
            urlSet.push(obj.arrayOfUrls[i]);
            console.log(obj.arrayOfUrls[i]);

        } else if(remainder == undefined) {
            //if not a product page, add it to remainder so it another scrape can be performed.
            var remainder = obj.arrayOfUrls[i];
            console.log("The remainder is " + remainder)                                     
        }
    }
    //return remainder for second run-through of scrape 
    return remainder;
}


//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
    //call lastScraper so we can grab data from the set (product pages)
        //scrape set, product pages
        var promiseArray = [];

        for(var item of urlSet){
            var url = item;

            promiseArray.push(requestPromise(url));
        }
        return Promise.all(promiseArray)
            .then(function(arrayOfHtml){
                return arrayOfHtml;
            });    
}


//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
    for(var i = 0; i < html.length; i++){
        var $ = cheerio.load(html[i]);

        //grab data and store as variables
        var price = $('.price').text();
        var imgURL = $('.shirt-picture').find('img').attr('src');
        var title = $('body').find('.shirt-details > h1').text().slice(4);

        var tshirtObject = {};
        //add values into tshirt object
        tshirtObject.Title = title;
        tshirtObject.Price = price;
        tshirtObject.ImageURL = urlHome + imgURL;
        tshirtObject.URL = urlSet[i];
        tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');

        //add the object into the array of tshirts
        tshirtArray.push(tshirtObject);
    }
    return tshirtArray;
}


//conver tshirt objects and save as CSV file
function convertJson2Csv(tshirtArray){
        //The scraper should generate a folder called `data` if it doesn’t exist.
        var dir ='./data';
        if(!fs.existsSync(dir)){
            fs.mkdirSync(dir);
        }

        var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];

        //convert tshirt data into CSV and pass in fields
        var csv = json2csv({ data: tshirtArray, fields: fields });

        //Name of file will be the date
        var fileDate = moment().format('MM-DD-YY');
        var fileName = dir + '/' + fileDate + '.csv';

        //Write file
        fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
            console.log('file saved');
            if (err) errorHandler(err);
        });
}


scrape(urlHome) //scrape from original entry point
    .then(nextStep) 
    .then(lastStep)
    .then(scrape)
    .then(nextStep)
    .then(lastStep)
    .then(lastScraperPt1)
    .then(lastScraperPt2)
    .then(convertJson2Csv)
    .catch(function(err) {
        // handle any error from any request here
        console.log(err);
     });


//If the site is down, an error message describing the issue should appear in the console. 
    //This is to be tested by disabling wifi on your device.
    //When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error

var errorHandler = function (error) {
    console.log(error.message);
    console.log('The scraper could not not scrape data from ' + url + ' there is either a problem with your internet connection or the site may be down');
    /**
    * create new date for log file
     */
    var loggerDate = new Date();
    /**
     * create message as a variable
    */
    var errLog = '[' + loggerDate + '] ' + error.message + '\n';
    /**
    *when the error occurs, log that to the error logger file
    */
    fs.appendFile('scraper-error.log', errLog, function (err) {
        if (err) throw err;
        console.log('There was an error. The error was logged to scraper-error.log');
    });
};
//任务:创建一个命令行应用程序,用于访问电子商务站点以获取最新价格。
//在电子表格(CSV格式)中保存刮取的数据。
"严格使用",;
//正在使用的模块:
var cheerio=需要('cheerio');
var json2csv=require('json2csv');
var请求=要求(“请求”);
var力矩=要求的(‘力矩’);
var fs=需要('fs');
//加密url
var urlH