Javascript 是不是承诺。都不工作的第二次通过?为什么不呢?
我刚刚完成一个T恤网站的基本webscraper项目 它通过一个硬编码的url(主页)进入。它将搜索任何产品页面,并将它们添加到url。如果它找到另一个链接(Javascript 是不是承诺。都不工作的第二次通过?为什么不呢?,javascript,node.js,web-scraping,promise,Javascript,Node.js,Web Scraping,Promise,我刚刚完成一个T恤网站的基本webscraper项目 它通过一个硬编码的url(主页)进入。它将搜索任何产品页面,并将它们添加到url。如果它找到另一个链接(余数),它将再次删除该链接并找到更多的产品页面。它将产品页面添加到urlSet,然后再次刮取这些页面,获取tshirt数据(价格、img、标题),然后转换,然后将它们写入CSV文件 出于某种原因,这不适用于带有“剩余”的刮削的第二次运行 如果我删除url的第二个片段,那么一切都会正常进行,文件也会正确写入。但如果我想获得其他产品页面,它似乎
余数
),它将再次删除该链接并找到更多的产品页面。它将产品页面添加到urlSet
,然后再次刮取这些页面,获取tshirt数据(价格、img、标题),然后转换,然后将它们写入CSV文件
出于某种原因,这不适用于带有“剩余”的刮削的第二次运行
如果我删除url的第二个片段,那么一切都会正常进行,文件也会正确写入。但如果我想获得其他产品页面,它似乎在某个地方失败了
这是我的代码,我很抱歉发布了这么多,但我不知道如果没有正确的上下文,如何正确理解它,希望它已经被注释好了:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error)return reject(error);
if(!error && response.statusCode == 200){
return resolve(html);
}
});
});
}
// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
links.push(url + a);
});
// return array of links
return links;
});
}
function nextStep (arrayOfLinks) {
var promiseArray = [];
console.log(arrayOfLinks);
for(var i = 0; i < arrayOfLinks.length; i++){
promiseArray.push(requestPromise(arrayOfLinks[i]));
}
//return both the html of pages and their urls
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
});
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
//call lastScraper so we can grab data from the set (product pages)
//scrape set, product pages
var promiseArray = [];
for(var item of urlSet){
var url = item;
promiseArray.push(requestPromise(url));
}
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
tshirtObject.ImageURL = imgURL;
tshirtObject.URL = url;
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
convertJson2Csv();
}
//convert tshirt objects and save as CSV file
function convertJson2Csv(){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) throw err;
});
}
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape) //scrape again but with remainder url
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
但是如果我选择只调用第一个scrap而不调用第二个scrap,就像这样:
scrape(url) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
。。。然后一切都开始了。我只是不能访问所有的URL
这里发生了什么事,我该如何解决?谢谢大家问题是
tshirtaray
在convertJson2Csv()
中没有定义。在lastscrapert2
passtshirtaray
toconvertJsonCsv()
atconvertJson2Csv
convertJson2Csv(tshirtArray)
function convertJson2Csv(tshirtArray) {
// do stuff
}
问题是
tshirtaray
未在convertJson2Csv()
中定义。在lastscrapert2
passtshirtaray
toconvertJsonCsv()
atconvertJson2Csv
convertJson2Csv(tshirtArray)
function convertJson2Csv(tshirtArray) {
// do stuff
}
您的
最后一步中似乎存在一个问题。看起来您的意思是,余数
是另一个URL数组。如果我错了,请纠正我。但是,发生的情况是,如果($('[type=submit]').length!==0)
条件第一次失败,您将自动转到下一个块,因为余数
未定义。无论当前url是什么,都可以将该url分配给余数
。对于For循环的其余迭代,您将永远不会再次遇到rements==undefined
的情况。因此,如果您最终只将一个url分配给余数
,而您希望得到的任何url都将被忽略
您可能希望将余数定义为余数=[]代码>。然后,如果(余数==未定义)
,您只需说
} else {
remainder.push(obj.arrayOfUrls[i]);
}
但是,当scrape
只需要一个url时,您将向scrape
传递一个url数组。如果这是您想要的,并且我正确地假设您的意思是,余数
是一个URL数组,那么您可以定义一个新函数,如下所示:
function scrapeRemainders(remainders) {
var promises = [];
remainder.forEach(function (url) {
promises.push(requestPromise(url));
});
return Promise.all(promises).then(function (results) {
_.flattenDeep(results);
})
}
然后,您将用scrapeRemainders
替换承诺链中的第二个scrape
。另外,对于前面函数中的
,您需要npm安装lodash
,然后var=require('lodash')
。另一方面,lodash与承诺无关,但它是数据处理的一个伟大工具。有机会的时候你应该调查一下
此外,在LastScrapert1
中,您可以更改
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
到
它做同样的事情
希望这有帮助。如果这不能回答您的问题,请对我进行评论,我可以相应地更改我的答案。您的最后一步中似乎有一个问题。看起来您的意思是,余数
是另一个URL数组。如果我错了,请纠正我。但是,发生的情况是,如果($('[type=submit]').length!==0)
条件第一次失败,您将自动转到下一个块,因为余数
未定义。无论当前url是什么,都可以将该url分配给余数
。对于For循环的其余迭代,您将永远不会再次遇到rements==undefined
的情况。因此,如果您最终只将一个url分配给余数
,而您希望得到的任何url都将被忽略
您可能希望将余数定义为余数=[]代码>。然后,如果(余数==未定义)
,您只需说
} else {
remainder.push(obj.arrayOfUrls[i]);
}
但是,当scrape
只需要一个url时,您将向scrape
传递一个url数组。如果这是您想要的,并且我正确地假设您的意思是,余数
是一个URL数组,那么您可以定义一个新函数,如下所示:
function scrapeRemainders(remainders) {
var promises = [];
remainder.forEach(function (url) {
promises.push(requestPromise(url));
});
return Promise.all(promises).then(function (results) {
_.flattenDeep(results);
})
}
然后,您将用scrapeRemainders
替换承诺链中的第二个scrape
。另外,对于前面函数中的
,您需要npm安装lodash
,然后var=require('lodash')
。另一方面,lodash与承诺无关,但它是数据处理的一个伟大工具。有机会的时候你应该调查一下
此外,在LastScrapert1
中,您可以更改
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
到
它做同样的事情
希望这有帮助。如果这不能回答您的问题,请对我进行评论,我可以相应地更改我的答案。全部修复,它在scrape()
中抓取了错误的URL。虽然我只是在将状态代码记录到控制台后才知道这一点:
//TASK: Create a command line application that goes to an ecommerce site to get the latest prices.
//Save the scraped data in a spreadsheet (CSV format).
'use strict';
//Modules being used:
var cheerio = require('cheerio');
var json2csv = require('json2csv');
var request = require('request');
var moment = require('moment');
var fs = require('fs');
//harcoded url
var urlHome = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = [];
var tshirtArray = [];
const requestPromise = function(url) {
return new Promise(function(resolve, reject) {
request(url, function(error, response, html) {
if(error) {
errorHandler(error);
return reject(error);
}
if(!error && response.statusCode == 200){
return resolve(html);
}
if(response.statusCode !== 200){
console.log("response code is " + response.statusCode);
}
return resolve("");
});
});
}
// Go into webpage via url, load html and grab links shirt in url
function scrape (url) {
console.log("Currently scraping " + url)
return requestPromise(url)
.then(function(html) {
var $ = cheerio.load(html);
var links = [];
var URL = 'http://shirts4mike.com/';
//get all the links
$('a[href*=shirt]').each(function(){
var a = $(this).attr('href');
//add into link array
links.push(URL + a);
});
// return array of links
return links;
});
}
function nextStep (arrayOfLinks) {
var promiseArray = [];
console.log(arrayOfLinks);
for(var i = 0; i < arrayOfLinks.length; i++){
promiseArray.push(requestPromise(arrayOfLinks[i]));
}
//return both the html of pages and their urls
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return {arrayOfHtml: arrayOfHtml , arrayOfUrls: arrayOfLinks};
});
}
//go through the html of each url and add to urlSet if there is a checkout button
//add to remainder otherwise to rescrape
function lastStep (obj){
for(var i = 0; i < obj.arrayOfHtml.length; i++){
var $ = cheerio.load(obj.arrayOfHtml[i]);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.push(obj.arrayOfUrls[i]);
console.log(obj.arrayOfUrls[i]);
} else if(remainder == undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
var remainder = obj.arrayOfUrls[i];
console.log("The remainder is " + remainder)
}
}
//return remainder for second run-through of scrape
return remainder;
}
//iterate through urlSet (product pages and grab html)
function lastScraperPt1(){
//call lastScraper so we can grab data from the set (product pages)
//scrape set, product pages
var promiseArray = [];
for(var item of urlSet){
var url = item;
promiseArray.push(requestPromise(url));
}
return Promise.all(promiseArray)
.then(function(arrayOfHtml){
return arrayOfHtml;
});
}
//iterate over the html of the product pages and store data as objects
function lastScraperPt2(html){
for(var i = 0; i < html.length; i++){
var $ = cheerio.load(html[i]);
//grab data and store as variables
var price = $('.price').text();
var imgURL = $('.shirt-picture').find('img').attr('src');
var title = $('body').find('.shirt-details > h1').text().slice(4);
var tshirtObject = {};
//add values into tshirt object
tshirtObject.Title = title;
tshirtObject.Price = price;
tshirtObject.ImageURL = urlHome + imgURL;
tshirtObject.URL = urlSet[i];
tshirtObject.Date = moment().format('MMMM Do YYYY, h:mm:ss a');
//add the object into the array of tshirts
tshirtArray.push(tshirtObject);
}
return tshirtArray;
}
//conver tshirt objects and save as CSV file
function convertJson2Csv(tshirtArray){
//The scraper should generate a folder called `data` if it doesn’t exist.
var dir ='./data';
if(!fs.existsSync(dir)){
fs.mkdirSync(dir);
}
var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Date'];
//convert tshirt data into CSV and pass in fields
var csv = json2csv({ data: tshirtArray, fields: fields });
//Name of file will be the date
var fileDate = moment().format('MM-DD-YY');
var fileName = dir + '/' + fileDate + '.csv';
//Write file
fs.writeFile(fileName, csv, {overwrite: true}, function(err) {
console.log('file saved');
if (err) errorHandler(err);
});
}
scrape(urlHome) //scrape from original entry point
.then(nextStep)
.then(lastStep)
.then(scrape)
.then(nextStep)
.then(lastStep)
.then(lastScraperPt1)
.then(lastScraperPt2)
.then(convertJson2Csv)
.catch(function(err) {
// handle any error from any request here
console.log(err);
});
//If the site is down, an error message describing the issue should appear in the console.
//This is to be tested by disabling wifi on your device.
//When an error occurs log it to a file scraper-error.log . It should append to the bottom of the file with a time stamp and error
var errorHandler = function (error) {
console.log(error.message);
console.log('The scraper could not not scrape data from ' + url + ' there is either a problem with your internet connection or the site may be down');
/**
* create new date for log file
*/
var loggerDate = new Date();
/**
* create message as a variable
*/
var errLog = '[' + loggerDate + '] ' + error.message + '\n';
/**
*when the error occurs, log that to the error logger file
*/
fs.appendFile('scraper-error.log', errLog, function (err) {
if (err) throw err;
console.log('There was an error. The error was logged to scraper-error.log');
});
};
//任务:创建一个命令行应用程序,用于访问电子商务站点以获取最新价格。
//在电子表格(CSV格式)中保存刮取的数据。
"严格使用",;
//正在使用的模块:
var cheerio=需要('cheerio');
var json2csv=require('json2csv');
var请求=要求(“请求”);
var力矩=要求的(‘力矩’);
var fs=需要('fs');
//加密url
var urlH