Javascript 我怎样才能用承诺来重写这个?
我正在为tshirt网站构建内容刮板 目标是只通过一个硬编码url进入网站:Javascript 我怎样才能用承诺来重写这个?,javascript,node.js,web-scraping,promise,Javascript,Node.js,Web Scraping,Promise,我正在为tshirt网站构建内容刮板 目标是只通过一个硬编码url进入网站:http://shirts4mike.com 然后我会找到每件T恤的所有产品页面,然后创建一个包含详细信息的对象。然后将其添加到数组中 当阵列充满T恤时,我将遍历阵列并将其记录到CSV文件中 现在,我在请求/响应和函数调用的计时方面遇到了一些问题 如何确保在正确的时间调用下一个函数?我知道它不工作是因为它是异步的 如何在正确的时间调用secondScrape、lastspraper和convertJson2Csv,使它们
http://shirts4mike.com
然后我会找到每件T恤的所有产品页面,然后创建一个包含详细信息的对象。然后将其添加到数组中
当阵列充满T恤时,我将遍历阵列并将其记录到CSV文件中
现在,我在请求/响应和函数调用的计时方面遇到了一些问题
如何确保在正确的时间调用下一个函数?我知道它不工作是因为它是异步的
如何在正确的时间调用secondScrape
、lastScraper
和convertJson2Csv
,使它们处理的变量不会是未定义的（undefined）
我试图使用response.end()
之类的方法,但这不起作用
我想我需要用承诺来让它正常工作?而且要清晰易读
有什么想法吗?我的代码如下:
//Modules being used:
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
// First non-product link encountered; used as the entry point for secondScrape().
var remainder;
// NOTE(review): declared but never initialized to [] — lastScraper() calls
// tshirtArray.push(), which will throw because tshirtArray is undefined.
var tshirtArray;
// Load front page of shirts4mike
request(url, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//iterate over links with 'shirt'
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link (hrefs on this site are relative, hence the base-url prefix)
var scrapeLink = url + a;
//for each new link, go in and find out if there is a submit button.
//If there, add it to the set
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200) {
var $ = cheerio.load(html);
//if page has a submit it must be a product page
if($('[type=submit]').length !== 0){
//add page to set
urlSet.add(scrapeLink);
} else if(remainder === undefined) {
//if not a product page, add it to remainder so it another scrape can be performed.
remainder = scrapeLink;
}
}
// NOTE(review): errors and non-200 responses are silently ignored here.
});
});
}
// BUG(review): this runs as soon as the *outer* page is parsed — the nested
// request() callbacks above have not fired yet, so urlSet is still empty and
// `remainder` is still undefined when secondScrape() executes. This is the
// async-timing problem the surrounding question is about.
//call second scrape for remainder
secondScrape();
});
// Second pass: scrape the single non-product page recorded in `remainder`
// for any product links missed on the front page.
// NOTE(review): because of the race above, `remainder` may still be
// undefined at the time this is invoked.
function secondScrape() {
request(remainder, function(error, response, html) {
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
$("a[href*=shirt]").each(function(){
var a = $(this).attr('href');
//create new link
var scrapeLink = url + a;
request(scrapeLink, function(error,response, html){
if(!error && response.statusCode == 200){
var $ = cheerio.load(html);
//collect remaining product pages and add to set
if($('[type=submit]').length !== 0){
urlSet.add(scrapeLink);
}
}
});
});
}
});
// BUG(review): both lines below execute before the request above (and its
// nested requests) complete, so the set is logged and consumed before it is
// fully populated.
console.log(urlSet);
//call lastScraper so we can grab data from the set (product pages)
lastScraper();
};
// Visit each collected product page, extract the shirt details, and append
// them to tshirtArray. (The CSV conversion is still triggered at the end.)
function lastScraper() {
    // Defensive: tshirtArray is declared at the top of the file without an
    // initializer, so .push() on it would throw on undefined.
    if (tshirtArray === undefined) tshirtArray = [];
    // A Set has no .length and cannot be indexed with [i] — the original
    // `for (var i = 0; i < urlSet.length; i++)` never executed because
    // urlSet.length is undefined. Iterate the Set directly; the block-scoped
    // constant also gives every request callback its own url (a shared `var`
    // would be overwritten before the callbacks run).
    for (const productUrl of urlSet) {
        request(productUrl, function (error, response, html) {
            if (!error && response.statusCode == 200) {
                var $ = cheerio.load(html);
                // Grab the product data and store it with a timestamp.
                tshirtArray.push({
                    price: $('.price').text(),
                    img: $('.shirt-picture').find("img").attr("src"),
                    title: $('body').find(".shirt-details > h1").text().slice(4),
                    url: productUrl,
                    date: moment().format('MMMM Do YYYY, h:mm:ss a')
                });
            }
        });
    }
    // NOTE(review): this still fires before the asynchronous requests above
    // complete — the timing issue the question asks about. Kept here to
    // preserve the original call order.
    convertJson2Csv();
};
//正在使用的模块:
var cheerio=需要('cheerio');
var请求=要求(“请求”);
var力矩=要求的(‘力矩’);
//硬编码url
var url='1〕http://shirts4mike.com/';
//tshirt页面的url
var urlSet=new Set();
var余数;
塔尔雷;
//加载shirts4mike的首页
请求(url、函数(错误、响应、html){
如果(!error&&response.statusCode==200){
var$=cheerio.load(html);
//使用“shirt”迭代链接
$(“a[href*=shirt]”。每个(函数(){
var a=$(this.attr('href');
//创建新链接
var scrapeLink=url+a;
//对于每个新链接,进入并查看是否有提交按钮。
//如果有,将其添加到集合中
请求(链接、函数(错误、响应、html){
如果(!error&&response.statusCode==200){
var$=cheerio.load(html);
//如果页面有提交,则必须是产品页面
如果($('[type=submit]')。长度!==0){
//添加要设置的页面
添加(scrapeLink);
}else if(余数===未定义){
//如果不是产品页面,则将其添加到余数中,以便可以执行另一次刮取。
余数=刮链;
}
}
});
});
}
//调用第二次刮取剩余部分
第二次刮伤();
});
函数secondScrape(){
请求(余数、函数(错误、响应、html){
如果(!error&&response.statusCode==200){
var$=cheerio.load(html);
$(“a[href*=shirt]”。每个(函数(){
var a=$(this.attr('href');
//创建新链接
var scrapeLink=url+a;
请求(链接、函数(错误、响应、html){
如果(!error&&response.statusCode==200){
var$=cheerio.load(html);
//收集剩余的产品页面并添加到集合
如果($('[type=submit]')。长度!==0){
添加(scrapeLink);
}
}
});
});
}
});
console.log(urlSet);
//调用lastScraper,以便我们可以从集合(产品页面)中获取数据
最后的刮刀();
};
函数({
//刮集,产品页
对于(var i=0;ih1”).text().slice(4);
var tshirtObject={};
//将值添加到tshirt对象中
tshirtObject.price=价格;
tshirtObject.img=img;
tshirtObject.title=标题;
tshirtObject.url=url;
tshirtObject.date=moment().format('MMMM-Do-YYYY,h:mm:ss-a');
//将该对象添加到T恤阵列中
推(齐尔托标);
}
});
}
//调用函数遍历数组中的tshirt对象,以便转换为JSON,然后转换为CSV进行记录
convertJson2Csv();
};
有一个名为 request-promise 的 npm 模块
简单地说:
var rp = require("request-promise");
无论你在哪里使用 request 发起请求，都可以切换为使用 request-promise
例如:
// request-promise returns a promise for the response body, so the usual
// then/catch chain replaces the callback.
rp(url)
  .then((html) => {
    //do whatever
  })
  .catch((err) => {
    console.log(err);
  });
您可以使用模块化的方法,它可以为您提供解决此问题的平滑方法
我只是试着用这个模块编写代码
希望这对你有用
瀑布的格式
async.waterfall([
  // Each step passes its result to the next via callback(err, value).
  function (callback) {
    callback(null, previousvalue);
  },
  // The next step receives the previous value plus its own callback.
  function (previousvalue, callback) {}
], function (err, result) {
  // Final callback: runs with the last step's result, or the first error.
});
var async=require('async');
var cheerio=需要('cheerio');
var请求=要求(“请求”);
var力矩=要求的(‘力矩’);
//硬编码url
var url='1〕http://shirts4mike.com/';
//tshirt页面的url
var urlSet=new Set();
var余数;
var tshirtaray=[];
异步瀑布([
函数(回调){
//加载shirts4mike的首页
请求(url、函数(错误、响应、html){
如果(!error&&response.statusCode==200){
var$=cheerio.load(html);
//使用“shirt”迭代链接
$(“a[href*=shirt]”。每个(函数(){
var a=$(this.attr('href');
//创建新链接
var scrapeLink=url+a;
//对于每个新链接,进入并查看是否有提交按钮。
//如果有,将其添加到集合中
请求(链接、函数(错误、响应、html){
如果(!error&&response.statusCode==20
var async = require('async');
var cheerio = require('cheerio');
var request = require('request');
var moment = require('moment');
//hardcoded url
var url = 'http://shirts4mike.com/';
//url for tshirt pages
var urlSet = new Set();
var remainder;
var tshirtArray = [];

// Fetch `pageUrl`, follow every "a[href*=shirt]" link on it, hand each fetched
// child page to classify($, link), and invoke done(err) EXACTLY ONCE after all
// child requests settle. (The original called the waterfall callback once per
// child request — async.waterfall treats a second invocation as an error —
// and also contained a fatal typo: callback(nul, true).)
function scrapeShirtLinks(pageUrl, classify, done) {
    request(pageUrl, function (error, response, html) {
        if (error || response.statusCode !== 200) {
            // Propagate failure so the final waterfall callback fires.
            return done(error || new Error('Bad status for ' + pageUrl));
        }
        var $ = cheerio.load(html);
        var links = $("a[href*=shirt]").map(function () {
            //hrefs are site-relative, so prefix the base url
            return url + $(this).attr('href');
        }).get();
        var pending = links.length;
        if (pending === 0) return done(null);
        links.forEach(function (scrapeLink) {
            request(scrapeLink, function (error, response, html) {
                if (!error && response.statusCode == 200) {
                    classify(cheerio.load(html), scrapeLink);
                }
                // Only the LAST child request to finish calls done().
                if (--pending === 0) done(null);
            });
        });
    });
}

async.waterfall([
    // Step 1: scrape the front page. Pages with a submit button are product
    // pages and go into urlSet; the first non-product page becomes `remainder`.
    function (callback) {
        scrapeShirtLinks(url, function ($, scrapeLink) {
            if ($('[type=submit]').length !== 0) {
                urlSet.add(scrapeLink);
            } else if (remainder === undefined) {
                //if not a product page, remember it so another scrape can be performed.
                remainder = scrapeLink;
            }
        }, function (err) {
            callback(err, true);
        });
    },
    // Step 2: scrape the remainder page for product links missed in step 1.
    function (previousvalue, callback) {
        if (remainder === undefined) return callback(null, true);
        scrapeShirtLinks(remainder, function ($, scrapeLink) {
            //collect remaining product pages and add to set
            if ($('[type=submit]').length !== 0) {
                urlSet.add(scrapeLink);
            }
        }, function (err) {
            console.log(urlSet);
            callback(err, true);
        });
    },
    // Step 3: visit every collected product page and build tshirt objects.
    function (previousvalue, callback) {
        // A Set has no .length and cannot be indexed with [i] (the original
        // loop never ran); materialize it as an array instead.
        var pages = Array.from(urlSet);
        var pending = pages.length;
        if (pending === 0) return callback(null, tshirtArray);
        pages.forEach(function (pageUrl) {
            request(pageUrl, function (error, response, html) {
                if (!error && response.statusCode == 200) {
                    var $ = cheerio.load(html);
                    //grab data and store as variables
                    tshirtArray.push({
                        price: $('.price').text(),
                        img: $('.shirt-picture').find("img").attr("src"),
                        title: $('body').find(".shirt-details > h1").text().slice(4),
                        url: pageUrl,
                        date: moment().format('MMMM Do YYYY, h:mm:ss a')
                    });
                }
                // The original never called callback in this step, so the
                // final waterfall callback could fire before — or never
                // after — the data was collected.
                if (--pending === 0) callback(null, tshirtArray);
            });
        });
    }
], function (err, result) {
    if (err) return console.error(err);
    //call function to iterate through tshirt objects in array in order to convert to JSON, then into CSV to be logged
    convertJson2Csv();
});
// Wrap the callback-style request in a Promise. Declared with `var` — the
// original assigned to `promise` without a declaration, leaking an implicit
// global (a ReferenceError in strict mode / ES modules).
var promise = new Promise((resolve, reject) => {
    request("http://shirts4mike.com/", (err, response, html) => {
        // Check err FIRST: when the request itself fails, `response` is
        // undefined and reading response.statusCode would throw, as the
        // original expression did.
        if (err) return reject(err);
        if (response.statusCode !== 200) {
            // Reject with a real Error (err is null on a non-200 response).
            return reject(new Error('Unexpected status code: ' + response.statusCode));
        }
        resolve(html);
    });
});
promise.then(html => {
    var $ = cheerio.load(html);
    // continue
}).catch(err => console.log(err));
//Modules being used:
var Promise = require('path/to/bluebird');
var cheerio = require('cheerio');
var moment = require('moment');
// Promisify `request`'s methods with multiArgs so the promise resolves with
// BOTH callback values (response, html). Without multiArgs, bluebird resolves
// with only the first value, leaving `html` undefined in the handler.
// Ref: http://stackoverflow.com/questions/28308131/how-do-you-properly-promisify-request
var request = require('request');
Promise.promisifyAll(request, { multiArgs: true });
//hardcoded url
var url = 'http://shirts4mike.com/';
var urlSet = new Set();
var tshirtArray = [];
var maxLevels = 3; // limit the recursion to this number of levels.

/**
 * Recursively scrape url_, pushing product-page details into tshirtArray.
 * Returns a promise that resolves when this page and every shirt link
 * reachable from it (within maxLevels) has been processed.
 */
function scrapePage(url_, levelCounter) {
    // Bail out if:
    // a) the target url_ has been visited already,
    // b) maxLevels has been reached.
    if (urlSet.has(url_) || levelCounter >= maxLevels) {
        return Promise.resolve();
    }
    urlSet.add(url_);
    // .spread() unpacks the [response, html] pair produced by multiArgs.
    return request.getAsync(url_).spread(function (response, html) {
        if (response.statusCode !== 200) {
            throw new Error('statusCode was not 200'); // will be caught below
        }
        var $ = cheerio.load(html);
        if ($('[type=submit]').length > 0) {
            // yay, it's a product page.
            tshirtArray.push({
                price: $('.price').text(),
                img: $('.shirt-picture').find("img").attr("src"),
                title: $('body').find(".shirt-details > h1").text().slice(4),
                url: url_,
                date: moment().format('MMMM Do YYYY, h:mm:ss a')
            });
        }
        // Visit every shirt link on this page. Cheerio's .map callback is
        // (index, element) — the original passed (link) and read link.href,
        // which is undefined on a cheerio node. hrefs are site-relative, so
        // resolve them against the base url as the other snippets do.
        return Promise.all($("a[href*=shirt]").map(function (i, el) {
            return scrapePage(url + $(el).attr('href'), levelCounter + 1);
        }).get());
    }).catch(function (e) {
        // ensure "success" even if scraping threw an error, so one bad page
        // doesn't abort the whole crawl.
        console.log(e);
        return null;
    });
}
scrapePage(url, 0).then(convertJson2Csv);