Javascript CasperJS循环或迭代多个网页?
我有一个CasperJS脚本,可以从一个网页上抓取收视率和日期。现在我想从同一个网站下的多个页面中获取相同的数据。在给定以下代码的情况下,如何循环使用不同的子页面:Javascript CasperJS循环或迭代多个网页?,javascript,loops,foreach,web-scraping,casperjs,Javascript,Loops,Foreach,Web Scraping,Casperjs,我有一个CasperJS脚本,可以从一个网页上抓取收视率和日期。现在我想从同一个网站下的多个页面中获取相同的数据。在给定以下代码的情况下,如何循环使用不同的子页面: var ratings = []; var dates = []; var casper = require('casper').create({ pageSettings: { loadImages: false, loadPlugins: false
// Script-level result holders; they are reassigned from the page-context helpers in a later step.
var ratings = [];
var dates = [];
// CasperJS instance that drives every navigation step of this script.
var casper = require('casper').create({
pageSettings: {
// Image and plugin loading are switched off; the script only reads attributes and markup.
loadImages: false,
loadPlugins: false
},
// Debug-level, verbose logging.
logLevel: "debug",
verbose: true
});
// File-system module, used at the end of the run to write the collected lines to disk.
var fs = require('fs');
/**
 * Collects the `title` attribute of every `img` matched by the rating selector.
 * Reads the global `document`, so it is meant to be passed to `evaluate()`.
 *
 * @returns {Array} one `title` value per matched element, in match order.
 */
function getRatings() {
    var images = document.querySelectorAll('#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img');
    var titles = [];
    for (var idx = 0; idx < images.length; idx++) {
        titles.push(images[idx].getAttribute('title'));
    }
    return titles;
}
/**
 * Collects the `innerHTML` of every element matched by the review-date selector.
 * Reads the global `document`, so it is meant to be passed to `evaluate()`.
 *
 * @returns {Array} one `innerHTML` value per matched element, in match order.
 */
function getDate() {
    var dateNodes = document.querySelectorAll('#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate');
    var markup = [];
    for (var idx = 0; idx < dateNodes.length; idx++) {
        markup.push(dateNodes[idx].innerHTML);
    }
    return markup;
}
// Open the first reviews page; the callback only prints a marker.
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm', function(){
this.echo('hi');
});
// Run both extraction helpers through evaluate() and keep the results in the script-level arrays.
casper.then(function() {
ratings = this.evaluate(getRatings);
dates = this.evaluate(getDate);
this.echo(ratings);
});
// Start the queued steps; the callback passed to run() post-processes and saves the results.
casper.run(function() {
this.echo(ratings.length + ' ratings found:');
// Merge each rating with the date at the same index ("<rating>: <date>") and blank the used date slot.
for(var i=0; i<ratings.length; i++){
ratings[i] = ratings[i]+': '+dates[i];
dates[i] = '';
}
this.echo(ratings);
// One merged entry per line.
var content = ratings;
content = content.join("\n");
// Written with mode 'w'.
fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", content, 'w');
// The loop above blanks entries rather than removing them, so the length still counts the dates.
this.echo(dates.length + ' dates found:').exit();
});
var评级=[];
var日期=[];
var casper=require('casper')。创建({
页面设置:{
loadImages:false,
loadPlugins:false
},
日志级别:“调试”,
详细:正确
});
var fs=需要('fs');
函数getRatings(){
var评级=document.queryselectoral(“#bvrratingoverall_Review_Display>div.bvrratingnormalimage>img”);
返回数组.原型.映射.调用(评级,函数(e){
返回e.getAttribute('title');
});
}
函数getDate(){
变量日期=document.querySelectorAll(“#BVSubmissionPopupContainer>div.bvrreviewDisplayStyle5Header>div.bvrreviewDateContainer>span.BVRRValue.bvrreviewDate”);
返回Array.prototype.map.call(日期,函数(e){
返回e.innerHTML;
});
}
卡斯珀,开始http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',函数(){
这个.echo('hi');
});
casper.then(函数(){
评级=此。评估(getRatings);
dates=这个。评估(getDate);
这是echo(评级);
});
casper.run(函数(){
this.echo(ratings.length+“ratings found:”);
对于(var i=0;i此代码可以帮助您:
在对象数组中定义所需的URL、每个页面的选择器,并在循环中使用这些属性执行所需操作
您也可以在循环中使用click方法而不是url
// Pages to visit. Each entry carries the page URL plus the selectors used to
// read the rating (`selectorRatings`) and the review date (`selectorDate`).
var navigation = [
    {
        url: 'http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    },
    {
        url: 'yourSecondUrl, etc...',
        selectorRatings: '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img',
        selectorDate: '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate'
    }
];

// Text accumulated while the pages are processed.
var content = "";
// Visit every entry of `navigation` in array order and collect, for each page,
// the rating title and the review-date markup into `content`.
casper.start()
    .then(function () {
        // Queue one thenOpen()/waitForUrl() pair per entry.
        navigation.forEach(function (navIndex) {
            // Open the page given by the entry's `url` property.
            casper.thenOpen(navIndex.url)
                // Probably redundant, since thenOpen() already loads the page; kept as a guard.
                .waitForUrl(navIndex.url, function () {
                    // `title` attribute of the element matched by the ratings selector.
                    var ratings = this.getElementAttribute(navIndex.selectorRatings, 'title');
                    // HTML of the element matched by the date selector. The entries define
                    // `selectorDate`; reading `selectorDates` passed undefined to getHTML().
                    var dates = this.getHTML(navIndex.selectorDate);
                    this.echo(ratings);
                    this.echo(dates);
                    content = content + ' ' + ratings + ' ' + dates;
                });
        });
    })
    .run(function () {
        this.echo('----------- All steps done ------------\n');
        this.exit();
    });
由于存在“下一页”按钮,您可以使用它递归遍历所有页面:
/**
 * Scrapes the ratings and dates of the current page, writes them to the output
 * file in mode 'a', and schedules itself again when a "next page" link is visible.
 *
 * Relies on names defined outside this function: `ratings`, `dates`, `casper`,
 * `fs`, `getRatings` and `getDate`. `ratings` and `dates` are overwritten.
 */
function getRatingsAndWrite() {
    var nextPageSelector = ".BVRRPageLink.BVRRNextPage > a";
    var row = 0;

    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings);
    casper.echo(ratings.length + ' ratings found:');

    // Merge each rating with the date at the same index and blank the used date slot.
    while (row < ratings.length) {
        ratings[row] = ratings[row] + ': ' + dates[row];
        dates[row] = '';
        row += 1;
    }
    casper.echo(ratings);

    // One merged entry per line.
    fs.write("C:/Users/Karan/Copy/tweesis/implementation/scraping/samsungratings.txt", ratings.join("\n"), 'a');
    casper.echo(dates.length + ' dates found:');

    // No visible "next page" link: this was the last page.
    if (!casper.visible(nextPageSelector)) {
        casper.echo("END");
        return;
    }

    // Otherwise click through and queue another scrape step.
    casper.thenClick(nextPageSelector);
    casper.then(getRatingsAndWrite);
}
// Open the first reviews page, queue the first scrape step, then start the run.
// Further scrape steps are queued by getRatingsAndWrite itself.
casper.start('http://www.t-mobile.com/cell-phones/samsung-galaxy-s-5.html?bvrrp=9060/reviews/product/1/598aea53-16d0-4c12-b53a-105157092c52.htm');
casper.then(getRatingsAndWrite);
casper.run();
函数getRatingsAndWrite(){
评级=casper.evaluate(getRatings);
日期=casper.evaluate(getDate);
casper.echo(评级);
casper.echo(ratings.length+'ratings found:');
对于(var i=0;i
感谢Fanch和Artjom B。你们的两个答案都提供了有效的解决方案。我采用了Artjom B给出的做法,即通过分页的“下一页”按钮进行递归遍历。在此基础上,我添加了一个wait()调用,以确保在抓取下一页之前,下一页的评级已经加载完毕。如果没有这个wait()调用,在单击“下一页”到相应页面加载完成的这段时间里,同一页会被重复抓取多次。请参阅下面可以正常工作的代码:
// Script-level result holders; getRatingsAndWrite reassigns them on every page.
var ratings = [];
var dates = [];
// CasperJS instance that drives every navigation step of this script.
var casper = require('casper').create({
pageSettings: {
// Image and plugin loading are switched off; the script only reads attributes and markup.
loadImages: false,
loadPlugins: false
},
// Debug-level, verbose logging.
logLevel: "debug",
verbose: true
});
// File-system module, used by getRatingsAndWrite to write the collected lines to disk.
var fs = require('fs');
/**
 * Collects the `title` attribute of every `img` matched by the rating selector.
 * Reads the global `document`, so it is meant to be passed to `evaluate()`.
 *
 * @returns {Array} one `title` value per matched element, in match order.
 */
function getRatings() {
    var selector = '#BVRRRatingOverall_Review_Display > div.BVRRRatingNormalImage > img';
    var collected = [];
    Array.prototype.forEach.call(document.querySelectorAll(selector), function (img) {
        collected.push(img.getAttribute('title'));
    });
    return collected;
}
/**
 * Collects the `innerHTML` of every element matched by the review-date selector.
 * Reads the global `document`, so it is meant to be passed to `evaluate()`.
 *
 * @returns {Array} one `innerHTML` value per matched element, in match order.
 */
function getDate() {
    var selector = '#BVSubmissionPopupContainer > div.BVRRReviewDisplayStyle5Header > div.BVRRReviewDateContainer > span.BVRRValue.BVRRReviewDate';
    var collected = [];
    Array.prototype.forEach.call(document.querySelectorAll(selector), function (node) {
        collected.push(node.innerHTML);
    });
    return collected;
}
/**
 * Scrapes the ratings and dates of the current page, writes them to the output
 * file in mode 'a', and, when a "next page" link is visible, clicks it, waits
 * and schedules itself again.
 *
 * Relies on names defined outside this function: `ratings`, `dates`, `casper`,
 * `fs`, `getRatings` and `getDate`. `ratings` and `dates` are overwritten.
 */
function getRatingsAndWrite() {
    var nextPageSelector = ".BVRRPageLink.BVRRNextPage > a";
    var row = 0;

    ratings = casper.evaluate(getRatings);
    dates = casper.evaluate(getDate);
    casper.echo(ratings.length + ' ratings found:');

    // Keep only the first character of each rating title, pair it with the date
    // at the same index, and blank the used date slot.
    while (row < ratings.length) {
        ratings[row] = ratings[row].substr(0, 1) + ': ' + dates[row];
        dates[row] = '';
        row += 1;
    }

    // One merged entry per line.
    fs.write("<filepath to write content>", ratings.join("\n"), 'a');
    casper.echo(dates.length + ' dates found:');

    // No visible "next page" link: this was the last page.
    if (!casper.visible(nextPageSelector)) {
        casper.echo("END");
        return;
    }

    // Click through, wait 3000 ms before scraping again, then queue the next scrape.
    casper.thenClick(nextPageSelector);
    casper.wait(3000);
    casper.then(getRatingsAndWrite);
}
// Open the product page, queue the first scrape step, then start the run.
// Further scrape steps are queued by getRatingsAndWrite itself.
casper.start('http://www.t-mobile.com/cell-phones/htc-one-m8.html');
casper.then(getRatingsAndWrite);
casper.run();
var评级=[];
var日期=[];
var casper=require('casper')。创建({
页面设置:{
loadImages:false,
loadPlugins:false
},
日志级别:“调试”,
详细:正确
});
var fs=需要('fs');
函数getRatings(){
var评级=document.queryselectoral(“#bvrratingoverall_Review_Display>div.bvrratingnormalimage>img”);
返回数组.原型.映射.调用(评级,函数(e){
返回e.getAttribute('title');
});
}
函数getDate(){
变量日期=document.querySelectorAll(“#BVSubmissionPopupContainer>div.bvrreviewDisplayStyle5Header>div.bvrreviewDateContainer>span.BVRRValue.bvrreviewDate”);
返回Array.prototype.map.call(日期,函数(e){
返回e.innerHTML;
});
}
函数getRatingsAndWrite(){
评级=casper.evaluate(getRatings);
日期=casper.evaluate(getDate);
casper.echo(ratings.length+'ratings found:');
对于(var i=0;iHi,是的,我说的是分页评级。接下来的页面可以通过:/598aea53-16d0-4c12-b53a-105157092c52.html访问。问题是页面比页面链接多,因此需要递归执行此操作。是的,我在发布之前没有查看他的url,但他仍然可以组合单击()+等待()。使用计数器。是的,这是可能的,但您需要至少嵌套一次步骤,因为您无法在不知道页数的情况下安排单击+等待。不幸的是,这对我不起作用。我的等待超时已过期5000毫秒,正在退出。
我尝试将其更改为20秒,但仍然出现相同的错误。