Python Tripadvisor上杂乱无章的爬行评论:如何应用双递归规则?
这就是我的蜘蛛的样子:Python Tripadvisor上杂乱无章的爬行评论:如何应用双递归规则?,python,web-scraping,scrapy,scrapy-spider,Python,Web Scraping,Scrapy,Scrapy Spider,这就是我的蜘蛛的样子: from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor from scrapy import Selector from scrapy.http import Request from tripadvisor.items import TripadvisorItem,Tripadviso
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy import Selector
from scrapy.http import Request
from tripadvisor.items import TripadvisorItem,TripadvisorItem2
from tripadvisor.id_generator import id_maker
from scrapy import log
class TripadvisorSpider(CrawlSpider):
name ="trippy5"
allowed_domanis=["tripadvisor.com"]
start_urls=[
'http://www.tripadvisor.com/Hotels-g187371-Cologne_North_Rhine_Westphalia-Hotels.html']
rules = (Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext "]')),
callback="parse_start_url", follow= True),
Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext pid4181"]')),
callback="parse_hotel", follow= True),)
def __init__(self, *a, **kw):
super(TripadvisorSpider, self).__init__(*a, **kw)
self._compile_rules()
logfile = open('tripadvisor_log.log', 'w')
log_observer = log.ScrapyFileLogObserver(logfile, level=log.DEBUG)
log_observer.start()
def parse_start_url(self, response):
sel = Selector(response)
titles = sel.xpath('//div[@class="metaLocationInfo"]')
for title in titles:
hotel_id=id_maker()
item = TripadvisorItem()
item['id_hotel']=hotel_id
item ["name"] = (title.xpath('.//a[@class="property_title"]/text()').extract()[0]).strip()
star=title.xpath('.//img[@class="sprite-ratings-gry"]/@alt').extract()
if star:
star=star[0]
star=star[0:3]
item ["stars"] =star
else:
item ["stars"] =999
overall_rating=title.xpath('.//img[@class="sprite-ratings"]/@alt').extract()
if overall_rating:
overall_rating=overall_rating[0]
if overall_rating[1]=='.':
overall_rating=overall_rating[0:3]
else:
overall_rating=overall_rating[0]
item ["overall_rating"]=overall_rating
else:
item ["overall_rating"]=999
link=title.xpath('.//a[@class="property_title"]/@href').extract()[0]
url = "http://tripadvisor.com{}".format(link)
yield Request(url=url, meta={'item': item}, callback=self.parse_hotel)
def parse_hotel(self, response):
items=[]
item = response.meta['item']
sel = Selector(response)
review_boxes =sel.xpath('//*[@class="reviewSelector "]')
if review_boxes:
for box in review_boxes:
if box.xpath('.//span[@class="noQuotes"]/text()').extract():
item2=TripadvisorItem2()
item2 ['id_hotel']=item['id_hotel']
item2 ["hotel_name"]=item ["name"]
item2 ["review_title"] = box.xpath('.//span[@class="noQuotes"]/text()').extract()[0]
item2 ["review_rate"] =box.xpath('.//div/div/span/img/@alt').extract()[0][0]
if box.xpath('.//div/div/span/@title').extract():
item2 ["review_date"] =box.xpath('.//div/div/span/@title').extract()[0]
else:
item2 ["review_date"] =(box.xpath('.//div/div/span[@class="ratingDate"]/text()').extract()[0]).replace('Reviewed ','')
contributor_name=box.xpath('.//div[@class="username mo"]/span/text()').extract()
if contributor_name:
item2 ["contributor_name"] =contributor_name[0]
location=box.xpath('.//div[@class="location"]/text()').extract()
if location:
if location==[u'\n']:
item2 ["contributor_location"]='N/A'
else:
item2 ["contributor_location"]=(location[0]).strip()
overall_contributions=box.xpath('.//span[@class="badgeText"]/text()').extract()
if overall_contributions:
item2 ["overall_contributions"] =((overall_contributions[0]).replace(' review','')).replace('s','')
else:
item2 ["overall_contributions"] =999
item2 ["text"] =(box.xpath('.//div[@class="entry"]/p/text()').extract())[0].strip()
items.append(item2)
yield item2
第一条规则很成功,可以在列出酒店的所有页面中爬行
蜘蛛只能在每家酒店评论的第一页上爬行,不幸的是,它忽略了第二条规则,第二条规则应该允许它在所有评论的页面上递归爬行
我不认为这适用于这里,因为不同的回调和不同的XPath
我求救