Python Tripadvisor上杂乱无章的爬行评论：如何应用双递归规则？_Python_Web Scraping_Scrapy_Scrapy Spider

Python Tripadvisor上杂乱无章的爬行评论：如何应用双递归规则？

python web-scraping scrapy

Python Tripadvisor上杂乱无章的爬行评论：如何应用双递归规则？,python,web-scraping,scrapy,scrapy-spider,Python,Web Scraping,Scrapy,Scrapy Spider,这就是我的蜘蛛的样子： from scrapy.contrib.spiders import CrawlSpider, Rule from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor from scrapy import Selector from scrapy.http import Request from tripadvisor.items import TripadvisorItem,Tripadviso

这就是我的蜘蛛的样子：

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy import Selector
from scrapy.http import Request
from tripadvisor.items import TripadvisorItem,TripadvisorItem2
from tripadvisor.id_generator import id_maker
from scrapy import log

class TripadvisorSpider(CrawlSpider):
    name ="trippy5"
    allowed_domanis=["tripadvisor.com"] 
    start_urls=[
    'http://www.tripadvisor.com/Hotels-g187371-Cologne_North_Rhine_Westphalia-Hotels.html']

    rules = (Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext "]')),
            callback="parse_start_url",  follow= True),
           Rule(LxmlLinkExtractor(allow=(),restrict_xpaths=('//*[@class="guiArw sprite-pageNext  pid4181"]')),
            callback="parse_hotel",  follow= True),)


    def __init__(self, *a, **kw):
        super(TripadvisorSpider, self).__init__(*a, **kw)
        self._compile_rules()
        logfile = open('tripadvisor_log.log', 'w')
        log_observer = log.ScrapyFileLogObserver(logfile, level=log.DEBUG)
        log_observer.start()
    def parse_start_url(self, response):

        sel = Selector(response)
        titles = sel.xpath('//div[@class="metaLocationInfo"]')
        for title in titles:
            hotel_id=id_maker()
            item = TripadvisorItem()
            item['id_hotel']=hotel_id
            item ["name"] = (title.xpath('.//a[@class="property_title"]/text()').extract()[0]).strip()
            star=title.xpath('.//img[@class="sprite-ratings-gry"]/@alt').extract()

            if star:
                star=star[0]       
            star=star[0:3]                
            item ["stars"] =star 
        else:
            item ["stars"] =999

            overall_rating=title.xpath('.//img[@class="sprite-ratings"]/@alt').extract()
            if overall_rating:
                overall_rating=overall_rating[0]
                if overall_rating[1]=='.':
                    overall_rating=overall_rating[0:3]
                else:
                    overall_rating=overall_rating[0]

                item ["overall_rating"]=overall_rating
            else:
                item ["overall_rating"]=999


            link=title.xpath('.//a[@class="property_title"]/@href').extract()[0]            
            url = "http://tripadvisor.com{}".format(link)

            yield Request(url=url, meta={'item': item}, callback=self.parse_hotel)    


    def parse_hotel(self, response):
        items=[]
        item = response.meta['item']
        sel = Selector(response)

        review_boxes =sel.xpath('//*[@class="reviewSelector "]')        
        if review_boxes:            
            for box in review_boxes:
                if box.xpath('.//span[@class="noQuotes"]/text()').extract():
                    item2=TripadvisorItem2()
                    item2 ['id_hotel']=item['id_hotel']
                    item2 ["hotel_name"]=item ["name"]
                    item2 ["review_title"] = box.xpath('.//span[@class="noQuotes"]/text()').extract()[0]
                    item2 ["review_rate"] =box.xpath('.//div/div/span/img/@alt').extract()[0][0]                  
                    if box.xpath('.//div/div/span/@title').extract():                    
                        item2 ["review_date"] =box.xpath('.//div/div/span/@title').extract()[0]                    
                    else:                        
                        item2 ["review_date"] =(box.xpath('.//div/div/span[@class="ratingDate"]/text()').extract()[0]).replace('Reviewed ','')                  
                    contributor_name=box.xpath('.//div[@class="username mo"]/span/text()').extract()
                    if contributor_name:
                        item2 ["contributor_name"] =contributor_name[0]
                    location=box.xpath('.//div[@class="location"]/text()').extract()                   
                    if location:
                        if location==[u'\n']:
                            item2 ["contributor_location"]='N/A' 
                        else:
                            item2 ["contributor_location"]=(location[0]).strip()
                    overall_contributions=box.xpath('.//span[@class="badgeText"]/text()').extract()
                    if overall_contributions:
                        item2 ["overall_contributions"] =((overall_contributions[0]).replace(' review','')).replace('s','')
                    else:
                        item2 ["overall_contributions"] =999
                    item2 ["text"] =(box.xpath('.//div[@class="entry"]/p/text()').extract())[0].strip()
                    items.append(item2)

                    yield item2

第一条规则很成功，可以在列出酒店的所有页面中爬行

蜘蛛只能在每家酒店评论的第一页上爬行，不幸的是，它忽略了第二条规则，第二条规则应该允许它在所有评论的页面上递归爬行

我不认为这适用于这里，因为不同的回调和不同的XPath

我求救