Python: I want to follow the "next" button link through all pages to get all items

Tags: python, web-scraping, scrapy

I get an error from my rule: "unhashable type: 'list'". I have defined the rule to extract the next-button link:
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem


class MySpider(CrawlSpider):
    name = "my_spider"
    domain = ['Apparel']
    keyword = 'Bags'
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083",
                   "Baby": "5427", "Beauty": "1085666", "Books": "3920",
                   "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759",
                   "Health": "976760", "Home": "4044", "Home Improvement": "1072864",
                   "Jwelery": "3891", "Movies": "4096", "Music": "4104",
                   "Party": "2637", "Patio": "5428", "Pets": "5440",
                   "Pharmacy": "5431", "Photo Center": "5426", "Sports": "4125",
                   "Toys": "4171", "Video Games": "2636"}
    allowed_domains = ['walmart.com']
    denied_domains = ['reviews.walmart.com', 'facebook.com', 'twitter.com']
    rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" % (keyword, departments.get(domain))),),
                  restrict_xpaths=('//li[@class="btn-nextResults"]'),
                  callback='parse', follow=True),)

    def start_requests(self):
        for domains in self.domain:
            if domains in self.departments:
                url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domains))
                yield Request(url)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
        last = hxs.select('//a[@class="SPPagNoLink jump next"]').extract()
        if last is None:
            for link in links:
                href = link.extract()
                yield Request('http://www.walmart.com/' + href, self.parse_data)
        else:
            print "<<<<<Last Page>>>>>>"

    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        walmart = WalmartSampleItem()
        walmart['Title'] = hxs.select('//h1[@class="productTitle"]/text()').extract()
        walmart['Price'] = hxs.select('//span[@class="bigPriceText1"]/text()').extract() + hxs.select('//span[@class="smallPriceText1"]/text()').extract()
        walmart['Availability'] = hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
        walmart['Description'] = hxs.select('//span[@class="ql-details-short-desc"]/p/text()').extract()
        #walmart['Avg_Rating'] =
        #walmart['Detailed_Rating'] =
        items.append(walmart)
        return items
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 8, in <module>
class MySpider(CrawlSpider):
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 15, in MySpider
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
TypeError: unhashable type: 'list'
The problem is here:

departments.get(domain)

domain is a list, so you need to pick the single item from the list that you want. In this case, indexing with domain[0] fixes the problem, and your rule becomes:
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain[0]))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
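The error happens because list objects are unhashable and therefore cannot be used for dict key lookups. A minimal sketch of the failure, assuming nothing beyond a plain Python dict:

departments = {"All Departments": "0", "Apparel": "5438"}
domain = ['Apparel']

try:
    departments.get(domain)        # dict keys must be hashable; a list is not
except TypeError as e:
    print(e)                       # unhashable type: 'list'

print(departments.get(domain[0]))  # '5438' -- str is hashable, so the lookup works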
Comments:

- Can you paste your stack trace so we can see where the error comes from?
- I have added the traceback above. My main problem is following the "next" button link to iterate through the pages. Thanks for your solution, but it does not solve that problem.
- If my answer fixed the error, please update the question with your code changes and the details of the problem you are hitting now.
- Sorry, but I still have the problem mentioned in the comments above; even with your patch it doesn't help me. I just added an if statement and got past that error, but I am still stuck on following the next pages.
- If this is a CrawlSpider, you should avoid overriding the parse callback, because CrawlSpider uses it internally. Also, before extending the code, it is better to start simple: hard-code Apparel as the department and verify that your rule is correct (see the sketch below).
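Following that last comment, here is a sketch of how the spider could be restructured, assuming the same old scrapy.contrib API the question uses: the department is hard-coded to Apparel, the rule callback is renamed (parse_items is a hypothetical name) so that CrawlSpider's internal parse() is left alone, and parse_start_url is overridden so the first results page is processed too.

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class PagingSpider(CrawlSpider):
    name = "paging_spider"
    allowed_domains = ['walmart.com']
    keyword = 'Bags'
    department = '5438'  # hard-coded Apparel, as the comment suggests
    start_urls = ['http://www.walmart.com/search/search-ng.do?'
                  'search_query=%s&ic=16_0&Find=Find&search_constraint=%s'
                  % (keyword, department)]

    # The rule only follows the "next" button; the callback is NOT parse(),
    # because CrawlSpider uses parse() internally to apply the rules.
    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//li[@class="btn-nextResults"]',)),
                  callback='parse_items', follow=True),)

    def parse_start_url(self, response):
        # CrawlSpider hook: also extract products from the first page,
        # which the rule itself never visits.
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        for href in hxs.select('//a[@class="prodLink ListItemLink"]/@href').extract():
            yield Request('http://www.walmart.com' + href, callback=self.parse_data)

    def parse_data(self, response):
        # ...populate WalmartSampleItem fields here, as in the original parse_data...
        pass

With this layout the rule handles the pagination automatically: every page reached through the next button goes through parse_items, and follow=True keeps extracting the next link until there is none.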