Python: I want to follow the "next" button link through all pages to get all items

Tags: python, web-scraping, scrapy

I get an error from my rule: "unhashable type: 'list'". I have defined the rule to extract the next-button link:
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from walmart_sample.items import WalmartSampleItem


class MySpider(CrawlSpider):
    name = "my_spider"
    domain = ['Apparel']
    keyword = 'Bags'
    departments = {"All Departments": "0", "Apparel": "5438", "Auto": "91083",
                   "Baby": "5427", "Beauty": "1085666", "Books": "3920",
                   "Electronics": "3944", "Gifts": "1094765", "Grocery": "976759",
                   "Health": "976760", "Home": "4044", "Home Improvement": "1072864",
                   "Jwelery": "3891", "Movies": "4096", "Music": "4104",
                   "Party": "2637", "Patio": "5428", "Pets": "5440",
                   "Pharmacy": "5431", "Photo Center": "5426", "Sports": "4125",
                   "Toys": "4171", "Video Games": "2636"}
    allowed_domains = ['walmart.com']
    denied_domains = ['reviews.walmart.com', 'facebook.com', 'twitter.com']
    rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" % (keyword, departments.get(domain))),),
                  restrict_xpaths=('//li[@class="btn-nextResults"]'),
                  callback='parse', follow=True),)

    def start_requests(self):
        for domains in self.domain:
            if domains in self.departments:
                url = 'http://www.walmart.com/search/search-ng.do?search_query=%s&ic=16_0&Find=Find&search_constraint=%s' % (self.keyword, self.departments.get(domains))
                yield Request(url)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a[@class="prodLink ListItemLink"]/@href')
        last = hxs.select('//a[@class="SPPagNoLink jump next"]').extract()
        if last is None:
            for link in links:
                href = link.extract()
                yield Request('http://www.walmart.com/' + href, self.parse_data)
        else:
            print "<<<<<Last Page>>>>>>"

    def parse_data(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        walmart = WalmartSampleItem()
        walmart['Title'] = hxs.select('//h1[@class="productTitle"]/text()').extract()
        walmart['Price'] = hxs.select('//span[@class="bigPriceText1"]/text()').extract() + hxs.select('//span[@class="smallPriceText1"]/text()').extract()
        walmart['Availability'] = hxs.select('//span[@id="STORE_AVAIL"]/text()').extract()
        walmart['Description'] = hxs.select('//span[@class="ql-details-short-desc"]/p/text()').extract()
        #walmart['Avg_Rating'] =
        #walmart['Detailed_Rating'] =
        items.append(walmart)
        return items
Traceback (most recent call last):
File "/usr/bin/scrapy", line 4, in <module>
execute()
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
crawler = self.crawler_process.create_crawler()
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
self.crawlers[name] = Crawler(self.settings)
File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
self.spiders = spman_cls.from_crawler(self)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
sm = cls.from_settings(crawler.settings)
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
return cls(settings.getlist('SPIDER_MODULES'))
File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
for module in walk_modules(name):
File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
submod = import_module(fullpath)
File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
__import__(name)
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 8, in <module>
class MySpider(CrawlSpider):
File "/home/vivek/mywork/walmart_sample/walmart_sample/spiders/test.py", line 15, in MySpider
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
TypeError: unhashable type: 'list'
The problem is here:

departments.get(domain)

domain is a list, so you need to pick the single item from the list that you want. In this case, indexing with domain[0] fixes the problem, and your rule becomes:
rules = (Rule(SgmlLinkExtractor(allow=("http://www.walmart.com/search/search-ng.do?tab_value=all&search_query=%s&search_constraint=%s&Find=Find&pref_store=1801&ss=false&ic=16_\d*2&_mm=" %(keyword,departments.get(domain[0]))),),restrict_xpaths=('//li[@class="btn-nextResults"]'),callback='parse',follow=True),)
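The error happens because list objects are unhashable and therefore cannot be used for dict key lookups. A minimal sketch of the failure, assuming nothing beyond a plain Python dict:

departments = {"All Departments": "0", "Apparel": "5438"}
domain = ['Apparel']

try:
    departments.get(domain)        # dict keys must be hashable; a list is not
except TypeError as e:
    print(e)                       # unhashable type: 'list'

print(departments.get(domain[0]))  # '5438' -- str is hashable, so the lookup works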
Comments:

- Can you paste your stack trace so we can see where the error comes from?
- I have added the traceback above. My main problem is following the "next" button link to iterate through the pages. Thanks for your solution, but it does not solve that problem.
- If my answer fixed the error, please update the question with your code changes and the details of the problem you are hitting now.
- Sorry, but I still have the problem mentioned in the comments above; even with your patch it doesn't help me. I just added an if statement and got past that error, but I am still stuck on following the next pages.
- If this is a CrawlSpider, you should avoid overriding the parse callback, because CrawlSpider uses it internally. Also, before extending the code, it is better to start simple: hard-code Apparel as the department and verify that your rule is correct (see the sketch below).
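Following that last comment, here is a sketch of how the spider could be restructured, assuming the same old scrapy.contrib API the question uses: the department is hard-coded to Apparel, the rule callback is renamed (parse_items is a hypothetical name) so that CrawlSpider's internal parse() is left alone, and parse_start_url is overridden so the first results page is processed too.

from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class PagingSpider(CrawlSpider):
    name = "paging_spider"
    allowed_domains = ['walmart.com']
    keyword = 'Bags'
    department = '5438'  # hard-coded Apparel, as the comment suggests
    start_urls = ['http://www.walmart.com/search/search-ng.do?'
                  'search_query=%s&ic=16_0&Find=Find&search_constraint=%s'
                  % (keyword, department)]

    # The rule only follows the "next" button; the callback is NOT parse(),
    # because CrawlSpider uses parse() internally to apply the rules.
    rules = (Rule(SgmlLinkExtractor(restrict_xpaths=('//li[@class="btn-nextResults"]',)),
                  callback='parse_items', follow=True),)

    def parse_start_url(self, response):
        # CrawlSpider hook: also extract products from the first page,
        # which the rule itself never visits.
        return self.parse_items(response)

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        for href in hxs.select('//a[@class="prodLink ListItemLink"]/@href').extract():
            yield Request('http://www.walmart.com' + href, callback=self.parse_data)

    def parse_data(self, response):
        # ...populate WalmartSampleItem fields here, as in the original parse_data...
        pass

With this layout the rule handles the pagination automatically: every page reached through the next button goes through parse_items, and follow=True keeps extracting the next link until there is none.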