Python 使用scrapy时出错
标签:python, web-scraping, scrapy, scrapy-spider
我用 Python 编写了以下代码:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from site_auto_1.items import AutoItem
class AutoSpider(CrawlSpider):
    """Crawl spider that follows search-result links on autowereld.nl.

    Crawling starts from the single search-results URL in ``start_urls``.
    Links selected by ``rules`` are followed and every followed page is
    handed to :meth:`parse_item`.
    """

    name = "auto"
    # NOTE(review): Scrapy's documented attribute for restricting the crawl
    # to certain domains is ``allowed_domains``; ``allowed_host`` is
    # presumably ignored by the framework -- confirm and rename if intended.
    allowed_host = ["autowereld.nl"]
    url = "http://www.autowereld.nl/"
    start_urls = [
        "http://www.autowereld.nl/zoeken.html?mrk=187&mdl%5B%5D=463&prvan=500&prtot=3000&brstf%5B%5D=2&bjvan=2000&bjtot=2004&geoloc=&strl=&trns%5B%5D=&kmvan=&kmtot=&klr%5B%5D=&q=",
    ]
    path = '//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'
    rules = (
        Rule(
            # NOTE: this XPath ends in ``/@href``, so it selects attribute
            # values (strings) instead of elements.  That is what raises
            # "'str' object has no attribute 'iter'" in the traceback shown
            # after this snippet; ``restrict_xpaths`` has to point at
            # elements, i.e. the same expression without the trailing
            # ``/@href``.
            LinkExtractor(restrict_xpaths='//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href'),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Print the URL of a page reached through ``rules``.

        Args:
            response: the response object Scrapy passes to the callback;
                only its ``url`` attribute is read.
        """
        # Single-argument form prints the same text under Python 2 and 3.
        print("found item : %s" % response.url)
这给了我一个错误:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 638, in _tick
taskObj._oneWorkUnit()
File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 484, in _oneWorkUnit
result = next(self._iterator)
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
work = (callable(elem, *args, **named) for elem in iterable)
--- <exception caught here> ---
File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
yield next(it)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 26, in process_spider_output
for x in result:
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
for request_or_item in self._requests_to_follow(response):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 107, in extract_links
links = self._extract_links(doc, response.url, response.encoding, base_url)
File "/usr/lib/pymodules/python2.7/scrapy/linkextractor.py", line 94, in _extract_links
return self.link_extractor._extract_links(*args, **kwargs)
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 50, in _extract_links
for el, attr, attr_val in self._iter_links(selector._root):
File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/lxmlhtml.py", line 38, in _iter_links
for el in document.iter(etree.Element):
exceptions.AttributeError: 'str' object has no attribute 'iter'
但是我不知道我做错了什么,我试过把
restrict_xpaths
改成列表、元组……我是 Scrapy 的新手,实在弄不明白。
在 restrict_xpaths 中配置的 XPath 应该指向元素,而不是属性。
将:
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href
替换为:
天哪,我在 shell 中用了这个属性来确认它能正常工作,但忘了把它删掉。哦,我明白了,谢谢!在发这个问题之前我已经搜索了一个小时左右,非常感谢!
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a/@href
//*[@id="content-inhoud"]/div/div/table/tbody/tr/td/h3/a