Python Scrapy没有翻过这一页
嗨!为什么蜘蛛不会翻页?我使用了规则(Rule)……我做错了什么?它只在一页上起作用。代码如下:Python Scrapy没有翻过这一页,python,scrapy,Python,Scrapy,嗨!为什么蜘蛛不会翻页?我使用了规则(Rule)……我做错了什么?它只在一页上起作用。代码如下: # -*- encoding: -*- class JobSpider(CrawlSpider): name = 'superjob' allowed_domains = ['superjob.ru'] start_urls = [ 'http://www.superjob.ru/vacancy/search/?t%5B0%5D=4&sbmit=1&
# -*- encoding: -*-
class JobSpider(CrawlSpider):
    """Spider for the vacancy search results on superjob.ru.

    This is the spider as posted in the question. The code is left as it
    was; the NOTE(review) comments mark the places that look responsible
    for the crawl staying on the first page and returning nothing.
    """

    name = 'superjob'
    allowed_domains = ['superjob.ru']
    start_urls = [
        'http://www.superjob.ru/vacancy/search/?t%5B0%5D=4&sbmit=1&period=7'
    ]
    rules = [
        # NOTE(review): `allow` is presumably treated as a regular expression,
        # in which case the trailing "?" is a quantifier rather than a
        # literal question mark -- confirm against the link extractor docs.
        Rule(SgmlLinkExtractor(allow='/vacancy/search/?',
                               restrict_xpaths=(
                                   # NOTE(review): the expression ends in literal
                                   # "<span>...</span>" markup, which is not XPath
                                   # syntax, so it cannot select the "next" link.
                                   u'//a[@class="h_border_none"]/<span>следующая</span>')),
             # NOTE(review): the callback is named "parse", which is also the
             # method CrawlSpider uses internally to apply `rules` --
             # overriding it likely disables link following; confirm.
             callback='parse',
             follow=True),
    ]

    def parse(self, response):
        """Build a JobItem for each title link matched on the response page.

        As written, nothing is returned: the items are collected in a local
        list and the final ``return`` is commented out.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select(
            '//*[@id="ng-app"]/div[2]/div/div[2]/div/div[1]/div[2]/div/div/h2/a')
        items = []
        for title in titles:
            item = JobItem()
            # NOTE(review): the leading "//" makes this query absolute, so it
            # searches the whole document instead of only the current <a> node.
            item['title'] = title.select('//h2/a/text()').extract()
            items.append(item)
        # NOTE(review): the return below is disabled, so the collected items
        # never leave this method.
        # return items
#-*-编码:-*-
class JobSpider(CrawlSpider):
    """Spider for the vacancy search results on superjob.ru.

    Restored Python form of the question's snippet: keywords, identifiers,
    quotes and punctuation had been machine-translated, which made the text
    invalid as Python. The logic is intentionally the question's original
    logic, including its problems, which are marked with NOTE(review).
    """

    name = 'superjob'
    allowed_domains = ['superjob.ru']
    start_urls = [
        'http://www.superjob.ru/vacancy/search/?t%5B0%5D=4&sbmit=1&period=7'
    ]
    rules = [
        Rule(SgmlLinkExtractor(allow='/vacancy/search/?',
                               restrict_xpaths=(
                                   # NOTE(review): trailing "<span>...</span>" is
                                   # HTML markup, not XPath syntax.
                                   u'//a[@class="h_border_none"]/<span>следующая</span>')),
             # NOTE(review): "parse" is also the method CrawlSpider uses
             # internally to apply `rules`; using it as the callback likely
             # disables link following -- confirm against the Scrapy docs.
             callback='parse',
             follow=True),
    ]

    def parse(self, response):
        """Build a JobItem for each title link matched on the response page.

        As in the question, nothing is returned: the final ``return`` is
        commented out.
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select(
            '//*[@id="ng-app"]/div[2]/div/div[2]/div/div[1]/div[2]/div/div/h2/a')
        items = []
        for title in titles:
            item = JobItem()
            # NOTE(review): the leading "//" makes this query absolute, so it
            # is evaluated against the whole document, not this <a> node.
            item['title'] = title.select('//h2/a/text()').extract()
            items.append(item)
        # NOTE(review): disabled return -- the collected items are dropped.
        # return items
5件需要解决的事情:
- restrict_xpaths 应指向分页块
- 回调函数不应该命名为 parse()(CrawlSpider 自身使用 parse() 来实现规则逻辑,覆盖它会导致规则失效)
- 使用 LinkExtractor,SgmlLinkExtractor 已不推荐使用
- 使用 response.xpath() 快捷方式,而不是 hxs.select()
- 修复内部 XPath 表达式——只需获取 text()
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class JobSpider(CrawlSpider):
    """Crawl superjob.ru vacancy search pages and yield one item per title link.

    Links are extracted only from the ``Paginator_navnums`` block of each
    results page, followed, and every fetched page is passed to
    ``parse_item``.
    """

    name = 'superjob'
    allowed_domains = ['superjob.ru']
    start_urls = [
        'http://www.superjob.ru/vacancy/search/?t%5B0%5D=4&sbmit=1&period=7'
    ]
    rules = [
        Rule(
            LinkExtractor(
                # Raw string: "\?" is not a valid Python string escape, so a
                # plain literal triggers an invalid-escape warning. The regex
                # still receives the same backslash-escaped literal "?".
                allow=r'/vacancy/search/\?',
                restrict_xpaths=u'//div[@class="Paginator_navnums"]',
            ),
            # Not named "parse": CrawlSpider uses its own parse() to apply
            # the rules, so the callback must not override it.
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        """Yield a ``JobItem`` for every vacancy title link on the page.

        Args:
            response: The downloaded search-results page.

        Yields:
            A ``JobItem`` whose ``title`` field holds the list of text nodes
            extracted from one title link.
        """
        titles = response.xpath('//*[@id="ng-app"]/div[2]/div/div[2]/div/div[1]/div[2]/div/div/h2/a')
        for title in titles:
            item = JobItem()
            # Relative query: only the text directly inside this link.
            item['title'] = title.xpath('text()').extract()
            yield item
非常感谢,现在明白了!