Python 在scrapy中使用meta传递数据
我想把 parse_sec_frame 方法中得到的 sec url 传递给 parse_frame,并在同一个 item 中一次性生成(yield)全部三个值。我尝试了下面的方法,但没有成功,也想不出其他思路。有什么建议吗?(标签:python、web-scraping、scrapy)代码如下:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class CtSpider(CrawlSpider):
    """CrawlSpider for cartoon3rbi.net that tries to emit Name / Url / sec_link items.

    Request flow as written:

    * ``parse_item`` -> ``parse_frame`` (primary frame URL)
    * ``parse_item`` -> ``parse_sec_frame`` -> ``parse_frame`` (secondary URL)

    NOTE(review): ``parse_frame`` reads both ``response.meta['NAME']`` and
    ``response.meta['url']``, but neither path supplies both keys:

    * on the primary path nothing in this class stores ``'url'`` in the meta
      that is passed along, so the ``'url'`` lookup raises ``KeyError``;
    * on the secondary path the request to ``parse_sec_frame`` is created
      without ``meta``, so ``'NAME'`` is never present and that lookup raises
      ``KeyError``.

    Values set on ``response.meta`` only reach a later callback if that exact
    dict (or a copy of it) is handed to the next ``scrapy.Request`` via its
    ``meta`` argument.
    """

    # Per-spider override of Scrapy's CONCURRENT_REQUESTS setting.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1
    }
    name = 'ct'
    allowed_domains = ['cartoon3rbi.net']
    start_urls = ['https://www.cartoon3rbi.net/cartooncat-920.html']
    # Rule 1: follow links taken from the fourth anchor of div.pagination
    #         (no callback, only followed).
    # Rule 2: follow links found inside div.cartoon_eps_name and pass each
    #         response to parse_item.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pagination"]/a[4]'), follow=True),
        Rule(LinkExtractor(restrict_xpaths='//div[@class="cartoon_eps_name"]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one request per embedded frame plus one for its derived secondary URL."""
        for el in response.xpath('//div[@id="topme"]'):
            # NOTE(review): both XPath expressions below start with ``//``, so
            # they are evaluated against the whole document, not relative to
            # ``el``; ``.//`` would be the relative form.
            # The title list is written into this response's own meta dict.
            response.meta['NAME'] = el.xpath('//div[@class="block_title"]/text()').extract()
            frames = el.xpath('//div[@class="embedcode"]/iframe/@src').extract()
            for frame in frames:
                # Carries 'NAME' (set above) but no 'url' key is added here.
                yield scrapy.Request(url=frame, callback=self.parse_frame, meta=response.meta)
                # Replaces every '1' character in the frame URL with '4'.
                sec_link = frame.replace('1', '4')
                # No meta is passed, so 'NAME' does not travel with this request.
                yield scrapy.Request(url=sec_link, callback=self.parse_sec_frame)

    def parse_sec_frame(self, response):
        """Extract the iframe src under div#picasa and request it with parse_frame as callback."""
        # extract_first() returns None when nothing matches; the Request below
        # is then built with url=None.
        response.meta['url'] = response.xpath('//div[@id="picasa"]/iframe/@src').extract_first()
        # The meta forwarded here contains 'url' but, on this path, no 'NAME'.
        yield scrapy.Request(url=response.meta['url'], callback=self.parse_frame, meta=response.meta)

    def parse_frame(self, response):
        """Yield a dict with 'Name', 'Url' and 'sec_link' built from meta and this page."""
        # Both meta lookups are hard subscripts: a missing key raises KeyError
        # before anything is yielded.
        name = response.meta['NAME']
        url = response.xpath('//div[@id="picasa"]/iframe/@src').extract_first()
        sec_url = response.meta['url']
        yield {
            'Name': name,
            'Url': url,
            'sec_link': sec_url,
        }
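为什么要修改 response.meta?数据是随请求(Request)向后传递的:如果希望在后续的回调中使用某些值,需要创建新的请求,并通过该请求的 meta 参数把这些值传进去,回调中再从 response.meta 读取。在您的代码中,第一个请求的 meta 里没有 'url',第二个请求根本没有传 meta(因此没有 'NAME'),所以 parse_frame 在读取 response.meta 时会抛出 KeyError。或者,您可以稍微修改代码: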
def parse_item(self, response):
    """Schedule the primary and secondary frame requests for a page.

    For every iframe ``src`` found under ``div.embedcode`` two requests are
    yielded:

    * the frame URL itself, handled by ``parse_frame``;
    * a secondary URL derived from it, handled by ``parse_sec_frame``.

    Both requests carry the title list under the ``'NAME'`` meta key. The
    primary request also carries ``'url': None`` because ``parse_frame`` reads
    that key and no secondary URL exists on this path.
    """
    for el in response.xpath('//div[@id="topme"]'):
        # NOTE(review): both expressions start with ``//`` and therefore search
        # the whole document rather than only ``el``; switch to ``.//`` if the
        # lookup is meant to be scoped to this element.
        name = el.xpath('//div[@class="block_title"]/text()').extract()
        frames = el.xpath('//div[@class="embedcode"]/iframe/@src').extract()
        for frame in frames:
            # Pass an explicit meta dict: the incoming response.meta does not
            # contain 'NAME' or 'url', so forwarding it made parse_frame fail
            # with KeyError.
            yield scrapy.Request(
                url=frame,
                callback=self.parse_frame,
                meta={'NAME': name, 'url': None},
            )
            # NOTE(review): this replaces every '1' in the URL, not just one
            # position. Confirm that is the intended way to derive the link.
            sec_link = frame.replace('1', '4')
            yield scrapy.Request(
                url=sec_link,
                callback=self.parse_sec_frame,
                meta={'NAME': name},
            )
def parse_sec_frame(self, response):
    """Resolve the secondary frame URL and hand it on to ``parse_frame``.

    Reads the first iframe ``src`` under ``div#picasa``. If one is found, a
    request for that URL is yielded with ``parse_frame`` as callback; its meta
    carries the incoming ``'NAME'`` value (``None`` if absent) and the
    resolved URL under ``'url'``. If no iframe is found, nothing is yielded.
    """
    src = response.xpath('//div[@id="picasa"]/iframe/@src').extract_first()
    if not src:
        # extract_first() returns None when nothing matches; building a
        # Request from that would raise instead of skipping the page.
        self.logger.warning('No picasa iframe found on %s', response.url)
        return
    # Resolve relative or protocol-relative src values against the page URL;
    # absolute URLs pass through unchanged.
    url = response.urljoin(src)
    yield scrapy.Request(
        url=url,
        callback=self.parse_frame,
        meta={'NAME': response.meta.get('NAME'), 'url': url},
    )
def parse_frame(self, response):
    """Build the scraped item for a frame page.

    Yields a single dict with:

    * ``'Name'``: the ``'NAME'`` value from ``response.meta``;
    * ``'Url'``: the first iframe ``src`` under ``div#picasa`` on this page;
    * ``'sec_link'``: the ``'url'`` value from ``response.meta``.

    Any of the three is ``None`` when the corresponding source is missing.
    """
    meta = response.meta
    url = response.xpath('//div[@id="picasa"]/iframe/@src').extract_first()
    # Use .get() so a request that arrives without one of the keys still
    # produces an item instead of failing with KeyError.
    yield {
        'Name': meta.get('NAME'),
        'Url': url,
        'sec_link': meta.get('url'),
    }