Python scrapy请求不产生任何输出
我正在尝试将“以下链接”示例改编为我自己的spider:Python scrapy请求不产生任何输出,python,scrapy,Python,Scrapy,我正在尝试将“以下链接”示例改编为我自己的spider: import scrapy from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from funda.items import FundaItem class PropertyLinksSpider(CrawlSpider): name = "property_links" allowe
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from funda.items import FundaItem
class PropertyLinksSpider(CrawlSpider):
name = "property_links"
allowed_domains = ["funda.nl"]
def __init__(self, place='amsterdam', page='1'):
self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
self.base_url = "http://www.funda.nl/koop/%s/" % place
self.le1 = LinkExtractor(allow=r'%s+huis|appartement-\d{8}' % self.base_url)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
yield scrapy.Request(link.url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
item['title'] = response.xpath('//title').extract()
yield item
但是,如果我尝试使用以下命令运行此命令
scrapy crawl property_links -a place=amsterdam -a page=1 -o property_links_test.json
我得到一个空的.json文件:
[
在这个爬行器的早期版本中,我使用了
parse
方法,只需yield item
爬行器就会生成一个带有预期链接的.json文件。我还使用Scrapy shell检查了页面是否有标题。所以我不明白为什么我没有得到任何输出?您没有将项目解析为第二个函数这段代码对我来说很好
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class FundaItem(scrapy.Item):
url = scrapy.Field()
title = scrapy.Field()
class PropertyLinksSpider(CrawlSpider):
name = "property_links"
allowed_domains = ["funda.nl"]
def __init__(self, place='amsterdam', page='1'):
self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
self.base_url = "http://www.funda.nl/koop/%s/" % place
self.le1 = LinkExtractor(allow=r'%s+huis|appartement-\d{8}' % self.base_url)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
yield scrapy.Request(link.url, callback=self.parse_dir_contents, meta={'item': item})
def parse_dir_contents(self, response):
new_item = response.request.meta['item']
new_item['title'] = response.xpath('//title').extract()
yield new_item
您没有将项解析为第二个函数。这段代码对我来说很好
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class FundaItem(scrapy.Item):
url = scrapy.Field()
title = scrapy.Field()
class PropertyLinksSpider(CrawlSpider):
name = "property_links"
allowed_domains = ["funda.nl"]
def __init__(self, place='amsterdam', page='1'):
self.start_urls = ["http://www.funda.nl/koop/%s/p%s/" % (place, page)]
self.base_url = "http://www.funda.nl/koop/%s/" % place
self.le1 = LinkExtractor(allow=r'%s+huis|appartement-\d{8}' % self.base_url)
def parse(self, response):
links = self.le1.extract_links(response)
for link in links:
if link.url.count('/') == 6 and link.url.endswith('/'):
item = FundaItem()
item['url'] = link.url
yield scrapy.Request(link.url, callback=self.parse_dir_contents, meta={'item': item})
def parse_dir_contents(self, response):
new_item = response.request.meta['item']
new_item['title'] = response.xpath('//title').extract()
yield new_item