Python Scrapy — unexpected IndentationError after "yield"

I have a silly problem that won't let me run my spider. Every time I run the crawler I get an IndentationError pointing at the last "}" after the final "yield" at the end of the spider code, and I cannot find the reason. Can anyone help me solve this? Thanks a lot.

Here is my spider:
# -*- coding: utf-8 -*-
import scrapy
import json
import logging
import urlparse

class ArtsPodcastsSpider(scrapy.Spider):
    name = 'arts_podcasts'
    allowed_domains = ['www.castbox.fm']

    def start_requests(self):
        try:
            if response.request.meta['skip']:
                skip = response.request.meta['skip']
            else:
                skip = 0
        while skip < 201:
            url = 'https://everest.castbox.fm/data/top_channels/v2?category_id=10021&country=us&skip=0&limit=60&web=1&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1'
            split_url = urlparse.urlsplit(url)
            path = split_url.path
            path.split('&')
            path.split('&')[:-5]
            '&'.join(path.split('&')[:-5])
            parsed_query = urlparse.parse_qs(split_url.query)
            query = urlparse.parse_qs(split_url.query, keep_blank_values=True)
            query['skip'] = skip
            updated = split_url._replace(path='&'.join(base_path.split('&')[:-5]+['limit=60&web=1&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1', '']),
                query=urllib.urlencode(query, doseq=True))
            updated_url = urlparse.urlunsplit(updated)
            yield scrapy.Request(url=updated_url, callback=self.parse_id, meta={'skip': skip})

    def parse_id(self, response):
        skip = response.request.meta['skip']
        data = json.loads(response.body)
        category = data.get('data').get('category').get('name')
        arts_podcasts = data.get('data').get('list')
        for arts_podcast in arts_podcasts:
            yield scrapy.Request(url='https://everest.castbox.fm/data/top_channels/v2?category_id=10021&country=us&skip={0}&limit=60&web=1&m=20201111&n=609ba0097bb48d4b0778a927bdcf69f4&r=1'.format(arts_podcast.get('list')[2].get('cid')), meta={'category': category, 'skip': skip}, callback=self.parse)

    def parse(self, response):
        skip = response.request.meta['skip']
        category = response.request.meta['category']
        arts_podcast = json.loads(response.body).get('data')
        yield scrapy.Request(callback=self.start_requests, meta={'skip': skip + 1})
        yield {
            'title': arts_podcast.get('title'),
            'category': arts_podcast.get('category'),
            'sub_category': arts_podcast.get('categories')
        }
Thanks, everyone!

The error is a `try` without a matching `except` or `finally`. I would have expected this to cause a SyntaxError, but I guess Python only notices once you return to the original indentation of the `try` statement and it then discovers there is no matching `except`/`finally`. There are other errors as well, such as accessing a nonexistent `response` inside `start_requests`, and wrong indentation in the `parse` method. Here is the code for reference:
import scrapy
import json
import logging
import urlparse
class ArtsPodcastsSpider(scrapy.Spider):
    """Scrape top arts podcasts from castbox.fm.

    Flow: ``start_requests`` pages through the top-channels listing
    endpoint, ``parse_id`` extracts the channel id of every podcast on a
    listing page and requests its detail payload, and ``parse`` emits one
    item per podcast.

    Fixes over the original:
      * removed the dangling ``try:`` with no ``except``/``finally``
        (the cause of the reported IndentationError);
      * ``start_requests`` no longer references ``response`` — Scrapy
        never passes a response to it;
      * the ``while skip < 201`` loop never incremented ``skip``
        (infinite loop); pagination is now done with ``range``;
      * dropped the broken ``urlsplit``/``_replace`` URL surgery (it
        split the *path* on ``'&'`` and used the undefined names
        ``base_path`` and ``urllib``) in favour of a format string;
      * ``parse`` no longer yields a URL-less Request whose callback was
        ``start_requests`` (not a valid callback target).
    """

    name = 'arts_podcasts'
    allowed_domains = ['www.castbox.fm']

    # Listing endpoint; {skip} is the pagination offset, 60 items per page.
    LIST_URL = (
        'https://everest.castbox.fm/data/top_channels/v2'
        '?category_id=10021&country=us&skip={skip}&limit=60'
        '&web=1&m=20201112&n=609584ea96edb64605bca96212128aa5&r=1'
    )
    # Per-channel endpoint; {0} is the channel id ("cid").
    # NOTE(review): this is the same top_channels endpoint with the cid
    # substituted into ``skip`` — taken verbatim from the original code;
    # confirm it is really the intended detail URL.
    DETAIL_URL = (
        'https://everest.castbox.fm/data/top_channels/v2'
        '?category_id=10021&country=us&skip={0}&limit=60'
        '&web=1&m=20201111&n=609ba0097bb48d4b0778a927bdcf69f4&r=1'
    )

    def start_requests(self):
        """Yield one listing request per page of the top-channels feed."""
        # NOTE(review): stepping by the page size (limit=60) up to the
        # original bound of 201; confirm the API expects an item offset
        # rather than a page number.
        for skip in range(0, 201, 60):
            yield scrapy.Request(
                url=self.LIST_URL.format(skip=skip),
                callback=self.parse_id,
                meta={'skip': skip},
            )

    def parse_id(self, response):
        """Extract channel ids from a listing page and request each one."""
        skip = response.request.meta['skip']
        data = json.loads(response.body)
        payload = data.get('data') or {}
        category = (payload.get('category') or {}).get('name')
        for arts_podcast in payload.get('list') or []:
            # NOTE(review): assuming each list entry carries its own
            # 'cid'; the original indexed ``.get('list')[2]`` on the
            # entry, which cannot be right for every item.
            yield scrapy.Request(
                url=self.DETAIL_URL.format(arts_podcast.get('cid')),
                meta={'category': category, 'skip': skip},
                callback=self.parse,
            )

    def parse(self, response):
        """Emit one item for a podcast detail payload."""
        arts_podcast = json.loads(response.body).get('data') or {}
        # Pagination is handled entirely in start_requests; no
        # re-request with skip+1 is needed (or possible) here.
        yield {
            'title': arts_podcast.get('title'),
            'category': arts_podcast.get('category'),
            'sub_category': arts_podcast.get('categories'),
        }