Python 碎片回溯302,索引错误列表
我试图刮取特定标签的文章,比如 Python 2.7 中的“机器学习”(machine-learning)标签。我有以下代码:
import scrapy
import codecs
import json
from datetime import datetime
from datetime import timedelta
import os
def writeTofile(fileName, text):
    """Persist *text* (unicode) to *fileName*, encoded as UTF-8."""
    with codecs.open(fileName, 'w', 'utf-8') as fh:
        fh.write(text)
class MediumPost(scrapy.Spider):
    """Download Medium archive pages for a tag, one request per day
    between ``start_date`` and ``end_date`` (inclusive), and store each
    response body as a JSON file inside a directory named after today's
    date.

    ``tagSlug``, ``start_date`` and ``end_date`` can be overridden from
    the command line::

        scrapy crawl medium_scraper -a tagSlug=machine-learning \
            -a start_date=20170110 -a end_date=20181130
    """

    name = 'medium_scraper'
    handle_httpstatus_list = [401, 400]
    autothrottle_enabled = True
    # Defaults so the spider also runs without any -a arguments;
    # previously these attributes were undefined and the spider crashed
    # with AttributeError unless all three were supplied.
    tagSlug = 'machine-learning'
    start_date = '20170110'   # YYYYMMDD, inclusive
    end_date = '20181130'     # YYYYMMDD, inclusive

    def start_requests(self):
        """Yield one archive-page request per day in the date range."""
        start_urls = ['https://medium.com/tag/' + self.tagSlug.strip("'") + '/archive/']
        # Header and cookie information can be got from the Network Tab
        # in Developer Tools.
        cookie = {'mhj': 'd4c630604c57a104af8bc98218fb3430145',
                  'nj': '1',
                  'ko': '1:J0mnan1t5jlHypyliL8GAY1WNfDvtqZBgmBDr+7STp2QSwyWUz6',
                  'pi': '233',
                  'mt': '-874'}
        header = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        startDate = datetime.strptime(self.start_date, "%Y%m%d")
        endDate = datetime.strptime(self.end_date, "%Y%m%d")
        delta = endDate - startDate
        for i in range(delta.days + 1):
            d = datetime.strftime(startDate + timedelta(days=i), '%Y/%m/%d')
            for url in start_urls:
                yield scrapy.Request(url + d, method="GET", headers=header,
                                     cookies=cookie, callback=self.parse,
                                     meta={'reqDate': d})

    def parse(self, response):
        """Strip Medium's anti-hijacking prefix and save the payload."""
        # Medium prefixes JSON payloads with "])}while(1);</x>".
        # split(...)[1] raised IndexError whenever the marker was absent
        # (e.g. plain-HTML archive pages); [-1] yields the payload after
        # the marker when present and the whole body otherwise.
        response_data = response.text.split("while(1);</x>")[-1]
        date_post = response.meta['reqDate'].replace("/", "")
        directory = datetime.now().strftime("%Y%m%d")
        if not os.path.exists(directory):
            os.makedirs(directory)
        # os.path.join instead of manual "//" concatenation.
        file_name = os.path.join(
            directory,
            self.tagSlug.replace("-", "").strip("'") + "Tag" + date_post + ".json")
        writeTofile(file_name, response_data)
当我尝试将 def parse 放在 def start_requests 之上时,我会得到缩进错误(IndentationError)。
由于我是一个初学者,我不知道错误在哪里?回答:你遇到的是“MediumPost.parse callback 未定义”的问题,看起来 Python 解释器没有识别到 parse 函数。我想你混用了四个空格和制表符(Tab)来缩进。
我使用PyCharm。当然,我没有同样的问题。
经过一些改动后,它对我有用。我添加了 self.tagSlug、self.start_date、self.end_date。
我按照 PEP-8 的建议整理了代码,现在看起来好多了。我去掉了 print 语句——调试时最好使用断点。
我把变量名改成了 Python 风格(snake_case)。记住,PEP-8 建议只使用一种命名风格(Python 风格或 Java 风格),不要混用。
import scrapy
import codecs
from datetime import datetime
from datetime import timedelta
import os


def writeTofile(file_name, text):
    with codecs.open(file_name, 'w', 'utf-8') as outfile:
        outfile.write(text)


class MediumPost(scrapy.Spider):
    name = 'medium_scraper'
    handle_httpstatus_list = [401, 400]
    autothrottle_enabled = True
    tag_slug = 'machine-learning'
    start_date = '20170110'
    end_date = '20181130'

    def start_requests(self):
        start_urls = ['https://medium.com/tag/' + self.tag_slug.strip("'") + '/archive/']
        # Header and cookie information can be got from the Network Tab in Developer Tools
        cookie = {'mhj': 'd4c630604c57a104af8bc98218fb3430145',
                  'nj': '1',
                  'ko': '1:J0mnan1t5jlHypyliL8GAY1WNfDvtqZBgmBDr+7STp2QSwyWUz6',
                  'pi': '233',
                  'mt': '-874'}
        header = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        startDate = datetime.strptime(self.start_date, "%Y%m%d")
        endDate = datetime.strptime(self.end_date, "%Y%m%d")
        delta = endDate - startDate
        for i in range(delta.days + 1):
            d = datetime.strftime(startDate + timedelta(days=i), '%Y/%m/%d')
            for url in start_urls:
                yield scrapy.Request(url + d, headers=header, cookies=cookie, meta={'req_date': d})

    def parse(self, response):
        response_data = response.text
        response_split = response_data.split("while(1);</x>")
        response_data = response_split[0]
        date_post = response.meta['req_date']
        date_post = date_post.replace("/", "")
        directory = datetime.now().strftime("%Y%m%d")
        if not os.path.exists(directory):
            os.makedirs(directory)
        writeTofile(directory + "//" + self.tag_slug.replace("-", "").strip("'") + "Tag" + date_post + ".json", response_data)
scrapy.core.engine] DEBUG: Crawled (200) <GET https://medium.com/tag/machine-learning/archive/2015/07/13> (referer: None)
current.result = callback(current.result, *args, **kw)
File "/home/mkol/anaconda2/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 90, in parse
raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))
NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))
import scrapy
import codecs
from datetime import datetime
from datetime import timedelta
import os
def writeTofile(file_name, text):
    """Write *text* to *file_name* as UTF-8."""
    out = codecs.open(file_name, 'w', 'utf-8')
    try:
        out.write(text)
    finally:
        out.close()
class MediumPost(scrapy.Spider):
    """Download Medium archive pages for ``tag_slug``, one request per
    day between ``start_date`` and ``end_date`` (inclusive), saving each
    response body as a JSON file in a directory named after today's date.

    All three class attributes can be overridden from the command line
    with ``-a tag_slug=... -a start_date=YYYYMMDD -a end_date=YYYYMMDD``.
    """

    name = 'medium_scrapper'
    handle_httpstatus_list = [401, 400]
    autothrottle_enabled = True
    tag_slug = 'machine-learning'
    start_date = '20170110'   # YYYYMMDD, inclusive
    end_date = '20181130'     # YYYYMMDD, inclusive

    def start_requests(self):
        """Yield one archive-page request per day in the date range."""
        start_urls = ['https://medium.com/tag/' + self.tag_slug.strip("'") + '/archive/']
        # Header and cookie information can be got from the Network Tab
        # in Developer Tools.
        cookie = {'mhj': 'd4c630604c57a104af8bc98218fb3430145',
                  'nj': '1',
                  'ko': '1:J0mnan1t5jlHypyliL8GAY1WNfDvtqZBgmBDr+7STp2QSwyWUz6',
                  'pi': '233',
                  'mt': '-874'}
        header = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
        startDate = datetime.strptime(self.start_date, "%Y%m%d")
        endDate = datetime.strptime(self.end_date, "%Y%m%d")
        delta = endDate - startDate
        for i in range(delta.days + 1):
            d = datetime.strftime(startDate + timedelta(days=i), '%Y/%m/%d')
            for url in start_urls:
                print(url + d)
                # callback defaults to self.parse
                yield scrapy.Request(url + d, headers=header,
                                     cookies=cookie, meta={'req_date': d})

    def parse(self, response):
        """Strip Medium's anti-hijacking prefix and save the payload."""
        # Medium prefixes JSON payloads with "])}while(1);</x>".
        # Index [0] kept the text BEFORE the marker, so whenever the
        # marker was present only the junk prefix "])}"  was saved.
        # [-1] keeps the payload after the marker when present, and the
        # whole body when absent -- never raises IndexError.
        response_data = response.text.split("while(1);</x>")[-1]
        date_post = response.meta['req_date'].replace("/", "")
        directory = datetime.now().strftime("%Y%m%d")
        if not os.path.exists(directory):
            os.makedirs(directory)
        # os.path.join instead of manual "//" concatenation.
        file_name = os.path.join(
            directory,
            self.tag_slug.replace("-", "").strip("'") + "Tag" + date_post + ".json")
        writeTofile(file_name, response_data)