Exception 残缺的未处理异常
我在linux上使用的是scrapy 0.16.2版本。我在跑步:Exception 残缺的未处理异常,exception,scrapy,unhandled,Exception,Scrapy,Unhandled,我在linux上使用的是scrapy 0.16.2版本。我在跑步: scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider 我遇到了这个错误,它会阻止scrapy(挂起并且不会自动完成,只有^C会停止) 2012-11-20 15:04:51+0000[-]未处理的错误回溯(最近一次调用):文件“/usr/lib/python2.7/site packages/scrapy/commands/crawl.py”,第45行,正在运行 sel
scrapy crawl mycrawlspider -s JOBDIR=/mnt/mycrawlspider
我遇到了这个错误,它会阻止scrapy(挂起并且不会自动完成,只有^C会停止)
2012-11-20 15:04:51+0000[-]未处理的错误回溯(最近一次调用):文件“/usr/lib/python2.7/site packages/scrapy/commands/crawl.py”,第45行,正在运行
self.crawler.start()文件“/usr/lib/python2.7/site packages/scrapy/crawler.py”,第80行,开始
reactor.run(installSignalHandlers=False)#阻塞调用文件“/usr/lib/python2.7/site packages/twisted/internet/base.py”,第1169行,运行中
mainLoop()文件“/usr/lib/python2.7/site packages/twisted/internet/base.py”,第1178行,在mainLoop中
self.rununtlcurrent()----文件“/usr/lib/python2.7/site packages/twisted/internet/base.py”,第800行,在rununtlcurrent中
call.func(*call.args,**call.kw)文件“/usr/lib/python2.7/site packages/scrapy/utils/reactor.py”,第41行,在调用中__
返回self.\u func(*self.\u a,**self.\u kw)文件“/usr/lib/python2.7/site packages/scrapy/core/engine.py”,第116行,in
_下一个请求
爬网(request,spider)文件“/usr/lib/python2.7/site packages/scrapy/core/engine.py”,爬网中的第172行
self.schedule(request,spider)文件“/usr/lib/python2.7/site packages/scrapy/core/engine.py”,计划中第176行
返回self.slots[spider].scheduler.enqueue_请求(请求)文件“/usr/lib/python2.7/site packages/scrapy/core/scheduler.py”,第48行,在enqueue_请求中
if not request.dont_filter和self.df.request_seed(请求):exceptions.AttributeError:'NoneType'对象没有属性'dont_filter'
顺便说一句,这在0.14版中有效
代码如下:
class MySpider(CrawlSpider):
name = 'alrroya'
NEW_IGNORED_EXTENSIONS = list(IGNORED_EXTENSIONS)
NEW_IGNORED_EXTENSIONS.remove('pdf')
download_delay = 0.05
# Stay within these domains when crawling
allowed_domains = []
all_domains = {}
start_urls = []
# Add our callback which will be called for every found link
rules = [
Rule(SgmlLinkExtractor(deny_extensions=NEW_IGNORED_EXTENSIONS, tags=('a', 'area', 'frame', 'iframe'), attrs=('href', 'src')), follow=True, callback='parse_crawled_page')
]
# How many pages crawled
crawl_count = 0
# How many PDFs we have found
pdf_count = 0
def __init__(self, *args, **kwargs):
CrawlSpider.__init__(self, *args, **kwargs)
dispatcher.connect(self._spider_closed, signals.spider_closed)
dispatcher.connect(self._spider_opened, signals.spider_opened)
self.load_allowed_domains_and_start_urls()
def allowed_to_start(self):
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = self.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
reason = True
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
reason = True
else:
reason = False
else:
reason = True
return reason
def _spider_opened(self, spider):
if spider is not self:
return
curr_date = datetime.today()
curr_date = datetime(curr_date.year, curr_date.month, curr_date.day)
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
day = timedelta(days=1)
if os.path.exists(checkfile):
f = open(checkfile, 'r')
data = f.read()
f.close()
data = data.split('\n')
reason = data[0]
try:
reason_date = datetime.strptime(data[1], '%Y-%m-%d')
except Exception as ex:
reason_date = None
if reason_date and 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
if reason_date and reason_date + day <= curr_date and 'finished' in reason:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
else:
crawler.engine.close_spider(self, 'finished')
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
os._exit(1)
else:
f = open(checkfile, 'w')
f.write('started\n')
f.write(str(date.today()))
f.close()
def _spider_closed(self, spider, reason):
if spider is not self:
return
jobdir = spider.settings['JOBDIR']
if jobdir:
mnt = os.path.dirname(os.path.normpath(jobdir))
else:
mnt = ''
checkfile = os.path.join(mnt, '%s.crawlercheck' % self.__class__.name)
if 'shutdown' in reason:
f = open(checkfile, 'w')
f.write('shutdown\n')
f.write(str(date.today()))
f.close()
else:
if jobdir and os.path.exists(jobdir):
shutil.rmtree(jobdir)
f = open(checkfile, 'w')
f.write('finished\n')
f.write(str(date.today()))
f.close()
def _requests_to_follow(self, response):
if getattr(response, 'encoding', None) != None:
return CrawlSpider._requests_to_follow(self, response)
else:
return []
def make_requests_from_url(self, url):
http_client = httplib2.Http()
try:
headers = {
'content-type': 'text/html',
'user-agent': random.choice(USER_AGENT_LIST)
}
response, content = http_client.request(url, method='HEAD', headers=headers)
#~ if 'pdf' in response['content-type'].lower() or (url.endswith('.pdf') and 'octet-stream' in response['content-type'].lower()):
if 'pdf' in response['content-type'].lower() or 'octet-stream' in response['content-type'].lower():
if self.allowed_to_start():
self.get_pdf_link(url)
else:
return CrawlSpider.make_requests_from_url(self, url)
except Exception as ex:
return CrawlSpider.make_requests_from_url(self, url)
def get_pdf_link(self, url):
source = self.__class__.name
parsed_url = urlparse(url)
url_domain = parsed_url.netloc
url_path = parsed_url.path
if url_domain:
for domain, paths in self.__class__.all_domains[source]['allow_domains'].iteritems():
if url_domain.endswith(domain):
pre_and = False
pre_or = False
and_cond = True
or_cond = False
for path in paths:
if path[0:1] == '!':
pre_and = True
if path[1:] not in url_path:
and_cond = and_cond and True
else:
and_cond = and_cond and False
else:
pre_or = True
if path in url_path:
or_cond = or_cond or True
else:
or_cond = or_cond or False
if pre_and and pre_or:
if and_cond and or_cond:
self.pdf_process(source, url)
return
elif pre_and:
if and_cond:
self.pdf_process(source, url)
return
elif pre_or:
if or_cond:
self.pdf_process(source, url)
return
else:
self.pdf_process(source, url)
return
def parse_crawled_page(self, response):
self.__class__.crawl_count += 1
crawl_count = self.__class__.crawl_count
if crawl_count % 100 == 0:
print 'Crawled %d pages' % crawl_count
if 'pdf' in response.headers.get('content-type', '').lower():
self.get_pdf_link(response.url)
return Item()
def load_allowed_domains_and_start_urls(self):
day = timedelta(days=1)
currdate = date.today()
alrroya = ('http://epaper.alrroya.com/currentissues.php?editiondt=' + currdate.strftime('%Y/%m/%d'),)
self.__class__.all_domains = {
'alrroya': {
'start_urls': alrroya,
'allow_domains': {
'epaper.alrroya.com': frozenset(()),
}
}
}
for domain in self.__class__.all_domains[self.__class__.name]['allow_domains']:
self.__class__.allowed_domains.append(domain)
self.__class__.start_urls.extend(self.__class__.all_domains[self.__class__.name]['start_urls'])
def pdf_process(self, source, url):
print '!!! ' + source + ' ' + url
class MySpider(爬行蜘蛛):
名称='alrroya'
新的\u忽略的\u扩展=列表(忽略的\u扩展)
新的\u已忽略\u扩展。删除('pdf')
下载延迟=0.05
#爬网时请保持在这些域内
允许的_域=[]
所有_域={}
起始URL=[]
#添加我们的回调,它将为每个找到的链接调用
规则=[
规则(SGMLLinkedExtractor(拒绝扩展=新建扩展,忽略扩展,标记=('a'、'area'、'frame'、'iframe')、属性=('href'、'src')),follow=True,回调='parse\u crawled\u page')
]
#爬网了多少页
爬网计数=0
#我们找到了多少PDF
pdf_计数=0
定义初始化(self,*args,**kwargs):
爬行爬行器.uuu初始化(self,*args,**kwargs)
dispatcher.connect(self.\u spider\u关闭,signals.spider\u关闭)
dispatcher.connect(self.\u spider\u打开,signals.spider\u打开)
self.load\u允许的\u域\u和\u开始\u URL()
允许def启动(自):
curr_date=datetime.today()
当前日期=日期时间(当前日期.年,当前日期.月,当前日期.日)
jobdir=self.settings['jobdir']
如果是jobdir:
mnt=os.path.dirname(os.path.normpath(jobdir))
其他:
mnt=''
checkfile=os.path.join(mnt,“%s.crawlercheck”%self.\uuuuu class\uuuuuu.name)
天=时间增量(天=1)
如果os.path.exists(检查文件):
f=打开(检查文件“r”)
data=f.read()
f、 关闭()
data=data.split('\n')
原因=数据[0]
尝试:
原因\u date=datetime.strtime(数据[1],“%Y-%m-%d”)
例外情况除外,例如:
原因\日期=无
如果原因是日期和原因中的“关机”:
原因=正确
其他:
如果reason_date和reason_date+day这似乎是Scrapy中的一个bug。当前版本似乎不接受从\u url()发出的请求返回的列表。为了解决这个问题,我可以用下面的方法修改Scrapy代码
在文件Scrapy-0.16.5-py2.7.egg/Scrapy/spider.py中
更改:
def start_requests(self):
for url in self.start_urls:
yield self.make_requests_from_url(url)
致:
我希望官方的Scrapy人员最终会解决这个问题。从mycrawlspider
def start_requests(self):
for url in self.start_urls:
yield self.make_requests_from_url(url)
def start_requests(self):
for url in self.start_urls:
requests = self.make_requests_from_url(url)
if type(requests) is list:
for request in requests:
yield request
else:
yield requests