Python: scraping starts timing out after a while


I am scraping text from https://www.dailynews.co.th, and this is my problem.

My spider works almost perfectly at first and crawls about 4000 pages:

2018-09-28 20:05:00 [scrapy.extensions.logstats] INFO: Crawled 4161 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
Then it starts raising a flood of TimeoutErrors from almost every URL, like this:

2018-09-28 20:06:06 [scrapy.core.scraper] ERROR: Error downloading <GET https://www.dailynews.co.th/tags/When%20Will%20You%20Marry>
Traceback (most recent call last):
File "/usr/local/app/.local/share/virtualenvs/monolingual-6kEg5ui2/lib/python2.7/site-packages/Twisted-18.7.0-py2.7-linux-x86_64.egg/twisted/internet/defer.py", line 1416, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/usr/local/app/.local/share/virtualenvs/monolingual-6kEg5ui2/lib/python2.7/site-packages/Twisted-18.7.0-py2.7-linux-x86_64.egg/twisted/python/failure.py", line 491, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/usr/local/app/.local/share/virtualenvs/monolingual-6kEg5ui2/lib/python2.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "/usr/local/app/.local/share/virtualenvs/monolingual-6kEg5ui2/lib/python2.7/site-packages/Twisted-18.7.0-py2.7-linux-x86_64.egg/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/app/.local/share/virtualenvs/monolingual-6kEg5ui2/lib/python2.7/site-packages/scrapy/core/downloader/handlers/http11.py", line 351, in _cb_timeout
raise TimeoutError("Getting %s took longer than %s seconds." % (url, timeout))
TimeoutError: User timeout caused connection failure: Getting https://www.dailynews.co.th/tags/When%20Will%20You%20Marry took longer than 5.0 seconds..
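The 5.0 seconds in the traceback corresponds to Scrapy's DOWNLOAD_TIMEOUT setting (the default is 180, so it appears to have been lowered to 5 in this project), and the built-in retry middleware decides how many times such a failure is retried before the error above is logged. A minimal sketch of the settings involved (values here are illustrative, not the project's real configuration):

# settings.py -- sketch only; values are illustrative, not the project's real configuration
DOWNLOAD_TIMEOUT = 15    # per-request timeout in seconds; the traceback suggests it was set to 5
RETRY_ENABLED = True     # the retry middleware is on by default
RETRY_TIMES = 2          # how many times a timed-out request is retried before the ERROR above appears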
Here is my spider code:

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from parse_config import parse_config


class ProtocolSpider(CrawlSpider):
    name = 'protocol'
    start_urls = ['https://www.dailynews.co.th']
    custom_settings = {
        'JOBDIR': 'crawl_job'
    }

    def __init__(self, **kwargs):
        super(ProtocolSpider, self).__init__(**kwargs)
        self.arg_dict = parse_config(kwargs)
        self.start_urls = self.arg_dict['start_urls']
        # print self.start_urls
        self.allowed_domains = self.arg_dict['allowed_domains']
        self.output_file = open(self.arg_dict['output_file'], 'ab')
        self.rules = (
            Rule(LinkExtractor(allow=self.arg_dict['allow_url'], deny=self.arg_dict['deny_url']),
                 callback="parse_all", follow=True),
        )
        self._compile_rules()
        self.use_web_proxy = self.arg_dict['use_web_proxy']

    def parse(self, response):
        self.parse_all(response)
        return super(ProtocolSpider, self).parse(response)

    def parse_all(self, response):
        self._record_url(response)
        self._extract_all_p(response)
        self._extract_all_div(response)

    def _record_url(self, response):
        self.output_file.write('url_marker: %s' % response.url + '\n')

    def _extract_all_p(self, response):
        if self.arg_dict['extract_all_p']:
            p_ls = response.xpath('//p/text()').extract()
            p_string = '\n'.join([p.strip().encode('utf8') for p in p_ls if p.strip()])
            self.output_file.write(p_string + '\n')

    def _extract_all_div(self, response):
        if self.arg_dict['extract_all_div']:
            div_ls = response.xpath('//div/text()').extract()
            div_string = '\n'.join([div.strip().encode('utf8') for div in div_ls if div.strip()])
            self.output_file.write(div_string + '\n')

    def close(self, spider, reason):
        self.output_file.close()
        return super(ProtocolSpider, self).close(spider, reason)

Well, if I were the admin of that site and saw something like "32 to 16 concurrent requests", I would ban you permanently. You can try going through a proxy... but a truly savvy webmaster will detect automated requests within a few seconds and ban them without question or a second chance (I am speaking from experience).

@meissner_uu Thanks for the advice. But as you can see, I have already set the per-server concurrency to 4 and enabled AutoThrottle, so the actual request rate is far below the theoretical value. In fact I only get about 10 pages per minute, which I believe is quite "gentle" for a site crawler. By the way, I don't think the problem is caused by a ban, because I can still reach the timed-out URLs through scrapy shell.

Indeed, 10 per minute does look like a fairly gentle rate, and since it works from the shell a ban seems unlikely... on the other hand: you never know.
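For context, the throttling described in these comments would look roughly like this in settings.py (a sketch of the settings named above; the project's real values may differ):

# settings.py -- sketch of the throttling mentioned in the comments; real values may differ
CONCURRENT_REQUESTS_PER_DOMAIN = 4   # the per-server concurrency the asker mentions
AUTOTHROTTLE_ENABLED = True          # lets Scrapy adapt the delay to the measured latency
AUTOTHROTTLE_START_DELAY = 5         # illustrative
AUTOTHROTTLE_MAX_DELAY = 60          # illustrative

The ban check mentioned above can be reproduced with scrapy shell "https://www.dailynews.co.th/tags/When%20Will%20You%20Marry", i.e. by fetching one of the URLs that timed out during the crawl.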