Python Scrapy: limiting the number of requests per domain

Currently my LIMIT_OF_DOMAIN does not work the way I want: somehow the spider keeps crawling past my limit of 100. I have searched the Scrapy documentation but could not find a per-domain limit. How can I solve this?

Ideally, I would like each link yielded in start_requests to produce at most 100 requests.

import scrapy
from urllib.parse import urljoin


class MyCrawlSpider(scrapy.Spider):
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt', 'r') as f:
            urls = f.read().split('\n')
        #urls=['https://www.google.com/search?q=hello+world','https://yahoo.com']
        for url in urls:
            # remove empty links
            if not url:
                continue
            sub_item = {'main_url': url, 'index_of_domain': 0}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)

    def parse(self, response):
        main_url = response.meta['main_url']
        index_of_domain = response.meta['index_of_domain']
        for url in response.xpath('//a[@href] | //article[@href]'):
            href = url.xpath('@href').extract_first()
            url = urljoin(main_url, href.strip())
            if index_of_domain >= self.LIMIT_OF_DOMAIN:
                break
            index_of_domain += 1
            sub_item = {'main_url': main_url, 'index_of_domain': index_of_domain}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)
You just need another class to store the counter for each start link. Because the integer in meta is copied into every new request, each branch of the crawl counts on its own; a shared mutable object keeps a single count for all requests spawned from the same start URL:

class CurrentValue(object):
    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1

class YourSpider(scrapy.Spider):
    def start_requests(self):
        obj = CurrentValue()
        #your code
        sub_item = {'current_url': url, 'main_url': url, 'index_of_domain': obj}

    def parse(self, response):
        obj = response.meta['index_of_domain']
        for url in urls:
            obj.increment()
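For completeness, here is a minimal sketch of how that shared counter might be wired into the spider from the question. It reuses the links.txt input and the XPath from the question; the name attribute is made up, and the approach assumes requests stay in the in-memory queue (with a disk-based job queue, meta would be pickled and each request would get its own copy of the counter).

import scrapy
from urllib.parse import urljoin


class CurrentValue(object):
    """Mutable counter shared, via meta, by every request spawned from one start link."""

    def __init__(self):
        self.value = 0

    def increment(self):
        self.value += 1


class MyCrawlSpider(scrapy.Spider):
    name = 'limited_crawl'  # hypothetical name, not in the original code
    LIMIT_OF_DOMAIN = 100

    def start_requests(self):
        with open('links.txt', 'r') as f:
            urls = f.read().split('\n')
        for url in urls:
            if not url:
                continue
            counter = CurrentValue()  # one counter object per start link
            sub_item = {'main_url': url, 'index_of_domain': counter}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)

    def parse(self, response):
        main_url = response.meta['main_url']
        counter = response.meta['index_of_domain']
        for link in response.xpath('//a[@href] | //article[@href]'):
            href = link.xpath('@href').extract_first()
            url = urljoin(main_url, href.strip())
            if counter.value >= self.LIMIT_OF_DOMAIN:
                break  # this start link already reached its limit, stop scheduling
            counter.increment()
            sub_item = {'main_url': main_url, 'index_of_domain': counter}
            yield scrapy.Request(url, callback=self.parse, meta=sub_item)

Note that Scrapy's default duplicate filter drops already-seen URLs, so the number of requests actually sent per start link may end up below LIMIT_OF_DOMAIN.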