Why are member variables of a Scrapy extension class not visible in the spider, but accessible through a pipeline?

Tags: python, web-scraping, scrapy, scrapy-spider

I created an extension in Scrapy to set a common path variable and a few other things, so that if the output path ever changes, only one file needs to be modified. But I cannot access that path inside the spider.

Here is the extension code:

import datetime,re,os,random
from scrapy import signals
from scrapy.spider import Spider
from scrapy.conf import settings

class Common(object):
    output_dir = ''

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings

        if settings['DATE']:
            cls.output_dir = 'output/' + settings['DATE'] + '/'
        else:
            cls.output_dir = 'output/' + datetime.date.today().strftime('%Y-%m-%d') + '/'
The above extension is enabled in settings as follows:

EXTENSIONS = {'scrapyproject.common.Common':500,}
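As an aside (not from the original question): output_dir is a plain class attribute, so every module that imports Common sees whatever was last assigned to it, and until from_crawler has actually run it still holds the empty string set in the class body. A minimal sketch of that behaviour, with a made-up example value:

from scrapyproject.common import Common

# Nothing has called from_crawler yet, so the class attribute still holds
# the default defined on the class body.
print Common.output_dir        # prints an empty string

# After an assignment (from_crawler does the equivalent of this), every
# module that imports Common sees the same value.
Common.output_dir = 'output/some-date/'
print Common.output_dir        # prints 'output/some-date/'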
My spider code is as follows:

from scrapyproject.spiderCommon import *

class dmozSpider(CrawlSpider):
    name = 'dmozSpider'
    allowed_domains = ['www.dmoz.org']
    start_urls = ['http://www.dmoz.org']

    rules = (
        Rule(SgmlLinkExtractor(allow=(),), callback='parse_item', follow=True),
    )

    def __init__(self, *a, **kw):
        super(dmozSpider, self).__init__(self, *a, **kw)
        dispatcher.connect(self.my_spider_opened, signals.spider_opened)

    def parse_item(self, response):
        sel = Selector(response)

        vifUrls = sel.xpath('//ul[@class="directory dir-col"]/li/a/@href').extract()
        with open(Common.output_dir + self.name + '.csv', 'a') as f:
            for vifUrl in vifUrls:
                print vifUrl
                f.write("%s\n" % vifUrl)
        pass

    def my_spider_opened(self, spider):
        fo = open(Common.output_dir + self.name + '.csv', "w+")
        fo.truncate()
        fo.close()
The spiderCommon file contains the following:

from scrapyproject.common import *
from scrapy.selector import Selector
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
The value of Common.output_dir is not accessible in the spider, but I can access it in the pipeline:

from scrapyproject.common import *

class XmlExportPipeline(object):
    def __init__(self, **kwargs):
        self.file_count = 1

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print Common.output_dir

    def spider_closed(self, spider):
        self.file_count = self.file_count + 1

    def process_item(self, item, spider):
        return item
When I try to run the spider, it gets stuck at "DEBUG: Web service listening on 0.0.0.0:6080" and then finishes without crawling any links. The reason is that it never gets the value of Common.output_dir. Can anyone tell me where I am going wrong?
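For reference, a minimal diagnostic sketch (an addition, not part of the question's code; the spider name is made up) that prints Common.output_dir both when the spider object is constructed and when the spider_opened signal fires, to show whether the extension's from_crawler has run by each point. It assumes the same Scrapy 0.x-era APIs used above:

from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher
from scrapy.contrib.spiders import CrawlSpider

from scrapyproject.common import Common


class OutputDirDebugSpider(CrawlSpider):
    """Hypothetical spider used only to inspect Common.output_dir."""
    name = 'outputDirDebug'
    start_urls = []

    def __init__(self, *a, **kw):
        super(OutputDirDebugSpider, self).__init__(*a, **kw)
        # Value at spider construction time.
        print 'output_dir at __init__: %r' % Common.output_dir
        dispatcher.connect(self.report_output_dir, signals.spider_opened)

    def report_output_dir(self, spider):
        # Value after the spider_opened signal has fired.
        print 'output_dir at spider_opened: %r' % Common.output_dir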