Python Scrapy stops at 'crawler.configure()' instead of 'reactor.run()'


Here is the code that starts the crawler:

def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()  # Script stops here
    crawler.crawl(spider)  # This line is never reached
    crawler.start()
    log.start()
    reactor.run()  # Script should stop here
It is almost a direct copy of this tutorial:

No exceptions are raised. The only way I know where the script stops is by debugging with print statements.

Here is the definition of RssSpider (rss_spider.py):
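The spider source is not reproduced above; purely for illustration, a minimal Scrapy 0.x-era RSS spider might look like the sketch below. The URLs, item fields, and XPath expressions are assumptions, not the actual rss_spider.py:

from scrapy.spider import BaseSpider
from scrapy.selector import XmlXPathSelector
from scrapy.item import Item, Field


class RssItem(Item):
    # Hypothetical fields; the real item definition is unknown.
    title = Field()
    link = Field()


class RssSpider(BaseSpider):
    name = 'rss'
    start_urls = ['http://example.com/feed.xml']  # placeholder feed URL

    def parse(self, response):
        xxs = XmlXPathSelector(response)
        # Emit one item per <item> element in the feed.
        for node in xxs.select('//item'):
            yield RssItem(
                title=node.select('title/text()').extract(),
                link=node.select('link/text()').extract(),
            )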

Here is the code that calls start_crawler:

from multiprocessing import Pipe, Process
from threading import Thread

from pubsub import pub  # PyPubSub, assumed from the pub.sendMessage call below
from scrapy import log
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from twisted.internet import reactor

from crawler.spiders.rss_spider import RssSpider  # assumed module path


def setup():
    process, pipe_end = start_crawler_process()
    link_pipe_end_to_event(pipe_end, 'new_rss_feed')
    return process


def link_pipe_end_to_event(pipe_end, event_name):
    def thread_func():
        while True:
            item = pipe_end.recv()
            announce_item(item, event_name)
    thread = Thread(target=thread_func)
    thread.start()


def start_crawler_process():
    parent_conn, child_conn = Pipe()
    p = Process(target=start_crawler, args=(child_conn,))
    p.start()
    return p, parent_conn


def start_crawler(pipe_connection):
    spider = RssSpider()
    crawler = Crawler(Settings())
    crawler.configure()  # Script stops here
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()


def announce_item(item, event_name):
    pub.sendMessage(event_name, doc=item)
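For completeness, a minimal driver for this module might look like the following (it is not part of the original question); the __main__ guard matters because multiprocessing re-imports the module in the child process:

if __name__ == '__main__':
    # Launch the crawler in its child process and wait for it to exit.
    crawler_process = setup()
    crawler_process.join()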
settings.py

BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

CONCURRENT_REQUESTS = 100
LOG_LEVEL = 'DEBUG'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 15
DEPTH_LIMIT = 50
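One observation, not raised in the original thread: Crawler(Settings()) builds a Settings object containing only Scrapy's defaults, so the settings.py above is never read. To pick up the project settings, Scrapy provides get_project_settings, which locates the module through scrapy.cfg or the SCRAPY_SETTINGS_MODULE environment variable:

from scrapy.utils.project import get_project_settings

# Loads settings.py instead of the bare defaults that Settings() gives.
crawler = Crawler(get_project_settings())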

The documentation link you pasted points to a very old version of Scrapy. What version are you using? The latest documentation is at: – elias

@elias Nice catch. But the code is basically the same. I think the problem is that I'm trying to launch it as a separate process, but I don't know why that would be a problem.
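For anyone reading this on a recent Scrapy release: the Crawler()/configure()/reactor.run() sequence from that old tutorial was later replaced by scrapy.crawler.CrawlerProcess, which manages the Twisted reactor itself. A rough modern equivalent of start_crawler, with the spider import path assumed:

from multiprocessing import Process

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from crawler.spiders.rss_spider import RssSpider  # assumed module path


def start_crawler(pipe_connection):
    # CrawlerProcess starts and stops the reactor on its own;
    # start() blocks until the crawl finishes.
    process = CrawlerProcess(get_project_settings())
    process.crawl(RssSpider)
    process.start()


if __name__ == '__main__':
    p = Process(target=start_crawler, args=(None,))
    p.start()
    p.join()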