Python Scrapy在`crawler.configure()`而不是`reactor.run()`处停止
以下是启动爬虫程序的代码:Python Scrapy在`crawler.configure()`而不是`reactor.run()`处停止,python,web-scraping,scrapy,twisted,Python,Web Scraping,Scrapy,Twisted,以下是启动爬虫程序的代码: def start_crawler(pipe_connection): spider = RssSpider() crawler = Crawler(Settings()) crawler.configure() # Script stops here crawler.crawl(spider) # This line is never reached crawler.start() log.start()
def start_crawler(pipe_connection):
spider = RssSpider()
crawler = Crawler(Settings())
crawler.configure() # Script stops here
crawler.crawl(spider) # This line is never reached
crawler.start()
log.start()
reactor.run() # Script should stop here
def setup():
process, pipe_end = start_crawler_process()
link_pipe_end_to_event(pipe_end, 'new_rss_feed')
return process
def link_pipe_end_to_event(pipe_end, event_name):
def thread_func():
while True:
item = pipe_end.recv()
announce_item(item, event_name)
thread = Thread(target=thread_func)
thread.start()
def start_crawler_process():
parent_conn, child_conn = Pipe()
p = Process(target=start_crawler, args=(child_conn,))
p.start()
return p, parent_conn
def start_crawler(pipe_connection):
spider = RssSpider()
crawler = Crawler(Settings())
crawler.configure() # Script stops here
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
def announce_item(item, event_name):
pub.sendMessage(event_name, doc=item)
它几乎是本教程的直接副本:
没有抛出任何异常。我是通过插入 print 语句调试,才发现脚本停在了这里
以下是RssSpider的定义(rss_spider.py):
下面是调用 start_crawler 的代码:
def start_crawler(pipe_connection):
spider = RssSpider()
crawler = Crawler(Settings())
crawler.configure() # Script stops here
crawler.crawl(spider) # This line is never reached
crawler.start()
log.start()
reactor.run() # Script should stop here
def setup():
process, pipe_end = start_crawler_process()
link_pipe_end_to_event(pipe_end, 'new_rss_feed')
return process
def link_pipe_end_to_event(pipe_end, event_name):
def thread_func():
while True:
item = pipe_end.recv()
announce_item(item, event_name)
thread = Thread(target=thread_func)
thread.start()
def start_crawler_process():
parent_conn, child_conn = Pipe()
p = Process(target=start_crawler, args=(child_conn,))
p.start()
return p, parent_conn
def start_crawler(pipe_connection):
spider = RssSpider()
crawler = Crawler(Settings())
crawler.configure() # Script stops here
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
def announce_item(item, event_name):
pub.sendMessage(event_name, doc=item)
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
CONCURRENT_REQUESTS = 100
LOG_LEVEL = 'DEBUG'
COOKIES_ENABLED = False
RETRY_ENABLED = False
DOWNLOAD_TIMEOUT = 15
DEPTH_LIMIT = 50
您粘贴的文档链接指向一个非常旧的Scrapy版本。你用的是什么版本?最新的文档位于:@elias Nice catch。但代码基本相同。我认为问题在于我试图将其作为一个单独的进程来启动,但我不知道为什么这会是一个问题。