Python 如何安排spider每5分钟运行一次?
几天来，我一直在尝试给我的 Scrapy 爬虫安排定时运行，但毫无进展（我试过从 Windows 任务计划程序到 scrapyd 的所有方法，但在我的 MAIN.PY 上都不起作用）。我的主要目标是：让 NewsSpider 爬虫每隔 5 分钟抓取一次数据并存入 MySQL 的 news_db 数据库。请看我的脚本，如有需要请直接修改，我真的希望这能奏效。MAIN.PY：
# main.py — launch the "news" spider through Scrapy's CLI entry point.
from scrapy import cmdline

# Equivalent to `scrapy crawl news` on the command line; passing the argv
# list directly avoids the intermediate str.split() call.
cmdline.execute(["scrapy", "crawl", "news"])
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the CoinDesk RSS feed and yields one news item."""

    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        """Extract date, title, summary and link from the feed response.

        NOTE(review): the fixed list indices (0/2/1/2) assume a specific
        RSS layout — confirm against the live feed structure.
        """
        extracted = {
            'date': response.xpath('//pubDate/text()').extract()[0],
            'title': response.xpath('//title/text()').extract()[2],
            'summary': response.xpath('//description/text()').extract()[1],
            'link': response.xpath('//link/text()').extract()[2],
        }
        item = WebspiderItem()
        for field, value in extracted.items():
            item[field] = value
        yield item
import scrapy
class WebspiderItem(scrapy.Item):
    """Container for one scraped news entry."""
    # define the fields for your item here like:
    date = scrapy.Field()  # publication date text taken from <pubDate>
    title = scrapy.Field()  # article title text taken from <title>
    summary = scrapy.Field()  # article description taken from <description>
    link = scrapy.Field()  # article URL taken from <link>
import mysql.connector
class WebspiderPipeline(object):
    """Scrapy item pipeline that inserts each scraped item into MySQL.

    Fix: the original opened a connection and cursor but never closed
    them; `close_spider` now releases both when the crawl ends.
    """

    def __init__(self):
        # Connection is opened eagerly when Scrapy instantiates the pipeline.
        self.create_connection()

    def create_connection(self):
        """Open the MySQL connection to news_db and create a cursor."""
        # NOTE(review): credentials are hard-coded; move them to Scrapy
        # settings or environment variables before deploying.
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='passordpassord',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def close_spider(self, spider):
        """Scrapy hook: release the cursor and connection at spider close."""
        try:
            self.curr.close()
        finally:
            self.conn.close()

    def process_item(self, item, spider):
        """Persist the item, then return it unchanged for later pipelines."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row into news_tb using a parameterized query."""
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()
NEWS_SPIDER.PY
# main.py — launch the "news" spider through Scrapy's CLI entry point.
from scrapy import cmdline

# Equivalent to `scrapy crawl news` on the command line; passing the argv
# list directly avoids the intermediate str.split() call.
cmdline.execute(["scrapy", "crawl", "news"])
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the CoinDesk RSS feed and yields one news item."""

    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        """Extract date, title, summary and link from the feed response.

        NOTE(review): the fixed list indices (0/2/1/2) assume a specific
        RSS layout — confirm against the live feed structure.
        """
        extracted = {
            'date': response.xpath('//pubDate/text()').extract()[0],
            'title': response.xpath('//title/text()').extract()[2],
            'summary': response.xpath('//description/text()').extract()[1],
            'link': response.xpath('//link/text()').extract()[2],
        }
        item = WebspiderItem()
        for field, value in extracted.items():
            item[field] = value
        yield item
import scrapy
class WebspiderItem(scrapy.Item):
    """Container for one scraped news entry."""
    # define the fields for your item here like:
    date = scrapy.Field()  # publication date text taken from <pubDate>
    title = scrapy.Field()  # article title text taken from <title>
    summary = scrapy.Field()  # article description taken from <description>
    link = scrapy.Field()  # article URL taken from <link>
import mysql.connector
class WebspiderPipeline(object):
    """Scrapy item pipeline that inserts each scraped item into MySQL.

    Fix: the original opened a connection and cursor but never closed
    them; `close_spider` now releases both when the crawl ends.
    """

    def __init__(self):
        # Connection is opened eagerly when Scrapy instantiates the pipeline.
        self.create_connection()

    def create_connection(self):
        """Open the MySQL connection to news_db and create a cursor."""
        # NOTE(review): credentials are hard-coded; move them to Scrapy
        # settings or environment variables before deploying.
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='passordpassord',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def close_spider(self, spider):
        """Scrapy hook: release the cursor and connection at spider close."""
        try:
            self.curr.close()
        finally:
            self.conn.close()

    def process_item(self, item, spider):
        """Persist the item, then return it unchanged for later pipelines."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row into news_tb using a parameterized query."""
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()
ITEMS.PY
# main.py — launch the "news" spider through Scrapy's CLI entry point.
from scrapy import cmdline

# Equivalent to `scrapy crawl news` on the command line; passing the argv
# list directly avoids the intermediate str.split() call.
cmdline.execute(["scrapy", "crawl", "news"])
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the CoinDesk RSS feed and yields one news item."""

    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        """Extract date, title, summary and link from the feed response.

        NOTE(review): the fixed list indices (0/2/1/2) assume a specific
        RSS layout — confirm against the live feed structure.
        """
        extracted = {
            'date': response.xpath('//pubDate/text()').extract()[0],
            'title': response.xpath('//title/text()').extract()[2],
            'summary': response.xpath('//description/text()').extract()[1],
            'link': response.xpath('//link/text()').extract()[2],
        }
        item = WebspiderItem()
        for field, value in extracted.items():
            item[field] = value
        yield item
import scrapy
class WebspiderItem(scrapy.Item):
    """Container for one scraped news entry."""
    # define the fields for your item here like:
    date = scrapy.Field()  # publication date text taken from <pubDate>
    title = scrapy.Field()  # article title text taken from <title>
    summary = scrapy.Field()  # article description taken from <description>
    link = scrapy.Field()  # article URL taken from <link>
import mysql.connector
class WebspiderPipeline(object):
    """Scrapy item pipeline that inserts each scraped item into MySQL.

    Fix: the original opened a connection and cursor but never closed
    them; `close_spider` now releases both when the crawl ends.
    """

    def __init__(self):
        # Connection is opened eagerly when Scrapy instantiates the pipeline.
        self.create_connection()

    def create_connection(self):
        """Open the MySQL connection to news_db and create a cursor."""
        # NOTE(review): credentials are hard-coded; move them to Scrapy
        # settings or environment variables before deploying.
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='passordpassord',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def close_spider(self, spider):
        """Scrapy hook: release the cursor and connection at spider close."""
        try:
            self.curr.close()
        finally:
            self.conn.close()

    def process_item(self, item, spider):
        """Persist the item, then return it unchanged for later pipelines."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row into news_tb using a parameterized query."""
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()
PIPELINES.PY
# main.py — launch the "news" spider through Scrapy's CLI entry point.
from scrapy import cmdline

# Equivalent to `scrapy crawl news` on the command line; passing the argv
# list directly avoids the intermediate str.split() call.
cmdline.execute(["scrapy", "crawl", "news"])
import scrapy
from ..items import WebspiderItem
class NewsSpider(scrapy.Spider):
    """Spider that reads the CoinDesk RSS feed and yields one news item."""

    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        """Extract date, title, summary and link from the feed response.

        NOTE(review): the fixed list indices (0/2/1/2) assume a specific
        RSS layout — confirm against the live feed structure.
        """
        extracted = {
            'date': response.xpath('//pubDate/text()').extract()[0],
            'title': response.xpath('//title/text()').extract()[2],
            'summary': response.xpath('//description/text()').extract()[1],
            'link': response.xpath('//link/text()').extract()[2],
        }
        item = WebspiderItem()
        for field, value in extracted.items():
            item[field] = value
        yield item
import scrapy
class WebspiderItem(scrapy.Item):
    """Container for one scraped news entry."""
    # define the fields for your item here like:
    date = scrapy.Field()  # publication date text taken from <pubDate>
    title = scrapy.Field()  # article title text taken from <title>
    summary = scrapy.Field()  # article description taken from <description>
    link = scrapy.Field()  # article URL taken from <link>
import mysql.connector
class WebspiderPipeline(object):
    """Scrapy item pipeline that inserts each scraped item into MySQL.

    Fix: the original opened a connection and cursor but never closed
    them; `close_spider` now releases both when the crawl ends.
    """

    def __init__(self):
        # Connection is opened eagerly when Scrapy instantiates the pipeline.
        self.create_connection()

    def create_connection(self):
        """Open the MySQL connection to news_db and create a cursor."""
        # NOTE(review): credentials are hard-coded; move them to Scrapy
        # settings or environment variables before deploying.
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='passordpassord',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def close_spider(self, spider):
        """Scrapy hook: release the cursor and connection at spider close."""
        try:
            self.curr.close()
        finally:
            self.conn.close()

    def process_item(self, item, spider):
        """Persist the item, then return it unchanged for later pipelines."""
        self.store_db(item)
        return item

    def store_db(self, item):
        """Insert one row into news_tb using a parameterized query."""
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']
        ))
        self.conn.commit()
此方法在 Windows 本地和 Linux 服务器上都可以使用。只需运行 pip install schedule 安装 schedule 库，
然后把以下内容粘贴到 main.py
文件中来设置新任务：
# Scheduler: run `scrapy crawl news` every 5 minutes via the schedule library.
import schedule
import time
import os

print('Scheduler initialised')


def run_crawl():
    # os.system keeps this process (and the loop below) alive after each
    # crawl finishes, unlike cmdline.execute() which would exit it.
    os.system('scrapy crawl news')


schedule.every(5).minutes.do(run_crawl)
print('Next job is set to run at: ' + str(schedule.next_run()))

while True:
    schedule.run_pending()
    time.sleep(1)
然后在终端中运行python main.py
。只要您不关闭终端,脚本将每隔5分钟运行一次scrapy crawl news
命令
请注意,使用os.system()
而不是cmdline.execute()
非常重要,因为据我回忆,cmdline.execute()
在作业完成时退出调度程序的无限while循环os.system()
不会这样做，因此调度器会等待 5 分钟后再次运行任务。此方法在 Windows 本地和 Linux 服务器上对我都有效。只需运行 pip install schedule 安装该库，
然后把以下内容粘贴到 main.py
文件中来设置新任务：
# Scheduler: run `scrapy crawl news` every 5 minutes via the schedule library.
import schedule
import time
import os

print('Scheduler initialised')


def run_crawl():
    # os.system keeps this process (and the loop below) alive after each
    # crawl finishes, unlike cmdline.execute() which would exit it.
    os.system('scrapy crawl news')


schedule.every(5).minutes.do(run_crawl)
print('Next job is set to run at: ' + str(schedule.next_run()))

while True:
    schedule.run_pending()
    time.sleep(1)
然后在终端中运行python main.py
。只要您不关闭终端,脚本将每隔5分钟运行一次scrapy crawl news
命令
请注意,使用
os.system()
而不是cmdline.execute()
非常重要,因为据我回忆,cmdline.execute()
在作业完成时退出调度程序的无限while循环os.system()
不会执行此操作,因此会在又过了5分钟后等待另一个作业运行。是的,Linux的一个优点是cron。Windows在标准WinOS中没有类似的功能。任务调度器存在,但它几乎没有那么灵活。通过谷歌搜索找到了克洛诺,但我没有这方面的经验。或者您可以在计算机上安装Jenkins实例。当然,更重要的是,你可以自己延迟5分钟,然后重新开始。@FrankMerrow,你用Anaconda和cron做到了吗?我一直在与这个特定的配置进行斗争。我认为这只是一个黑客攻击,所以密码在最终版本中不会公开。在我自己的过去,我把它作为DOS环境变量和/或ROT13隐藏起来,至少让“肩上冲浪”更难。还有一些Windows API可以对特定机器进行加密,因此只有能够访问您的物理机器的人才能对其进行解码。Windows在标准WinOS中没有类似的功能。任务调度器存在,但它几乎没有那么灵活。通过谷歌搜索找到了克洛诺,但我没有这方面的经验。或者您可以在计算机上安装Jenkins实例。当然,更重要的是,你可以自己延迟5分钟,然后重新开始。@FrankMerrow,你用Anaconda和cron做到了吗?我一直在与这个特定的配置进行斗争。我认为这只是一个黑客攻击,所以密码在最终版本中不会公开。在我自己的过去,我把它作为DOS环境变量和/或ROT13隐藏起来,至少让“肩上冲浪”更难。还有一些Windows API可以对特定机器进行加密,因此只有能够访问您的物理机器的人才能对其进行解码。