Python: How do I schedule my spider to run every 5 minutes?

Tags: python, scrapy, scheduled-tasks, scrapy-pipeline

For a few days now I have been trying to figure out how to schedule my Scrapy spider, with no luck whatsoever. (I have tried everything from the Windows Task Scheduler to the scrapy-do library, but nothing works with my MAIN.PY.)

(My main goal is to schedule my spider NewsSpider to collect data every 5 minutes into my MySQL news_db database.)

Please take a look at my scripts below, as they are slightly modified; change them if needed. I really hope this can work.

MAIN.PY

from scrapy import cmdline
cmdline.execute("scrapy crawl news".split())

NEWS_SPIDER.PY

import scrapy
from ..items import WebspiderItem


class NewsSpider(scrapy.Spider):
    name = 'news'
    start_urls = [
        'https://www.coindesk.com/feed'
    ]

    def parse(self, response):
        pub_date = response.xpath('//pubDate/text()').extract()[0]
        page_title = response.xpath('//title/text()').extract()[2]
        page_summary = response.xpath('//description/text()').extract()[1]
        text_link = response.xpath('//link/text()').extract()[2]

        item = WebspiderItem()
        item['date'] = pub_date
        item['title'] = page_title
        item['summary'] = page_summary
        item['link'] = text_link

        yield item

ITEMS.PY

import scrapy


class WebspiderItem(scrapy.Item):
    # define the fields for your item here like:
    date = scrapy.Field()
    title = scrapy.Field()
    summary = scrapy.Field()
    link = scrapy.Field()

PIPELINES.PY

import mysql.connector


class WebspiderPipeline(object):

    def __init__(self):
        self.create_connection()

    def create_connection(self):
        self.conn = mysql.connector.connect(
            host='localhost',
            user='root',
            passwd='passordpassord',
            database='news_db'
        )
        self.curr = self.conn.cursor()

    def process_item(self, item, spider):
        self.store_db(item)
        return item

    def store_db(self, item):
        self.curr.execute("""insert into news_tb values (%s, %s, %s, %s)""", (
            item['date'],
            item['title'],
            item['summary'],
            item['link']

        ))
        self.conn.commit()
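Note that the pipeline above assumes a news_tb table already exists in news_db with four columns matching the insert. A minimal sketch of such a table, with column names and types that are my assumption rather than taken from the question, might look like this:

import mysql.connector

# One-off setup script: create the news_tb table that the pipeline inserts into.
# The column names and types below are assumptions; adjust them to your needs.
conn = mysql.connector.connect(
    host='localhost',
    user='root',
    passwd='passordpassord',
    database='news_db'
)
curr = conn.cursor()
curr.execute("""
    CREATE TABLE IF NOT EXISTS news_tb (
        `date` VARCHAR(255),
        `title` VARCHAR(255),
        `summary` TEXT,
        `link` VARCHAR(512)
    )
""")
conn.commit()
conn.close()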

Using the schedule library works for me both locally on Windows and on a Linux server. Just install it with pip install schedule. Then set up a new job by pasting the following into your main.py file:

import schedule
import time
import os

print('Scheduler initialised')
schedule.every(5).minutes.do(lambda: os.system('scrapy crawl news'))
print('Next job is set to run at: ' + str(schedule.next_run()))

while True:
    schedule.run_pending()
    time.sleep(1)
Then run python main.py in a terminal. As long as you don't close the terminal, the script will run the scrapy crawl news command every 5 minutes.

Note that it is important to use os.system() rather than cmdline.execute(), because as far as I recall, cmdline.execute() exits the scheduler's infinite while loop when the job finishes. os.system() does not do this, so the loop keeps waiting and runs the next job after another 5 minutes.
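If you would rather not shell out with os.system(), a roughly equivalent sketch using subprocess (my own variation, not part of the original answer) is:

import schedule
import subprocess
import time

def run_spider():
    # Run the crawl in a child process; the scheduler loop below keeps running.
    subprocess.run(['scrapy', 'crawl', 'news'])

schedule.every(5).minutes.do(run_spider)

while True:
    schedule.run_pending()
    time.sleep(1)

Like os.system(), subprocess.run() blocks until the crawl finishes without exiting the scheduler loop, and it also lets you inspect the crawl's exit code afterwards if you need to.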

Yes, one of the advantages of Linux is cron. Windows has nothing comparable in standard WinOS; Task Scheduler exists, but it is nowhere near as flexible. A Google search turned up a cron-like tool (克洛诺), but I have no experience with it. Or you could install a Jenkins instance on your machine. Of course, more to the point, you could simply delay for 5 minutes yourself and start over. @FrankMerrow, did you do this with Anaconda and cron? I have been struggling with that particular setup. I consider this just a hack, so the password will not be exposed in the final version. In my own past I have hidden it as a DOS environment variable and/or ROT13, at least to make "shoulder surfing" harder. There are also Windows APIs that can encrypt for a specific machine, so only someone with access to your physical machine could decode it.
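Following up on the comment above about not exposing the password, one possible sketch is to read the MySQL credentials from environment variables in the pipeline instead of hard-coding them; the variable names MYSQL_USER and MYSQL_PASSWORD below are my own assumption:

import os
import mysql.connector

# MYSQL_USER and MYSQL_PASSWORD are assumed environment variable names;
# export them before starting the crawl instead of hard-coding credentials.
conn = mysql.connector.connect(
    host='localhost',
    user=os.environ.get('MYSQL_USER', 'root'),
    passwd=os.environ['MYSQL_PASSWORD'],
    database='news_db'
)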