Python脚本使用while循环不断更新作业脚本，并对队列中的任务进行多处理_Python_Windows_Multithreading_Multiprocessing

Python脚本使用while循环不断更新作业脚本，并对队列中的任务进行多处理

python windows multithreading

Python脚本使用while循环不断更新作业脚本，并对队列中的任务进行多处理,python,windows,multithreading,multiprocessing,Python,Windows,Multithreading,Multiprocessing,我正在尝试编写一个python脚本，扫描文件夹并收集更新的SQL脚本，然后自动提取SQL脚本的数据。在代码中，while循环正在扫描新的SQL文件，并发送到数据拉取函数。我很难理解如何使用while循环创建动态队列，但也无法使用多进程来运行队列中的任务下面的代码有一个问题，while循环迭代将在一个长作业上工作，然后再移动到下一个迭代，并收集其他作业以填充空闲的处理器更新：感谢@pbacterio捕捉到错误，现在错误信息消失了。更改代码后，python代码可以在一次迭代中获取所有作业脚本，

我正在尝试编写一个python脚本，扫描文件夹并收集更新的SQL脚本，然后自动提取SQL脚本的数据。在代码中，while循环正在扫描新的SQL文件，并发送到数据拉取函数。我很难理解如何使用while循环创建动态队列，但也无法使用多进程来运行队列中的任务

下面的代码有一个问题，while循环迭代将在一个长作业上工作，然后再移动到下一个迭代，并收集其他作业以填充空闲的处理器

更新：

感谢@pbacterio捕捉到错误，现在错误信息消失了。更改代码后，python代码可以在一次迭代中获取所有作业脚本，并将脚本分发到四个处理器。然而，它将被一个长任务挂起，以便进入下一个迭代，扫描并提交新添加的作业脚本。知道如何重构代码吗

我终于找到了解决办法，请参见下面的答案。原来我要找的是

_queue=queue（）
_pool=pool（4，worker_main，（_队列，）

对于那些偶然发现类似想法的人，下面是这个自动化脚本的整个体系结构，它将共享驱动器转换为“用于SQL提取的服务器”或任何其他作业队列“服务器”

a。python脚本

auto\u data\u pull.py

，如答案所示。您需要添加自己的职务职能

b。具有以下内容的“批处理脚本”：

启动C:\Anaconda2\python.exe C:\Users\bin\auto\u data\u pull.py

c。添加由启动计算机触发的任务，运行“批处理脚本” 这就是全部。它起作用了

Python代码：

from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support

#
# Function run by worker processes
#

def worker(input, output):
    for func, args in iter(input.get, 'STOP'):
        result = compute(func, args)
        output.put(result)

#
# Function used to compute result
#

def compute(func, args):
    result = func(args)
    return '%s says that %s%s = %s' % \
        (current_process().name, func.__name__, args, result)


def query_sql(sql_file): #test func
    #jsl file processing and SQL querying, data table will be saved to csv.
    fo_name = os.path.splitext(sql_file)[0] + '.csv'
    fo = open(fo_name, 'w')
    print sql_file
    fo.write("sql_file {0} is done\n".format(sql_file))
    return "Query is done for \n".format(sql_file)


def check_files(path):
    """
    arguments -- root path to monitor
    returns   -- dictionary of {file: timestamp, ...}
    """
    sql_query_dirs = glob(path + "/*/IDABox/")

    files_dict = {}
    for sql_query_dir in sql_query_dirs:
        for root, dirs, filenames in os.walk(sql_query_dir):
            [files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for 
                     filename in filenames if filename.endswith('.jsl')]
    return files_dict


##### working in single thread
def single_thread():
    path = "Y:/"

    before = check_files(path)
    sql_queue  = [] 

    while True:
        time.sleep(3)
        after = check_files(path)
        added = [f for f in after if not f in before]
        deleted = [f for f in before if not f in after]
        overlapped = list(set(list(after)) & set(list(before)))
        updated = [f for f in overlapped if before[f] < after[f]]  

        before = after

        sql_queue = added + updated
        # print sql_queue
        for sql_file in sql_queue:
            try:
                query_sql(sql_file)
            except:
                pass


##### not working in queue
def multiple_thread():

    NUMBER_OF_PROCESSES = 4
    path = "Y:/"

    sql_queue  = [] 
    before = check_files(path) # get the current dictionary of sql_files
    task_queue = Queue()
    done_queue = Queue()

    while True:         #while loop to check the changes of the files
        time.sleep(5)
        after = check_files(path)
        added = [f for f in after if not f in before]
        deleted = [f for f in before if not f in after]
        overlapped = list(set(list(after)) & set(list(before)))
        updated = [f for f in overlapped if before[f] < after[f]]  

        before = after  
        sql_queue = added + updated   

        TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
        # Create queues

        #submit task
        for task in TASKS:
            task_queue.put(task)

        for i in range(NUMBER_OF_PROCESSES):
                p = Process(target=worker, args=(task_queue, done_queue)).start()          
            # try:
            #     p = Process(target=worker, args=(task_queue))
            #     p.start()

            # except:
            #     pass 

        # Get and print results
        print 'Unordered results:'
        for i in range(len(TASKS)):
            print '\t', done_queue.get()
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            task_queue.put('STOP')        

# single_thread()
if __name__ == '__main__':
    # freeze_support()
    multiple_thread()

从全局导入全局
导入操作系统，时间
导入系统
导入CSV
进口稀土
导入子流程
作为PD进口熊猫
导入PyODBC
来自多处理导入进程、队列、当前\u进程、冻结\u支持
#
#由辅助进程运行的函数
#
def工作者（输入、输出）：
对于func，iter中的args（input.get，“STOP”）：
结果=计算（func，args）
输出输出（结果）
#
#用于计算结果的函数
#
def计算（函数，参数）：
结果=func（args）
返回“%s”表示%s%s=%s'%\
（当前进程（）.名称，函数。\名称，参数，结果）
定义查询sql（sql文件）：#测试函数
#jsl文件处理和SQL查询，数据表将保存到csv。
fo_name=os.path.splitext（sql_文件）[0]+'.csv'
fo=打开（fo_名称'w'）
打印sql\u文件
write（“sql_文件{0}已完成\n”.format（sql_文件））
返回“查询已完成”。格式（sql\U文件）
def check_文件（路径）：
"""
参数--监视器的根路径
返回--{file:timestamp，…}的字典
"""
sql\u query\u dirs=glob（路径+“/*/IDABox/”）
文件\u dict={}
对于sql查询目录中的sql查询目录：
对于os.walk（sql\u query\u dir）中的根目录、目录和文件名：
[files_dict.update（{（root+filename）：os.path.getmtime（root+filename）}）用于
如果filename.endswith（'.jsl'），则文件名中的filename
返回文件
#####单线程工作
def单螺纹（）
path=“Y:/”
before=检查\u文件（路径）
sql_队列=[]
尽管如此：
时间。睡眠（3）
after=检查\u文件（路径）
已添加=[f表示f在后面，如果不是f在前面]
已删除=[f代表f在前，如果不是f在后]
重叠=列表（设置（列表（之后））&设置（列表（之前）））
更新=[f为f，如果在[f]<在[f]]之后重叠
前后
sql_队列=已添加+已更新
#打印sql\u队列
对于sql_队列中的sql_文件：
尝试：
查询sql（sql文件）
除：
通过
#####不排队工作
def multiple_thread（）：
进程数=4
path=“Y:/”
sql_队列=[]
before=检查_文件（路径）#获取sql_文件的当前字典
任务队列=队列（）
完成\u队列=队列（）
while True:#while循环检查文件的更改
时间。睡眠（5）
after=检查\u文件（路径）
已添加=[f表示f在后面，如果不是f在前面]
已删除=[f代表f在前，如果不是f在后]
重叠=列表（设置（列表（之后））&设置（列表（之前）））
更新=[f为f，如果在[f]<在[f]]之后重叠
前后
sql_队列=已添加+已更新
TASKS=[（查询sql，sql文件）用于sql队列中的sql文件]
#创建队列
#提交任务
对于任务中的任务：
任务队列.放置（任务）
对于范围内的i（进程数）：
p=进程（目标=工作者，参数=（任务队列，完成队列））.start（）
#尝试：
#p=进程（目标=工作者，参数=（任务队列））
#p.开始（）
#除：
#通过
#获取并打印结果
打印“无序结果：”
对于范围内的i（len（TASKS））：
打印'\t'，完成\u队列。获取（）
#告诉子进程停止
对于范围内的i（进程数）：
任务队列。put（'STOP'）
#单螺纹
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu'：
#冻结支持（）
多线程（）

参考资料：

from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support

#
# Function run by worker processes
#

def worker(input, output):
    for func, args in iter(input.get, 'STOP'):
        result = compute(func, args)
        output.put(result)

#
# Function used to compute result
#

def compute(func, args):
    result = func(args)
    return '%s says that %s%s = %s' % \
        (current_process().name, func.__name__, args, result)


def query_sql(sql_file): #test func
    #jsl file processing and SQL querying, data table will be saved to csv.
    fo_name = os.path.splitext(sql_file)[0] + '.csv'
    fo = open(fo_name, 'w')
    print sql_file
    fo.write("sql_file {0} is done\n".format(sql_file))
    return "Query is done for \n".format(sql_file)


def check_files(path):
    """
    arguments -- root path to monitor
    returns   -- dictionary of {file: timestamp, ...}
    """
    sql_query_dirs = glob(path + "/*/IDABox/")

    files_dict = {}
    for sql_query_dir in sql_query_dirs:
        for root, dirs, filenames in os.walk(sql_query_dir):
            [files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for 
                     filename in filenames if filename.endswith('.jsl')]
    return files_dict


##### working in single thread
def single_thread():
    path = "Y:/"

    before = check_files(path)
    sql_queue  = [] 

    while True:
        time.sleep(3)
        after = check_files(path)
        added = [f for f in after if not f in before]
        deleted = [f for f in before if not f in after]
        overlapped = list(set(list(after)) & set(list(before)))
        updated = [f for f in overlapped if before[f] < after[f]]  

        before = after

        sql_queue = added + updated
        # print sql_queue
        for sql_file in sql_queue:
            try:
                query_sql(sql_file)
            except:
                pass


##### not working in queue
def multiple_thread():

    NUMBER_OF_PROCESSES = 4
    path = "Y:/"

    sql_queue  = [] 
    before = check_files(path) # get the current dictionary of sql_files
    task_queue = Queue()
    done_queue = Queue()

    while True:         #while loop to check the changes of the files
        time.sleep(5)
        after = check_files(path)
        added = [f for f in after if not f in before]
        deleted = [f for f in before if not f in after]
        overlapped = list(set(list(after)) & set(list(before)))
        updated = [f for f in overlapped if before[f] < after[f]]  

        before = after  
        sql_queue = added + updated   

        TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
        # Create queues

        #submit task
        for task in TASKS:
            task_queue.put(task)

        for i in range(NUMBER_OF_PROCESSES):
                p = Process(target=worker, args=(task_queue, done_queue)).start()          
            # try:
            #     p = Process(target=worker, args=(task_queue))
            #     p.start()

            # except:
            #     pass 

        # Get and print results
        print 'Unordered results:'
        for i in range(len(TASKS)):
            print '\t', done_queue.get()
        # Tell child processes to stop
        for i in range(NUMBER_OF_PROCESSES):
            task_queue.put('STOP')        

# single_thread()
if __name__ == '__main__':
    # freeze_support()
    multiple_thread()

使用python脚本监视文件更改：

多处理：

您在中的

multiple\u thread（）

中在哪里定义了

sql\u文件

multiprocessing.Process(target=query_sql, args=(sql_file)).start()

您没有在方法中定义

sql\u文件

，而且在for循环中使用了该变量。变量的作用域仅限于for循环。

尝试替换以下内容：

result = func(*args)

据此：

result = func(args)

我已经弄明白了。谢谢你的回应激发了我的想法。现在，脚本可以运行while循环来监视文件夹中新更新/添加的SQL脚本，然后分发数据