Python脚本使用while循环不断更新作业脚本,并对队列中的任务进行多处理
我正在尝试编写一个python脚本,扫描文件夹并收集更新的SQL脚本,然后自动提取SQL脚本的数据。在代码中,while循环正在扫描新的SQL文件,并发送到数据拉取函数。我很难理解如何使用while循环创建动态队列,但也无法使用多进程来运行队列中的任务 下面的代码有一个问题,while循环迭代将在一个长作业上工作,然后再移动到下一个迭代,并收集其他作业以填充空闲的处理器 更新:Python脚本使用while循环不断更新作业脚本,并对队列中的任务进行多处理,python,windows,multithreading,multiprocessing,Python,Windows,Multithreading,Multiprocessing,我正在尝试编写一个python脚本,扫描文件夹并收集更新的SQL脚本,然后自动提取SQL脚本的数据。在代码中,while循环正在扫描新的SQL文件,并发送到数据拉取函数。我很难理解如何使用while循环创建动态队列,但也无法使用多进程来运行队列中的任务 下面的代码有一个问题,while循环迭代将在一个长作业上工作,然后再移动到下一个迭代,并收集其他作业以填充空闲的处理器 更新: 感谢@pbacterio捕捉到错误,现在错误信息消失了。更改代码后,python代码可以在一次迭代中获取所有作业脚本,
_pool=pool(4,worker_main,(_队列,)
auto\u data\u pull.py
,如答案所示。您需要添加自己的职务职能
b。具有以下内容的“批处理脚本”:
启动C:\Anaconda2\python.exe C:\Users\bin\auto\u data\u pull.py
c。添加由启动计算机触发的任务,运行“批处理脚本”
这就是全部。它起作用了from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
for func, args in iter(input.get, 'STOP'):
result = compute(func, args)
output.put(result)
#
# Function used to compute result
#
def compute(func, args):
result = func(args)
return '%s says that %s%s = %s' % \
(current_process().name, func.__name__, args, result)
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
return files_dict
##### working in single thread
def single_thread():
path = "Y:/"
before = check_files(path)
sql_queue = []
while True:
time.sleep(3)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
# print sql_queue
for sql_file in sql_queue:
try:
query_sql(sql_file)
except:
pass
##### not working in queue
def multiple_thread():
NUMBER_OF_PROCESSES = 4
path = "Y:/"
sql_queue = []
before = check_files(path) # get the current dictionary of sql_files
task_queue = Queue()
done_queue = Queue()
while True: #while loop to check the changes of the files
time.sleep(5)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
# Create queues
#submit task
for task in TASKS:
task_queue.put(task)
for i in range(NUMBER_OF_PROCESSES):
p = Process(target=worker, args=(task_queue, done_queue)).start()
# try:
# p = Process(target=worker, args=(task_queue))
# p.start()
# except:
# pass
# Get and print results
print 'Unordered results:'
for i in range(len(TASKS)):
print '\t', done_queue.get()
# Tell child processes to stop
for i in range(NUMBER_OF_PROCESSES):
task_queue.put('STOP')
# single_thread()
if __name__ == '__main__':
# freeze_support()
multiple_thread()
从全局导入全局
导入操作系统,时间
导入系统
导入CSV
进口稀土
导入子流程
作为PD进口熊猫
导入PyODBC
来自多处理导入进程、队列、当前\u进程、冻结\u支持
#
#由辅助进程运行的函数
#
def工作者(输入、输出):
对于func,iter中的args(input.get,“STOP”):
结果=计算(func,args)
输出输出(结果)
#
#用于计算结果的函数
#
def计算(函数,参数):
结果=func(args)
返回“%s”表示%s%s=%s'%\
(当前进程().名称,函数。\名称,参数,结果)
定义查询sql(sql文件):#测试函数
#jsl文件处理和SQL查询,数据表将保存到csv。
fo_name=os.path.splitext(sql_文件)[0]+'.csv'
fo=打开(fo_名称'w')
打印sql\u文件
write(“sql_文件{0}已完成\n”.format(sql_文件))
返回“查询已完成”。格式(sql\U文件)
def check_文件(路径):
"""
参数--监视器的根路径
返回--{file:timestamp,…}的字典
"""
sql\u query\u dirs=glob(路径+“/*/IDABox/”)
文件\u dict={}
对于sql查询目录中的sql查询目录:
对于os.walk(sql\u query\u dir)中的根目录、目录和文件名:
[files_dict.update({(root+filename):os.path.getmtime(root+filename)})用于
如果filename.endswith('.jsl'),则文件名中的filename
返回文件
#####单线程工作
def单螺纹()
path=“Y:/”
before=检查\u文件(路径)
sql_队列=[]
尽管如此:
时间。睡眠(3)
after=检查\u文件(路径)
已添加=[f表示f在后面,如果不是f在前面]
已删除=[f代表f在前,如果不是f在后]
重叠=列表(设置(列表(之后))&设置(列表(之前)))
更新=[f为f,如果在[f]<在[f]]之后重叠
前后
sql_队列=已添加+已更新
#打印sql\u队列
对于sql_队列中的sql_文件:
尝试:
查询sql(sql文件)
除:
通过
#####不排队工作
def multiple_thread():
进程数=4
path=“Y:/”
sql_队列=[]
before=检查_文件(路径)#获取sql_文件的当前字典
任务队列=队列()
完成\u队列=队列()
while True:#while循环检查文件的更改
时间。睡眠(5)
after=检查\u文件(路径)
已添加=[f表示f在后面,如果不是f在前面]
已删除=[f代表f在前,如果不是f在后]
重叠=列表(设置(列表(之后))&设置(列表(之前)))
更新=[f为f,如果在[f]<在[f]]之后重叠
前后
sql_队列=已添加+已更新
TASKS=[(查询sql,sql文件)用于sql队列中的sql文件]
#创建队列
#提交任务
对于任务中的任务:
任务队列.放置(任务)
对于范围内的i(进程数):
p=进程(目标=工作者,参数=(任务队列,完成队列)).start()
#尝试:
#p=进程(目标=工作者,参数=(任务队列))
#p.开始()
#除:
#通过
#获取并打印结果
打印“无序结果:”
对于范围内的i(len(TASKS)):
打印'\t',完成\u队列。获取()
#告诉子进程停止
对于范围内的i(进程数):
任务队列。put('STOP')
#单螺纹
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu':
#冻结支持()
多线程()
参考资料:
from glob import glob
import os, time
import sys
import CSV
import re
import subprocess
import pandas as PD
import pypyodbc
from multiprocessing import Process, Queue, current_process, freeze_support
#
# Function run by worker processes
#
def worker(input, output):
for func, args in iter(input.get, 'STOP'):
result = compute(func, args)
output.put(result)
#
# Function used to compute result
#
def compute(func, args):
result = func(args)
return '%s says that %s%s = %s' % \
(current_process().name, func.__name__, args, result)
def query_sql(sql_file): #test func
#jsl file processing and SQL querying, data table will be saved to csv.
fo_name = os.path.splitext(sql_file)[0] + '.csv'
fo = open(fo_name, 'w')
print sql_file
fo.write("sql_file {0} is done\n".format(sql_file))
return "Query is done for \n".format(sql_file)
def check_files(path):
"""
arguments -- root path to monitor
returns -- dictionary of {file: timestamp, ...}
"""
sql_query_dirs = glob(path + "/*/IDABox/")
files_dict = {}
for sql_query_dir in sql_query_dirs:
for root, dirs, filenames in os.walk(sql_query_dir):
[files_dict.update({(root + filename): os.path.getmtime(root + filename)}) for
filename in filenames if filename.endswith('.jsl')]
return files_dict
##### working in single thread
def single_thread():
path = "Y:/"
before = check_files(path)
sql_queue = []
while True:
time.sleep(3)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
# print sql_queue
for sql_file in sql_queue:
try:
query_sql(sql_file)
except:
pass
##### not working in queue
def multiple_thread():
NUMBER_OF_PROCESSES = 4
path = "Y:/"
sql_queue = []
before = check_files(path) # get the current dictionary of sql_files
task_queue = Queue()
done_queue = Queue()
while True: #while loop to check the changes of the files
time.sleep(5)
after = check_files(path)
added = [f for f in after if not f in before]
deleted = [f for f in before if not f in after]
overlapped = list(set(list(after)) & set(list(before)))
updated = [f for f in overlapped if before[f] < after[f]]
before = after
sql_queue = added + updated
TASKS = [(query_sql, sql_file) for sql_file in sql_queue]
# Create queues
#submit task
for task in TASKS:
task_queue.put(task)
for i in range(NUMBER_OF_PROCESSES):
p = Process(target=worker, args=(task_queue, done_queue)).start()
# try:
# p = Process(target=worker, args=(task_queue))
# p.start()
# except:
# pass
# Get and print results
print 'Unordered results:'
for i in range(len(TASKS)):
print '\t', done_queue.get()
# Tell child processes to stop
for i in range(NUMBER_OF_PROCESSES):
task_queue.put('STOP')
# single_thread()
if __name__ == '__main__':
# freeze_support()
multiple_thread()
您在中的
multiple\u thread()
中在哪里定义了sql\u文件
multiprocessing.Process(target=query_sql, args=(sql_file)).start()
您没有在方法中定义sql\u文件
,而且在for循环中使用了该变量。变量的作用域仅限于for循环。尝试替换以下内容:
result = func(*args)
据此:
result = func(args)
我已经弄明白了。谢谢你的回应激发了我的想法。 现在,脚本可以运行while循环来监视文件夹中新更新/添加的SQL脚本,然后分发数据