Python 在另一个循环中多处理一个循环_Python_Multiprocessing

Python：对嵌套在另一个循环内部的循环进行多进程处理

python

Python 在另一个循环中多处理一个循环,python,multiprocessing,Python,Multiprocessing,此代码一次处理多个PDF。在每个PDF中，它使用opencv在每个页面中循环，以检测每个页面是否是4up页面与1up页面，如果是4up页面，则循环将该页面的索引附加到名为“ind”的列表中。由于处理页面的顺序无关紧要，因此我希望对opencv检测进行多处理。但我是Python的新手，所以似乎无法使用池函数使其工作。这是原始代码（单线程）从pdf2image导入从路径转换进口cv2 将numpy作为np导入 pdffiles='sample.pdf' 对于PDF中的p： pages=从路径转

此代码一次处理多个 PDF。对于每个 PDF，它用 OpenCV 逐页循环，检测该页是 4-up 页面还是 1-up 页面；如果是 4-up 页面，就把该页的索引追加到名为 “ind” 的列表中。由于页面的处理顺序无关紧要，我希望对 OpenCV 检测这一步进行多进程处理。但我是 Python 新手，一直没能用 Pool 函数让它运行起来。下面是原始的（单线程）代码：

from pdf2image import convert_from_path
import cv2
import numpy as np
pdffiles='sample.pdf'
for p in pdffiles:
    pages=convert_from_path(pdffiles)
    ind=[]
    for i in range(len(pages)):
        page=pages[i]
        gray=np.array(page)
        gray=cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
        (thresh, bw)=cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        edges=cv2.Canny(bw,0,255)
        minLineLength=850
        lines=cv2.HoughLinesP(image=edges,rho=5,theta=np.pi/90, threshold=1000,lines=np.array([]), minLineLength=minLineLength,maxLineGap=3)
        linessub=lines[((lines[:,0,0]>750) & (lines[:,0,0]<950))|((lines[:,0,1]>1000) & (lines[:,0,1]<1200))]
        if len(linessub) > 1: ind.append(i)
print(ind)

编辑：我已经简化了我发布的内容，只包含了我试图多处理的部分。你应该能够在我上传到这里的sample.pdf上运行代码：在这个文件上，打印的ind应该是

[1,2]

有两种方法可以并发地处理任务：多进程（multiprocessing）和多线程（threading）。哪种方法更适合你，需要你自己试一试。

线程化

这是一个基本的例子，可以帮助你

import threading

results = []
threads = []


def task(arg):
    """Record ``arg`` in the module-level ``results`` list shared by all threads."""
    results.append(arg)


for i in range(10):
    # ``args`` must be supplied: without it ``task()`` is called with no
    # argument and every thread dies with a TypeError.
    t = threading.Thread(target=task, args=(i,))
    threads.append(t)
    t.start()

# Wait for every worker to finish before reading the shared list.
for t in threads:
    t.join()

print(results)

下面是一个应用于您的代码的示例。我无法运行您的示例，因此未对其进行测试

import os
import threading
import timeit

import cv2
import numpy as np
from pdf2image import convert_from_path


def fourup_detect(page, index, results):
    """Append ``index`` to ``results`` when ``page`` looks like a 4-up page.

    The function is named ``fourup_detect`` because a Python identifier
    cannot start with a digit (``def 4up_detect`` is a SyntaxError).

    Args:
        page: a single page image from ``convert_from_path`` (not the whole
            list of pages; ``np.array`` of the list raises TypeError).
        index: position of ``page`` inside its PDF.
        results: list shared by the worker threads; receives ``index``.
    """
    gray = cv2.cvtColor(np.array(page), cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    edges = cv2.Canny(bw, 0, 255)
    lines = cv2.HoughLinesP(image=edges, rho=5, theta=np.pi / 90,
                            threshold=1000, lines=np.array([]),
                            minLineLength=1100, maxLineGap=3)
    if lines is None:
        # HoughLinesP yields None (not an empty array) when nothing is found.
        return
    x1 = lines[:, 0, 0]
    y1 = lines[:, 0, 1]
    # Keep only the long lines that start inside the two expected bands.
    linessub = lines[((x1 > 750) & (x1 < 950)) | ((y1 > 1000) & (y1 < 1200))]
    if len(linessub) > 1:
        results.append(index)


curdir = os.path.dirname(os.path.realpath(__file__))
os.chdir(curdir)
files = os.listdir(curdir)
# The empty last component keeps the trailing separator on every platform,
# replacing the manual Windows / non-Windows branch.
dstdir = os.path.join(curdir, '1up', '')
if not os.path.exists(dstdir):
    os.makedirs(dstdir)
pdffiles = [f for f in files if f.endswith('.pdf')]

for p in pdffiles:
    pages = convert_from_path(p)

    # Page indexes are only meaningful within one PDF, so collect them per file.
    ind = []
    threads = [
        threading.Thread(target=fourup_detect, args=(page, i, ind))
        for i, page in enumerate(pages)
    ]
    for t in threads:
        t.start()
    # The results must not be read before every worker has finished.
    for t in threads:
        t.join()

    if not ind:
        # No 4-up page in this PDF; min()/max() would raise ValueError.
        continue

    startpg = min(ind)
    endpg = max(ind)
    page = pages[startpg]
    image = np.array(page)
    height = image.shape[0] // 2
    width = image.shape[1] // 2
    # NOTE: the original snippet stops here; the code that consumes
    # startpg/endpg/height/width is not part of the excerpt.

应用到您的代码中，大致会是下面这个样子。不过同样地，我无法运行它，因此未经测试：

import multiprocessing
import os
import timeit

import cv2
import numpy as np
from pdf2image import convert_from_path


def fourup_detect(page, index, results):
    """Append ``index`` to ``results`` when ``page`` looks like a 4-up page.

    The function is named ``fourup_detect`` because a Python identifier
    cannot start with a digit (``def 4up_detect`` is a SyntaxError).

    Args:
        page: a single page image from ``convert_from_path`` (not the whole
            list of pages; ``np.array`` of the list raises TypeError).
        index: position of ``page`` inside its PDF.
        results: shared list (a ``Manager().list()`` proxy when called from
            another process); receives ``index``.
    """
    gray = cv2.cvtColor(np.array(page), cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    edges = cv2.Canny(bw, 0, 255)
    lines = cv2.HoughLinesP(image=edges, rho=5, theta=np.pi / 90,
                            threshold=1000, lines=np.array([]),
                            minLineLength=1100, maxLineGap=3)
    if lines is None:
        # HoughLinesP yields None (not an empty array) when nothing is found.
        return
    x1 = lines[:, 0, 0]
    y1 = lines[:, 0, 1]
    # Keep only the long lines that start inside the two expected bands.
    linessub = lines[((x1 > 750) & (x1 < 950)) | ((y1 > 1000) & (y1 < 1200))]
    if len(linessub) > 1:
        results.append(index)


def main():
    """Detect the 4-up pages of every PDF next to this script, one process per page."""
    curdir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(curdir)
    files = os.listdir(curdir)
    # The empty last component keeps the trailing separator on every platform,
    # replacing the manual Windows / non-Windows branch.
    dstdir = os.path.join(curdir, '1up', '')
    if not os.path.exists(dstdir):
        os.makedirs(dstdir)
    pdffiles = [f for f in files if f.endswith('.pdf')]

    with multiprocessing.Manager() as manager:
        for pdf in pdffiles:
            pages = convert_from_path(pdf)

            # Page indexes are only meaningful within one PDF.
            shared = manager.list()
            # NOTE: one process per page is unbounded; for large PDFs a
            # multiprocessing.Pool would cap the number of workers.
            procs = [
                multiprocessing.Process(target=fourup_detect,
                                        args=(page, i, shared))
                for i, page in enumerate(pages)
            ]
            for proc in procs:
                proc.start()
            for proc in procs:
                proc.join()

            # Slicing the proxy copies the contents into a plain local list.
            ind = sorted(shared[:])
            if not ind:
                # No 4-up page in this PDF; min()/max() would raise ValueError.
                continue

            startpg = ind[0]
            endpg = ind[-1]
            page = pages[startpg]
            image = np.array(page)
            height = image.shape[0] // 2
            width = image.shape[1] // 2
            # NOTE: the original snippet stops here; the code that consumes
            # startpg/endpg/height/width is not part of the excerpt.


# Required for multiprocessing: under the spawn/forkserver start methods the
# child processes import this module, and must not re-run the script body.
if __name__ == "__main__":
    main()

import multiprocessing
import timeit


def 4up_detect(pages, results):
    gray = np.array(pages)
    gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
    (thresh, bw) = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    edges = cv2.Canny(bw,0,255)
    minLineLength=1100
    lines = cv2.HoughLinesP(image=edges,rho=5,theta=np.pi/90, threshold=1000,lines=np.array([]), minLineLength=minLineLength,maxLineGap=3)
    linessub = lines[((lines[:,0,0]>750) & (lines[:,0,0]<950))|((lines[:,0,1]>1000) & (lines[:,0,1]<1200))]
    if len(linessub) > 1: 
        results.append(pages)


curdir = os.path.dirname(os.path.realpath(__file__))
os.chdir(curdir)
files = os.listdir(curdir)
if os.environ.get('OS','') == 'Windows_NT':
    dstdir = os.path.join(curdir, '1up\\')
else:
    dstdir = os.path.join(curdir, '1up/')
if not os.path.exists(dstdir):
   os.makedirs(dstdir)
pdffiles = [f for f in files if f.endswith('.pdf')]

manager = multiprocessing.Manager()
ind = manager.list()
procs = []

for p in pdffiles: 
    pages=convert_from_path(p)  
    p = multiprocessing.Process(target=task, args=(pages, ind))
    procs.append(p)
    p.start()

for p in procs:
    p.join()


startpg = min(ind)
endpg = max(ind)
page = pages[startpg]
image = np.array(page)
height = int(math.floor(image.shape[0])/2)
width = int(math.floor(image.shape[1])/2)

由于全局解释器锁（GIL）的存在，Python 实际上是单线程的。

请编写一个脚本，启动多个 Python 解释器并合并它们的结果。

@您想在这里对什么进行多进程处理？您说要一次处理一个 PDF，但在第二段代码（您已经尝试过的那段）中并没有对页面进行循环，只是把 `pages` 直接传给了函数。`pages` 是列表吗？

非常感谢您的帮助。我两种方法都试过了，`def 4up_detect` 这一行似乎出现了语法错误：`def 4up_detect(pages, results):` SyntaxError: invalid syntax (, line 1)

@语法错误是由于函数名以数字（4up）开头。当我将其重命名为 fourup 后，错误消失了。然而，现在使用上面的多线程方法时，`gray = np.array(pages)` 这一行似乎不接受 pages 是一个列表：{TypeError: int() argument must be a string, a bytes-like object or a number, not 'PpmImageFile'}。因为我只想对 pdffiles 循环内部的 pages 循环进行多进程处理，看来我必须加上 {for i in range(len(pages)):} 这一行，然后把多进程的代码放在它下面。但是 def fourup 中的原始函数无法访问它正在处理的页面的索引号……

我可能会直接读取所有文件中的所有页面，把它们放进一个列表，然后再发送给线程处理。尽量保持简单，尤其是为了便于调试——多进程程序调试起来可不是闹着玩的 :)

import multiprocessing

def task(arg, results):
    """Worker function: append ``arg`` to ``results``.

    ``results`` may be a plain list or a ``Manager().list()`` proxy shared
    between processes.
    """
    results.append(arg)


def main():
    """Run ``task`` in ten separate processes and print what they collected."""
    # The context manager shuts the manager's server process down on exit.
    with multiprocessing.Manager() as manager:
        results = manager.list()
        procs = [
            multiprocessing.Process(target=task, args=(i, results))
            for i in range(10)
        ]
        for proc in procs:
            proc.start()
        for proc in procs:
            proc.join()

        print(results)


# Required for multiprocessing: under the spawn/forkserver start methods the
# child processes import this module, and must not re-run the script body.
if __name__ == "__main__":
    main()

import multiprocessing
import os
import timeit

import cv2
import numpy as np
from pdf2image import convert_from_path


def fourup_detect(page, index, results):
    """Append ``index`` to ``results`` when ``page`` looks like a 4-up page.

    The function is named ``fourup_detect`` because a Python identifier
    cannot start with a digit (``def 4up_detect`` is a SyntaxError).

    Args:
        page: a single page image from ``convert_from_path`` (not the whole
            list of pages; ``np.array`` of the list raises TypeError).
        index: position of ``page`` inside its PDF.
        results: shared list (a ``Manager().list()`` proxy when called from
            another process); receives ``index``.
    """
    gray = cv2.cvtColor(np.array(page), cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    edges = cv2.Canny(bw, 0, 255)
    lines = cv2.HoughLinesP(image=edges, rho=5, theta=np.pi / 90,
                            threshold=1000, lines=np.array([]),
                            minLineLength=1100, maxLineGap=3)
    if lines is None:
        # HoughLinesP yields None (not an empty array) when nothing is found.
        return
    x1 = lines[:, 0, 0]
    y1 = lines[:, 0, 1]
    # Keep only the long lines that start inside the two expected bands.
    linessub = lines[((x1 > 750) & (x1 < 950)) | ((y1 > 1000) & (y1 < 1200))]
    if len(linessub) > 1:
        results.append(index)


def main():
    """Detect the 4-up pages of every PDF next to this script, one process per page."""
    curdir = os.path.dirname(os.path.realpath(__file__))
    os.chdir(curdir)
    files = os.listdir(curdir)
    # The empty last component keeps the trailing separator on every platform,
    # replacing the manual Windows / non-Windows branch.
    dstdir = os.path.join(curdir, '1up', '')
    if not os.path.exists(dstdir):
        os.makedirs(dstdir)
    pdffiles = [f for f in files if f.endswith('.pdf')]

    with multiprocessing.Manager() as manager:
        for pdf in pdffiles:
            pages = convert_from_path(pdf)

            # Page indexes are only meaningful within one PDF.
            shared = manager.list()
            # NOTE: one process per page is unbounded; for large PDFs a
            # multiprocessing.Pool would cap the number of workers.
            procs = [
                multiprocessing.Process(target=fourup_detect,
                                        args=(page, i, shared))
                for i, page in enumerate(pages)
            ]
            for proc in procs:
                proc.start()
            for proc in procs:
                proc.join()

            # Slicing the proxy copies the contents into a plain local list.
            ind = sorted(shared[:])
            if not ind:
                # No 4-up page in this PDF; min()/max() would raise ValueError.
                continue

            startpg = ind[0]
            endpg = ind[-1]
            page = pages[startpg]
            image = np.array(page)
            height = image.shape[0] // 2
            width = image.shape[1] // 2
            # NOTE: the original snippet stops here; the code that consumes
            # startpg/endpg/height/width is not part of the excerpt.


# Required for multiprocessing: under the spawn/forkserver start methods the
# child processes import this module, and must not re-run the script body.
if __name__ == "__main__":
    main()