Python多处理和共享计数器_Python_Multiprocessing

Python 多进程（multiprocessing）与共享计数器

python

Python多处理和共享计数器,python,multiprocessing,Python,Multiprocessing,我的多处理模块有问题。我使用一个工作池及其映射方法来加载大量文件中的数据，并使用自定义函数对每个文件中的数据进行分析。每次处理完一个文件后，我都希望更新一个计数器，以便跟踪还有多少文件需要处理。以下是示例代码： def analyze_data( args ): # do something counter += 1 print counter if __name__ == '__main__': list_of_files = os.listdir(s

我在使用 multiprocessing 模块时遇到了问题。我使用一个工作进程池（Pool）及其 map 方法从大量文件中加载数据，并用自定义函数分析每个文件中的数据。每处理完一个文件，我都希望更新一个计数器，以便跟踪还剩多少文件需要处理。以下是示例代码：

def analyze_data( args ):
    # Question's original attempt, kept exactly as posted (Python 2 syntax).
    # do something
    # NOTE(review): `counter` is assigned here without a `global counter`
    # declaration inside this function, so Python treats it as a local name.
    counter += 1
    print counter


if __name__ == '__main__':

    # NOTE(review): `os`, `Pool` and `some_directory` are not defined in this
    # snippet; presumably they are imported/defined elsewhere by the asker.
    list_of_files = os.listdir(some_directory)

    # A `global` statement at module level has no effect; `counter` is simply
    # a module-level name in the parent process.
    global counter
    counter = 0

    # Apply analyze_data to every file name using a default-sized worker pool.
    p = Pool()
    p.map(analyze_data, list_of_files)

我找不到解决方案。

问题在于，counter 变量并不会在进程之间共享：每个进程都会创建自己的本地实例，并且只递增该实例。
关于在进程之间共享状态的可用技术，请参阅 multiprocessing 文档中“在进程之间共享状态”一节。就您的情况而言，您可能希望在各个工作进程之间共享一个 Value 实例。
下面是示例的可运行版本（使用了一些虚拟输入数据）。注意，它使用了全局变量，而我在实际项目中会尽量避免这样做：
from multiprocessing import Pool, Value
from time import sleep

counter = None

def init(args):
    """Pool initializer: keep the shared counter for later use.

    The object received as ``args`` is bound to the module-level name
    ``counter`` of the process in which this function runs.
    """
    global counter
    counter = args

def analyze_data(args):
    """Increment the shared counter, report it, and process the input.

    Args:
        args: The work item handed to this call.

    Returns:
        ``args * 10``.
    """
    # ``+=`` on the shared value is a read-modify-write, not an atomic
    # operation, so the value's lock is held while updating it.  The snapshot
    # is taken under the same lock so that the number printed is the one this
    # call produced, not whatever another worker wrote afterwards.
    with counter.get_lock():
        counter.value += 1
        current = counter.value
    # A single-argument print(...) is valid on both Python 2 and Python 3.
    print(current)
    return args * 10

if __name__ == '__main__':
    # Real-world input would be e.g.: inputs = os.listdir(some_directory)

    # Initialize a cross-process counter (typecode 'i', starting at 0) and
    # the dummy input list.
    counter = Value('i', 0)
    inputs = [1, 2, 3, 4]

    # Create the pool of workers, ensuring each one receives the counter
    # through the initializer as it starts.
    p = Pool(initializer=init, initargs=(counter,))
    try:
        result = p.map_async(analyze_data, inputs, chunksize=1)
        # get() blocks until every result is available, so no separate
        # wait() call is needed.  Single-argument print(...) works on both
        # Python 2 and Python 3.
        print(result.get())
    finally:
        # Stop accepting work and wait for the worker processes to exit so
        # they are not left behind.
        p.close()
        p.join()

问题在于，counter 变量并不会在进程之间共享：每个进程都会创建自己的本地实例，并且只递增该实例。
关于在进程之间共享状态的可用技术，请参阅 multiprocessing 文档中“在进程之间共享状态”一节。就您的情况而言，您可能希望在各个工作进程之间共享一个 Value 实例。
下面是示例的可运行版本（使用了一些虚拟输入数据）。注意，它使用了全局变量，而我在实际项目中会尽量避免这样做：
from multiprocessing import Pool, Value
from time import sleep

counter = None

def init(args):
    """Pool initializer: keep the shared counter for later use.

    The object received as ``args`` is bound to the module-level name
    ``counter`` of the process in which this function runs.
    """
    global counter
    counter = args

def analyze_data(args):
    """Increment the shared counter, report it, and process the input.

    Args:
        args: The work item handed to this call.

    Returns:
        ``args * 10``.
    """
    # ``+=`` on the shared value is a read-modify-write, not an atomic
    # operation, so the value's lock is held while updating it.  The snapshot
    # is taken under the same lock so that the number printed is the one this
    # call produced, not whatever another worker wrote afterwards.
    with counter.get_lock():
        counter.value += 1
        current = counter.value
    # A single-argument print(...) is valid on both Python 2 and Python 3.
    print(current)
    return args * 10

if __name__ == '__main__':
    # Real-world input would be e.g.: inputs = os.listdir(some_directory)

    # Initialize a cross-process counter (typecode 'i', starting at 0) and
    # the dummy input list.
    counter = Value('i', 0)
    inputs = [1, 2, 3, 4]

    # Create the pool of workers, ensuring each one receives the counter
    # through the initializer as it starts.
    p = Pool(initializer=init, initargs=(counter,))
    try:
        result = p.map_async(analyze_data, inputs, chunksize=1)
        # get() blocks until every result is available, so no separate
        # wait() call is needed.  Single-argument print(...) works on both
        # Python 2 and Python 3.
        print(result.get())
    finally:
        # Stop accepting work and wait for the worker processes to exit so
        # they are not left behind.
        p.close()
        p.join()

没有竞争条件错误的计数器类：
class Counter(object):
    """Integer counter stored in a ``multiprocessing.Value``.

    Increments are performed while holding the Value's own lock, so
    concurrent ``increment`` calls do not lose updates.
    """

    def __init__(self, initval=0):
        """Create the counter, starting at ``initval`` (default 0)."""
        # 'i' is the typecode passed to Value for the stored integer.
        self.val = multiprocessing.Value('i', initval)

    def increment(self, n=1):
        """Add ``n`` (default 1) to the counter under the Value's lock."""
        with self.val.get_lock():
            self.val.value += n

    @property
    def value(self):
        """The current count."""
        return self.val.value

没有竞争条件错误的计数器类：
class Counter(object):
    """Integer counter stored in a ``multiprocessing.Value``.

    Increments are performed while holding the Value's own lock, so
    concurrent ``increment`` calls do not lose updates.
    """

    def __init__(self, initval=0):
        """Create the counter, starting at ``initval`` (default 0)."""
        # 'i' is the typecode passed to Value for the stored integer.
        self.val = multiprocessing.Value('i', initval)

    def increment(self, n=1):
        """Add ``n`` (default 1) to the counter under the Value's lock."""
        with self.val.get_lock():
            self.val.value += n

    @property
    def value(self):
        """The current count."""
        return self.val.value

更快的 Counter 类：使用 RawValue 加一把独立的锁，避免两次获取 Value 内置的锁：
class Counter(object):
    """Integer counter stored in a ``multiprocessing.RawValue``.

    The RawValue is paired with one explicit ``multiprocessing.Lock`` that
    is held for the duration of each increment.
    """

    def __init__(self, initval=0):
        """Create the counter, starting at ``initval`` (default 0)."""
        self.val = multiprocessing.RawValue('i', initval)
        self.lock = multiprocessing.Lock()

    def increment(self, n=1):
        """Add ``n`` (default 1) to the counter while holding the lock."""
        with self.lock:
            self.val.value += n

    @property
    def value(self):
        """The current count."""
        return self.val.value


更快的 Counter 类：使用 RawValue 加一把独立的锁，避免两次获取 Value 内置的锁：
class Counter(object):
    """Integer counter stored in a ``multiprocessing.RawValue``.

    The RawValue is paired with one explicit ``multiprocessing.Lock`` that
    is held for the duration of each increment.
    """

    def __init__(self, initval=0):
        """Create the counter, starting at ``initval`` (default 0)."""
        self.val = multiprocessing.RawValue('i', initval)
        self.lock = multiprocessing.Lock()

    def increment(self, n=1):
        """Add ``n`` (default 1) to the counter while holding the lock."""
        with self.lock:
            self.val.value += n

    @property
    def value(self):
        """The current count."""
        return self.val.value


一个非常简单的例子，与jkp的答案不同：
from multiprocessing import Pool, Value
from time import sleep

counter = Value('i', 0)
def f(x):
    """Bump the shared counter, print it, sleep one second and return ``x``."""
    # Take the snapshot while still holding the lock so the printed number is
    # the value this call produced, not one written later by another worker.
    with counter.get_lock():
        counter.value += 1
        current = counter.value
    print("counter.value:", current)
    sleep(1)
    return x

if __name__ == '__main__':
    # Creating the pool must be guarded: under the "spawn" start method each
    # worker re-imports this module, and an unguarded Pool(...) at import
    # time would try to start new pools recursively.
    # NOTE(review): sharing `counter` through a module-level global relies on
    # workers inheriting it (fork start method); confirm for your platform.
    with Pool(4) as p:
        r = p.map(f, range(1000*1000))

一个非常简单的例子，与jkp的答案不同：
from multiprocessing import Pool, Value
from time import sleep

counter = Value('i', 0)
def f(x):
    """Bump the shared counter, print it, sleep one second and return ``x``."""
    # Take the snapshot while still holding the lock so the printed number is
    # the value this call produced, not one written later by another worker.
    with counter.get_lock():
        counter.value += 1
        current = counter.value
    print("counter.value:", current)
    sleep(1)
    return x

if __name__ == '__main__':
    # Creating the pool must be guarded: under the "spawn" start method each
    # worker re-imports this module, and an unguarded Pool(...) at import
    # time would try to start new pools recursively.
    # NOTE(review): sharing `counter` through a module-level global relies on
    # workers inheriting it (fork start method); confirm for your platform.
    with Pool(4) as p:
        r = p.map(f, range(1000*1000))

我正在 PyQt5 中实现一个进度条，所以我把线程和进程池结合起来使用：
import threading
import multiprocessing as mp
from queue import Queue

def multi(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

def pooler(q):
    """Run ``multi`` over ``range(100)`` in a pool, reporting progress on ``q``.

    Args:
        q: Queue that receives, after each result arrives, the number of
            results received so far (1 through 100).
    """
    with mp.Pool() as pool:
        # imap_unordered yields each result as soon as it is ready, so
        # progress can be reported per item rather than after the whole map.
        for count, i in enumerate(pool.imap_unordered(multi, range(100))):
            print(count, i)
            q.put(count + 1)

def main():
    """Run ``pooler`` in a background thread and print its progress."""
    q = Queue()
    # The pool runs in a helper thread so this thread stays free to consume
    # progress updates from the queue.
    t = threading.Thread(target=pooler, args=(q,))
    t.start()
    print('start')
    process = 0
    # pooler puts the running count of finished items; stop at 100.
    while process < 100:
        process = q.get()
        print('p', process)
    # Wait for the worker thread (and the pool it owns) to finish.
    t.join()
if __name__ == '__main__':
    main()

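import threading
import multiprocessing as mp
from queue import Queue

def multi(x):
    return x*x

def pooler(q):
    with mp.Pool() as pool:
        count = 0
        for i in pool.imap_unordered(multi, range(100)):
            print(count, i)
            count += 1
            q.put(count)

def main():
    q = Queue()
    t = threading.Thread(target=pooler, args=(q,))
    t.start()
    print('start')
    process = 0
    while process < 100:
        process = q.get()
        print('p', process)

if __name__ == '__main__':
    main()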

这是我放进 QThread worker 中的代码，它能以可接受的延迟工作。
我正在 PyQt5 中实现一个进度条，所以我把线程和进程池结合起来使用：
import threading
import multiprocessing as mp
from queue import Queue

def multi(x):
    """Return ``x`` multiplied by itself."""
    squared = x * x
    return squared

def pooler(q):
    """Run ``multi`` over ``range(100)`` in a pool, reporting progress on ``q``.

    Args:
        q: Queue that receives, after each result arrives, the number of
            results received so far (1 through 100).
    """
    with mp.Pool() as pool:
        # imap_unordered yields each result as soon as it is ready, so
        # progress can be reported per item rather than after the whole map.
        for count, i in enumerate(pool.imap_unordered(multi, range(100))):
            print(count, i)
            q.put(count + 1)

def main():
    """Run ``pooler`` in a background thread and print its progress."""
    q = Queue()
    # The pool runs in a helper thread so this thread stays free to consume
    # progress updates from the queue.
    t = threading.Thread(target=pooler, args=(q,))
    t.start()
    print('start')
    process = 0
    # pooler puts the running count of finished items; stop at 100.
    while process < 100:
        process = q.get()
        print('p', process)
    # Wait for the worker thread (and the pool it owns) to finish.
    t.join()
if __name__ == '__main__':
    main()

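import threading
import multiprocessing as mp
from queue import Queue

def multi(x):
    return x*x

def pooler(q):
    with mp.Pool() as pool:
        count = 0
        for i in pool.imap_unordered(multi, range(100)):
            print(count, i)
            count += 1
            q.put(count)

def main():
    q = Queue()
    t = threading.Thread(target=pooler, args=(q,))
    t.start()
    print('start')
    process = 0
    while process < 100:
        process = q.get()
        print('p', process)

if __name__ == '__main__':
    main()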

这是我放进 QThread worker 中的代码，它能以可接受的延迟工作。
@jkp，如果不使用全局变量，您会怎么做？——我正在尝试使用一个类，但它并不像看起来那么容易。
不幸的是，这个例子似乎有缺陷，因为 counter.value += 1 在进程之间不是原子操作，所以如果在多个进程中运行足够长的时间，该值就会出错。
正如 Eli 所说，counter.value += 1 语句必须用锁保护。
请注意，应该写成 with counter.get_lock():，而不是 with counter.value.get_lock():。

@jkp，正如 @Jinghao-shi 所说，counter.value.get_lock() 会引发：

AttributeError: 'int' object has no attribute 'get_lock'

@jkp，如果不使用全局变量，您会怎么做？——我正在尝试使用一个类，但它并不像看起来那么容易。

不幸的是，这个例子似乎有缺陷，因为 counter.value += 1 在进程之间不是原子操作，所以如果在多个进程中运行足够长的时间，该值就会出错。

正如 Eli 所说，counter.value += 1 语句必须用锁保护。

请注意，应该写成 with counter.get_lock():，而不是 with counter.value.get_lock():。

@jkp，正如 @Jinghao-shi 所说，counter.value.get_lock() 会引发 AttributeError: 'int' object has no attribute 'get_lock'。

关于使用 joblib 的 Parallel 的类似代码（此答案中的代码不适用于 joblib），请参阅相关链接。

我还在 increment 函数中添加了 return self，以支持链式调用。