
Python multithreading Manager.list(): how to access the data correctly


I'm building a class that starts in its own process and pushes data to a database in batches of a given size. The class uses a Manager.list to receive the data. I assumed this is a common pattern, with database access happening in a separate process, but I couldn't find a suitable library, so I figured I'd roll my own.

Internally I use a threading.Timer to wake up my database worker and check the shared queue. However, when it wakes up there is nothing in the queue, even though items were put there. Am I using Manager.list incorrectly?

Source code:

import random
from threading import Timer
import threading

from sqlalchemy import *
from multiprocessing import Process, Manager
from util.config import get_connection


def __convert_to_key(connection, table):
    return "{}.{}".format(connection.name, table.name)


class ConnectionWorker():
    __batch_size = 1000
    __batch_insert_queue = None
    __manager = Manager()
    __wait_interval = 5.0
    __finish = False
    __connection = None
    __table = None
    __timer = None
    finished = False

    def __init__(self, connection, table):
        self.__lock = threading.RLock()
        self.__connection = connection
        self.__table = table
        p = Process(target=self.__insert_data)
        p.start()

    def get_batch_insert_queue(self):
        self.__lock.acquire()
        try:
            if self.__batch_insert_queue is None:
                self.__batch_insert_queue = self.__manager.list()
            return self.__batch_insert_queue
        finally:
            self.__lock.release()

    def __insert_data(self):
        print("__insert_data, the queue is {}".format(len(self.get_batch_insert_queue())))
        q = self.get_batch_insert_queue()

        #push everything now if we have been told to finish
        if self.__finish:
            print("__finish flag has been set")
            self.__connection.execute(self.__table.insert().values(q))
            self.finished = True
            return

        #if there is nothing to do then just sleep
        if len(q) == 0:
            print("The queue is empty, sleeping")
            self.__timer = Timer(self.__wait_interval, self.__insert_data)
            self.__timer.start()
            self.__timer.join()

        values_to_insert = []
        while len(q) > 0 and len(values_to_insert) < self.__batch_size:
            values_to_insert.append(q.pop)
        print("Inserting {} values".format(len(values_to_insert)))
        self.__connection.execute(self.__table.insert().values(values_to_insert))

        #don't sleep if the queue has more work to do
        if len(q) >= self.__batch_size:
            print("Not sleeping, there is more work to be done, {} items".format(len(q)))
            self.__insert_data()
        else:
            print("Sleeping")
            self.__timer = Timer(self.__wait_interval, self.__insert_data).start()
            self.__timer.start()
            self.__timer.join()

    def finish(self):
        print("Setting finish to true")
        self.__finish = True

#test data
if __name__ == "__main__":
    #create the db and get metadata
    conn = get_connection()
    query = "DROP TABLE IF EXISTS tmp_test"
    try:
        conn.execute(query)
    except:
        pass
    query = """CREATE TABLE tmp_test (
    value bigint DEFAULT NULL
    ) ENGINE=InnoDB;"""
    conn.execute(query)

    metadata = MetaData()
    metadata.reflect(bind=conn)
    tbl = metadata.tables["tmp_test"]

    c = ConnectionWorker(conn, tbl)
    q = c.get_batch_insert_queue()
    for item in random.sample(xrange(1, 1000000000), 100000):
        q.append(item)
    print("The queue is {}".format(len(q)))
    print("The batch queue is {}".format(len(c.get_batch_insert_queue())))
    import time
    time.sleep(10)
    c.finish()

    while not c.finished:
        time.sleep(1)
The output:

__insert_data, the queue is 0
The queue is empty, sleeping
The queue is 100000
The batch queue is 100000
__insert_data, the queue is 0
The queue is empty, sleeping
__insert_data, the queue is 0
The queue is empty, sleeping
Setting finish to true
__insert_data, the queue is 0
The queue is empty, sleeping

The first queue being empty makes sense, since the object is just initializing, but the next two look like they should contain items. It's also unclear to me why, when finish is set to True, the worker skips right past the self.__finish check; I would expect it to print "__finish flag has been set".

Comments are welcome, as are pointers to a library that may already handle all of this by default.

Data is never implicitly shared across processes. Two consequences:

the Manager.list created in your main program has nothing to do with the Manager.list created in your worker process; and

the self.__finish attribute in your main program has nothing to do with the self.__finish attribute in your worker process (see the sketch below).
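
Here is a minimal standalone demonstration of that second point (my sketch, not code from the answer): an attribute flipped in the parent after the child process has started is never seen by the child.

import time
from multiprocessing import Process

class Flag(object):
    finish = False

    def watch(self):
        # the child operates on its own copy of the Flag object
        for _ in range(3):
            print("child sees finish = {}".format(self.finish))
            time.sleep(0.5)

if __name__ == "__main__":
    f = Flag()
    p = Process(target=f.watch)
    p.start()
    time.sleep(0.1)
    f.finish = True  # changes the parent's copy only
    print("parent set finish = {}".format(f.finish))
    p.join()  # the child prints False every time

The same thing happens to the list in the question's code: the parent and the child each call get_batch_insert_queue on their own copy of the object, so each one creates and reads its own, unrelated Manager.list.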

You really should back off and try simpler code until these things make more sense to you. The usual way to do this is shown below; I've thrown away all the class and method cruft so it's easy to see what's essential here. Note that no extra threads or sleeps are needed:


# shared data must be passed, even mp data structures
def worker(q):
    values_to_insert = []
    while True:
        item = q.get() # no need to sleep - blocks until data is ready
        if item is None:
            break
        values_to_insert.append(item)
        if len(values_to_insert) >= 39: # whatever - your `__batch_size`
            print "processing", values_to_insert
            values_to_insert = []
    # deal with any leftovers
    if values_to_insert:
        print "processing", values_to_insert

if __name__ == "__main__":
    import multiprocessing as mp
    import random

    q = mp.Queue(100)  # bounded queue
    proc = mp.Process(target=worker, args=(q,))
    proc.start()
    for item in random.sample(xrange(1, 1000000000), 100000):
        # will block if q has more than 100 items; blocking
        # waits for worker to catch up
        q.put(item)
    q.put(None)  # tell worker we're done
    proc.join()

Comments:

Doesn't c = ConnectionWorker(conn, tbl) start a new process, and doesn't q = c.get_batch_insert_queue() get a proxy for the list for the main program's thread to use?

No offense intended, but your code is too elaborate for me to spend the time untangling it in detail. Instead I gave you far simpler code that actually works ;-) You don't have to accept it. If you want to untangle it on your own, great! But please take the advice to heart and cut your code back to the bare minimum. There is a lot of scary stuff in there, like creating a Manager as a side effect of compiling the class definition in the main program, and your code won't even run on Windows as is.
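
If you do want to keep the original Manager.list design, here is a minimal sketch (mine, not code from the thread) of how it could be wired up correctly: create the shared structures once in the parent and pass them explicitly to the Process, with a multiprocessing.Event standing in for the finish flag.

import time
from multiprocessing import Process, Manager, Event

def worker(shared_list, finish):
    # drain the shared list in batches until told to finish
    while not finish.is_set() or len(shared_list) > 0:
        batch = []
        while len(shared_list) > 0 and len(batch) < 1000:  # 1000 = the question's __batch_size
            batch.append(shared_list.pop(0))
        if batch:
            print("inserting {} values".format(len(batch)))  # the db insert would go here
        else:
            time.sleep(0.1)  # nothing to do yet

if __name__ == "__main__":
    manager = Manager()
    shared_list = manager.list()  # the proxy can be passed to a child process
    finish = Event()
    p = Process(target=worker, args=(shared_list, finish))
    p.start()
    for i in range(10000):
        shared_list.append(i)
    finish.set()  # the child sees this, unlike a plain attribute
    p.join()

The bounded mp.Queue above is still the better tool, since it blocks instead of polling; this sketch only shows how the proxy-and-flag approach from the question could be arranged so that both processes actually share the same objects.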