Python 如何在循环数据帧行期间使用多处理收集结果

Python 如何在循环数据帧行期间使用多处理收集结果,python,pandas,python-multiprocessing,Python,Pandas,Python Multiprocessing,单线程代码运行良好,保存了3个csvfiles并包含内容 def func(df:DataFrame, file_name:str): pbar = tqdm(len(df)) res = list() for row in df.itertuples(): for path in random.sample(path_list, 8): # DO some opration, the following code is simpl

单线程代码运行良好,保存了3个csvfiles并包含内容

def func(df:DataFrame, file_name:str):
    pbar = tqdm(len(df))
    res = list()
    for row in df.itertuples():
        for path in random.sample(path_list, 8):
            # DO some opration, the following code is simplify edition
            A = random.sample(os.listdir(row[3]), 2)
            C = "abc"
            cur_files = os.listdir(path)
            random.shuffle(cur_files)
            for each in cur_files:
                B.append(each)
                if len(B) == 2: break
            for i in range(2):
                res.append((row[2], A[i], B[i], C))

        pbar.update(1)
    
    data = pd.DataFrame(res, columns["name", "A", "B", "C"])
    data = data.sample(frac=1).reset_index(drop=True)
    data.to_csv(file_name)

if __name__ == "__main__":
    path_list = os.listdir(....)
    train_data = pd.DataFrame(...)
    val_data = pd.DataFrame(...)
    test_data = pd.DataFrame(...)
    for df, tp in [(train_data, "train.csv"), (val_data, "val.csv"), (test_data, "test.csv")]:
        func(df, tp)
我在循环三个数据集期间使用了多处理,这段代码运行良好,保存了三个csvfiles并包含内容

def func(df:DataFrame, file_name:str):
    pbar = tqdm(len(df))
    res = list()
    for row in df.itertuples():
        for path in random.sample(path_list, 8):
            # DO some opration, the following code is simplify edition
            A = random.sample(os.listdir(row[3]), 2)
            C = "abc"
            cur_files = os.listdir(path)
            random.shuffle(cur_files)
            for each in cur_files:
                B.append(each)
                if len(B) == 2: break
            for i in range(2):
                res.append((row[2], A[i], B[i], C))

        pbar.update(1)
    
    data = pd.DataFrame(res, columns["name", "A", "B", "C"])
    data = data.sample(frac=1).reset_index(drop=True)
    data.to_csv(file_name)

if __name__ == "__main__":
    path_list = os.listdir(....)
    train_data = pd.DataFrame(...)
    val_data = pd.DataFrame(...)
    test_data = pd.DataFrame(...)
    for df, tp in [(train_data, "train.csv"), (val_data, "val.csv"), (test_data, "test.csv")]:
        func(df, tp)
如果名称=“\uuuuu main\uuuuuuuu”:
path_list=os.listdir(..)
列车数据=pd.数据帧(…)
val_data=pd.DataFrame(…)
测试数据=局部数据帧(…)
cpuCount=multiprocessing.cpu\u count()
将multiprocessing.Pool(cpuCount)作为池:
对于df,tp在[(train_数据,“train.csv”),(val_数据,“val.csv”),(test_数据,“test.csv”)]中:
pool.apply_async(func,(df,tp))
pool.close()
pool.join()
打印(“子流程全部完成”)
但是速度很慢,因为train_数据有很多行,所以我想在func方法中多处理行, 以下代码不起作用,没有错误并立即完成,保存了3个CSV文件,但CSV文件内容只有标题

def func(row, pbar, res):
    for path in random.sample(path_list, 8):
        # DO some opration, the following code is simplify edition
        A = random.sample(os.listdir(row[3]), 2)
        C = "abc"
        cur_files = os.listdir(path)
        random.shuffle(cur_files)
        for each in cur_files:
            B.append(each)
            if len(B) == 2: break
        for i in range(2):
            res.append((row[2], A[i], B[i], C))

    pbar.update(1)

if __name__ == "__main__":
    path_list = os.listdir(....)
    train_data = pd.DataFrame(...)
    val_data = pd.DataFrame(...)
    test_data = pd.DataFrame(...)
    cpuCount = multiprocessing.cpu_count()
    with multiprocessing.Pool(cpuCount) as pool:
        for df, tp in [(train_data, "train.csv"), (val_data, "val.csv"), (test_data, "test.csv")]:
            pbar = tqdm(len(df))
            res = list()
            for row in df.itertuples():
                pool.apply_async(func, (row, pbar, res))
            data = pd.DataFrame(res, columns["name", "A", "B", "C"])
            data = data.sample(frac=1).reset_index(drop=True)
            data.to_csv(file_name)]
        pool.close()
        pool.join()
   print("Sub-process all done.")
        
如何优化最后的代码或更好的方法