不返回函数的Python I/O多处理_Python_Performance_Io_Python Multiprocessing

不返回函数的Python I/O多处理

python performance io

不返回函数的Python I/O多处理,python,performance,io,python-multiprocessing,Python,Performance,Io,Python Multiprocessing,我有一个可用的python脚本，它以简化的方式工作如下： open("A", 'r') open("B", 'r') open("C", 'w') for lineA in A: part1, part2, part3 = lineA.split(' ') for lineB in B: if part2 in lineB: C.write(lineB) 我想在文件B中签入文件a行的某个部分是否存在。如果是这样，将文件B中的整行写入新文件

我有一个可用的python脚本，它以简化的方式工作如下：

# Copy to C every line of B that contains the second space-separated field
# of a line in A. Each line of A must hold exactly three such fields,
# otherwise the unpacking below raises ValueError.
with open("A", 'r') as A, open("B", 'r') as B, open("C", 'w') as C:
    for lineA in A:
        part1, part2, part3 = lineA.split(' ')
        # A file object is exhausted after one full pass; rewind B so that
        # every line of A is compared against all of B, not only the first.
        B.seek(0)
        for lineB in B:
            if part2 in lineB:
                C.write(lineB)

我想检查文件A中每一行的某个部分是否出现在文件B中。如果出现，就将文件B中对应的整行写入新文件C。

按我目前的写法，这个过程比较耗时（1. 我仍然认为自己是 Python 新手；2. 主 for 循环中至少有 4 个 if 语句在运行），而现在我开始使用比以前大 200 倍的输入文件，因此每个输入文件大约需要 5 个小时。

我曾尝试使用多处理，但似乎无法使其发挥作用。最初，我在 main() 函数中尝试了一段简单的代码，但没有任何明显的改进，而且肯定没有使用多个 CPU：

# Run multi_thread(arg1, arg2, arg3) in one child process.
p = Process(target=multi_thread, args=(arg1,arg2,arg3))
p.start()
# join() blocks until the child exits. With a single child that is joined
# right away, the work still runs in only one process at a time.
p.join()

然后我尝试了使用 jobs 列表的方法：

# Start four worker processes and wait for all of them to finish.
# NOTE: every process runs the same target with the same (empty) arguments,
# so the work is repeated four times rather than divided between workers.
jobs = []
for i in range(4):
    # target must be the callable itself; a string such as 'myfunc' fails
    # with TypeError when the child process tries to call it.
    p = Process(target=myfunc)
    jobs.append(p)
    p.start()

# Join only after every process has been started. Joining inside the loop
# above waits for each child before launching the next one.
for p in jobs:
    p.join()

我在论坛中找到了一个池示例，我在其主函数中添加了一个返回语句：

def multiproc(arg1,arg2,arg3):
    # Placeholder: the real processing is elided in this snippet.
    (...)
    # NOTE(review): lineB is not defined anywhere in this snippet; presumably
    # it is produced by the elided code above - confirm.
    return lineB   # example of a return statement

def main():
    """Feed the lines of 'file.txt' to multi_thread through a 4-process pool."""
    # The input must be opened for reading. Mode 'w' truncates the file and
    # makes iteration fail with "io.UnsupportedOperation: not readable".
    # The pool is used as a context manager so its workers are released.
    with open('file.txt', 'r') as map_file, Pool(4) as pool:
        # The third argument is the chunksize: lines are handed to the
        # workers in batches of 4.
        # NOTE(review): pool.map calls the worker with a single argument
        # (one line of the file); a worker that requires three positional
        # arguments will raise TypeError - TODO confirm the intended call.
        results = pool.map(multi_thread, map_file, 4)

if __name__ == "__main__":
    main()

jobs 方法确实创建了输出文件，但随后整个过程又从头重新开始了 3 次。最后一种方法（Pool）则给出了以下错误：

io.UnsupportedOperation: not readable

我还猜测是我的 return 语句中断了循环……对于如何为这段代码启用多处理，或者如何让它更整洁，大家有什么建议吗？

谢谢

编辑：根据要求，以下是完整的混乱代码：

#!/usr/bin/python3
__author__ = 'daniel'

import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time
start_time = time.time()

def multi_thread(filePath, datasetFolder, mapFileDataset):
    """Write the first map line that matches a pileup position and return it.

    For every entry of the current working directory whose name contains
    ``.ped``, the sample id is the part of the name before ``.ped``. Every
    directory entry whose name contains ``<sample_id>.pileupgatk`` and the
    dataset name (the text after the last ``/`` in *datasetFolder*) is read
    as a pileup file: five single-space-separated fields per line, of which
    the first is the chromosome and the second the coordinate. The last
    line of the pileup file is never processed.

    For each processed pileup line the map file
    ``datasetFolder/mapFileDataset`` is scanned for a line that contains the
    coordinate followed by a space, contains the coordinate preceded by a
    space, and whose first two characters (right-stripped of spaces) equal
    the chromosome. The first such line is written to ``<sample_id>.map``
    and returned.

    NOTE(review): returning on the first match ends all loops, so at most
    one line is ever written. This is kept because callers receive the
    return value; confirm whether collecting every match was intended.

    Args:
        filePath: Not used by the function; kept for call compatibility.
        datasetFolder: Folder holding the map file; must contain a ``/``.
        mapFileDataset: File name of the map file inside *datasetFolder*.

    Returns:
        The matching map line without trailing whitespace, or ``None`` when
        nothing matched.

    Raises:
        IndexError: If *datasetFolder* contains no ``/``.
        ValueError: If a processed pileup line does not have exactly five
            space-separated fields.
    """
    # The original opened this file for writing and never used it. The
    # create/truncate side effect is kept, but the handle is closed at once.
    with open('outdude.txt', 'w'):
        pass

    cwd = os.getcwd()
    dataset = datasetFolder.rsplit("/", 1)[1]
    map_path = datasetFolder + '/' + mapFileDataset

    ## Create file
    for ped_name in os.listdir(cwd):
        if ".ped" not in ped_name:
            continue
        sample_id = ped_name.partition('.ped')[0]
        for pileup_name in os.listdir(cwd):
            if sample_id + '.pileupgatk' not in pileup_name or dataset not in pileup_name:
                continue
            with open(pileup_name, 'r') as pileup4map:
                # Number of lines to process: every line except the last.
                snpcounter = sum(1 for _ in pileup4map) - 1
                pileup4map.seek(0)
                with open(sample_id + '.map', 'w') as mapout:
                    for line_no, line in enumerate(pileup4map, start=1):
                        if line_no > snpcounter:
                            break
                        chro, coord, _refb, _rbase, _qual = line.rstrip().split(' ')
                        # str.strip removes any of the characters 'c', 'h',
                        # 'r' from both ends, e.g. "chr22" -> "22".
                        chrom = chro.strip("chr")
                        # Compile once per pileup line rather than once per
                        # map line; escape so the coordinate is always
                        # matched literally.
                        literal = re.escape(coord)
                        followed_by_space = re.compile(r'(?=%s )' % literal, re.I)
                        preceded_by_space = re.compile(r'(?<= %s)' % literal, re.I)
                        with open(map_path, 'r') as mapFileData:
                            for ligna in mapFileData:
                                # Cheap substring test before the regexes.
                                if coord not in ligna:
                                    continue
                                if followed_by_space.search(ligna) is None:
                                    continue
                                if preceded_by_space.search(ligna) is None:
                                    continue
                                if chrom != ligna[:2].rstrip(' '):
                                    continue
                                lignaOut = ligna.rstrip()
                                mapout.write(lignaOut + '\n')
                                ## For POOL
                                # The enclosing "with" blocks close every
                                # open file on this early return.
                                return lignaOut


def main():
    """Map multi_thread over the lines of 'file.txt' and print the results.

    Uses a pool of four worker processes, prints the list returned by
    pool.map, then prints the seconds elapsed since the module-level
    start_time.
    """
    # The input must be opened for reading. Mode 'w' truncates the file and
    # makes iteration fail with "io.UnsupportedOperation: not readable".
    # Opening the file first also avoids starting workers when it is missing.
    with open('file.txt', 'r') as map_file, Pool(4) as pool:
        # The third argument is the chunksize: lines are handed to the
        # workers in batches of 4.
        # NOTE(review): pool.map calls multi_thread with a single argument
        # (one line of the file), while multi_thread is declared with three
        # required parameters - this raises TypeError until the call and
        # the signature are reconciled.
        results = pool.map(multi_thread, map_file, 4)
        print(results)
    print("--- %s seconds ---" % (time.time() - start_time))


if __name__ == "__main__":
    main()

#!/usr/bin/python3
__author__ = 'daniel'

import os
import re
from multiprocessing import Process
from multiprocessing import Pool
import time
start_time = time.time()

def multi_thread(filePath, datasetFolder, mapFileDataset):
    fout = open('outdude.txt', 'w')
    cwd = os.getcwd()
    cwdgen, sep, id = filePath.rpartition('/')
    dataset = datasetFolder.rsplit("/",1)
    dataset = dataset[1]
    ## Create file
    for i in os.listdir(cwd):
        if ".ped" in i:
            sample_id, sep, rest = i.partition('.ped')
            for i in os.listdir(cwd):
                if sample_id+'.pileupgatk' in i and dataset in i:
                    pileup4map = open(i,'r')
                    snpcounter = sum(1 for _ in pileup4map)-1
                    pileup4map.seek(0)
                    mapout = open(sample_id+'.map', 'w')
                    counter = 1
                    for line in pileup4map:
if counter（后续代码在抓取时被截断）

如果你发布的是实际代码而不是伪代码，对大家会更有帮助。从你发布的代码来看，除了解析文件A的第一行之外，其余部分似乎永远不会起作用。

按目前的表述，你的问题太模糊，无法在不做大量假设的情况下给出答案。此外，你提出的问题本身并不适合多处理/多线程，这更让人觉得它过于模糊。

我已经编辑并添加了原始代码。我看了一下，它相当混乱，这也是我最初没有贴出来的原因，但也许你们看一眼就能立刻理解。是的，我想对它做多处理，因为我有 8 个可用的核心，而这个任务只用了 1 个，需要 5 个小时才能完成。

你说得对，这确实很混乱。首先，我会把至少一部分嵌套循环拆分成函数，并自下而上地开始测试。也许可以用更简单的方式单独测试多线程部分，先把它弄明白，而不必顾及应用程序的复杂性。此外，要考虑到你的问题（速度）可能并不受限于 CPU，而可能受限于 I/O。

嘿，伙计们，谢谢你们的评论。是的，我同意这可能与 CPU 无关，但硬盘负载是正常的。例如，应该有一种方法可以把文件分成 4 个部分，每个部分/进程使用一个 CPU，最后再把所有结果合并起来。你们觉得怎么样？罗伯特，我会试试的，谢谢。
if makemap:
    ## Dictionary method - 13X faster
    # For each sample (a "*.ped" entry in cwd) and each matching pileup
    # file: index the pileup positions and the map-file positions by
    # (chromosome, coordinate), then copy the map lines whose position also
    # occurs in the pileup into "<sample_id>.map".
    map_path = datasetFolder + '/' + mapFileDataset
    for ped_name in os.listdir(cwd):
        if ".ped" not in ped_name:
            continue
        sample_id = ped_name.partition('.ped')[0]
        for pileup_name in os.listdir(cwd):
            if sample_id + '.pileupgatk' not in pileup_name or dataset not in pileup_name:
                continue
            print("\n\t> Creating MAP file from sample: "+sample_id)

            # (chromosome, coordinate) -> 1-based pileup line number.
            piledic = {}
            with open(pileup_name, 'r') as pileup4map:
                # Every line except the last one is indexed.
                snpcounter = sum(1 for _ in pileup4map) - 1
                pileup4map.seek(0)
                for counter, line in enumerate(pileup4map, start=1):
                    if counter > snpcounter:
                        break
                    #chr21 43805965 G G G
                    chro, coord, _refb, _rbase, _qual = line.rstrip().split(' ')
                    piledic[chro.strip("chr"), coord] = counter

            with open(map_path, 'r') as mapFileData:
                # (chromosome, position) -> 1-based map line number. When a
                # position occurs more than once, the last occurrence wins.
                mapDic = {}
                for line_no, ligna in enumerate(mapFileData, start=1):
                    #22 Affx-19821577     0.737773     50950707 A G
                    chroMap, _ident, _prob, posMap, _bas1, _bas2 = ligna.split()
                    mapDic[chroMap, posMap] = line_no

                # A set gives O(1) membership tests in the copy loop below;
                # the original list made that loop quadratic.
                matched_line_numbers = {
                    mapDic[position] for position in piledic if position in mapDic
                }

                # Second pass over the map file: copy the matched lines in
                # their original order.
                mapFileData.seek(0)
                with open(sample_id+".map", 'w') as mapWrite:
                    for line_no, lignagain in enumerate(mapFileData, start=1):
                        if line_no in matched_line_numbers:
                            mapWrite.write(lignagain)