使用python包装器并行化python脚本_Python_Parallel Processing_Bioinformatics_Joblib_Fastq

使用python包装器并行化python脚本

python parallel-processing

使用python包装器并行化python脚本,python,parallel-processing,bioinformatics,joblib,fastq,Python,Parallel Processing,Bioinformatics,Joblib,Fastq,我有一个python脚本heavy_lifting.py，我使用从bash包装器脚本wrapper.sh调用的GNU Parallel对其进行了并行化。我使用它来处理fastq格式的文件，请参见下面的example.fastq。虽然这是可行的，但要求使用两个解释器和一组依赖项是不雅观的。我想使用python重写bash包装器脚本，同时实现相同的并行化 example.fastq这是一个需要处理的输入文件示例。此输入文件通常非常长（~500000000）行 wrapper.sh #!/bin/ba

我有一个python脚本

heavy_lifting.py

，我使用从bash包装器脚本

wrapper.sh

调用的GNU Parallel对其进行了并行化。我使用它来处理fastq格式的文件，请参见下面的

example.fastq

。虽然这种做法可行，但同时依赖两种解释器和两套依赖项并不优雅。我希望用 python 重写这个 bash 包装器脚本，并保持相同的并行化效果。

example.fastq

这是一个需要处理的输入文件示例。此类输入文件通常非常长（约 500000000 行）。

wrapper.sh

#!/bin/bash
# Split a FASTQ file into one piece per core, run heavy_lifting.py on every
# piece with GNU Parallel, then delete the intermediate pieces.

NUMCORES="4"
FASTQ_F="./fastq_F.fastq"

# Split the input fastq for parallel processing. "l/N" asks split for N
# pieces without breaking any line, so one piece is created per core.
# The variable is quoted so a path containing spaces is passed as one word.
split --number="l/$NUMCORES" "$FASTQ_F" split_fastq_F_

# Feed the split fastq files to GNU Parallel to invoke parallel executions of
# `heavy_lifting.py`. The shell glob after ":::" supplies the file names
# directly, which avoids parsing the output of `ls`.
# NOTE(review): every job is given the same -o output.fastq, so the order of
# the pieces in the result depends on job scheduling - confirm this is
# acceptable.
parallel "python heavy_lifting.py -i {} -o output.fastq" ::: split_fastq_F_*

# Remove the intermediate split fastq files (only the ones created above).
rm -- split_fastq_F_*

要执行这些脚本，我使用命令
bash wrapper.sh
。您可以看到创建了一个结果文件
output.fastq
，其中包含一个修改过的fastq文件。

下面是我使用python包装器
wrapper.py
调用并行处理的尝试

wrapper.py

#!/usr/bin/env python
"""Prefix every line of a FASTQ file with "modified" and append it to an output file.

Usage: python heavy_lifting.py -i INPUT.fastq -o OUTPUT.fastq
"""
import argparse


def heavy_lifting_fun(inputFastq, outputFastq):
    """Append each line of *inputFastq*, prefixed with "modified", to *outputFastq*.

    Args:
        inputFastq: Path of the file to read.
        outputFastq: Path of the file to append to (opened in "a" mode, so
            existing content is kept).
    """
    # The input is opened first, so a missing input does not create the output.
    with open(inputFastq, "r") as infile, open(outputFastq, "a") as outfile:
        for line in infile:
            outfile.write("modified" + line)


def main(argv=None):
    """Parse the command line and process the requested files.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read sys.argv.
    """
    # Read in arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
    parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
    args = parser.parse_args(argv)
    heavy_lifting_fun(args.inputFastq, args.outputFastq)


# Only run when executed as a script, so that `import heavy_lifting` has no
# side effects and does not try to parse the importer's command line.
if __name__ == '__main__':
    main()

#!/usr/bin/env python
"""Split a FASTQ file into chunks and run heavy_lifting on the chunks in parallel."""
import multiprocessing

numcores = 4
fastq_F = "fastq_F.fastq"


def count_lines(path):
    """Return the number of lines in *path*, reading it one line at a time."""
    with open(path, "r") as infile:
        return sum(1 for _ in infile)


def split_fastq(path, split_size, prefix="./split_fastq_F_"):
    """Write consecutive runs of *split_size* lines of *path* to numbered files.

    Args:
        path: File to split.
        split_size: Maximum number of lines per chunk; must be at least 1.
        prefix: Chunk files are named ``prefix + "0"``, ``prefix + "1"``, ...

    Returns:
        The list of chunk file names, in input order. Empty for an empty input.

    Raises:
        ValueError: If *split_size* is smaller than 1.
    """
    if split_size < 1:
        raise ValueError("split_size must be at least 1")
    split_fastq_list = []
    outfile = None
    try:
        with open(path, "r") as infile:
            for counter, line in enumerate(infile):
                if counter % split_size == 0:
                    # Chunk boundary: finish the previous chunk, start the next.
                    if outfile is not None:
                        outfile.close()
                    filename = prefix + str(len(split_fastq_list))
                    split_fastq_list.append(filename)
                    # "w" so that a rerun does not append to stale chunks.
                    outfile = open(filename, "w")
                # Every line is written, including the first line of a chunk,
                # and always terminated with a newline.
                outfile.write(line.strip() + "\n")
    finally:
        if outfile is not None:
            outfile.close()
    return split_fastq_list


def main():
    """Split ``fastq_F`` into at most ``numcores`` chunks and process them in parallel."""
    # Imported here so the splitting helpers can be used without joblib or
    # heavy_lifting being importable.
    import heavy_lifting
    from joblib import Parallel, delayed

    # Get input fastq file dimensions
    length_fastq = count_lines(fastq_F)
    print(length_fastq)
    # Ceiling division keeps split_size an integer and yields at most
    # numcores chunks.
    split_size = max(1, -(-length_fastq // numcores))
    print(split_size)

    split_fastq_list = split_fastq(fastq_F, split_size)
    # delayed() needs the function, not the module that contains it.
    Parallel(n_jobs=numcores)(
        delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq")
        for i in split_fastq_list
    )


if __name__ == '__main__':
    main()

#!/usr/bin/env python
"""Prefix every line of a FASTQ file with "modified" and append it to an output file.

Importable (``heavy_lifting.heavy_lifting_fun``) and runnable as a script:
python heavy_lifting.py -i INPUT.fastq -o OUTPUT.fastq
"""
import argparse


def heavy_lifting_fun(inputFastq, outputFastq):
    """Append each stripped line of *inputFastq*, prefixed with "modified", to *outputFastq*.

    Surrounding whitespace is stripped from every line and a single newline is
    written after it, so the output always ends with a newline.

    Args:
        inputFastq: Path of the file to read.
        outputFastq: Path of the file to append to (opened in "a" mode).
    """
    # Iterate through input file and append to output file. The output is
    # opened first, as before; both files are closed even if a write fails.
    with open(outputFastq, "a") as outfile, open(inputFastq, "r") as infile:
        for line in infile:
            outfile.write("modified" + line.strip() + "\n")


if __name__ == '__main__':
    # Read in arguments. Parsing happens only when run as a script, so
    # importing this module never touches the importer's command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
    parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
    args = parser.parse_args()
    heavy_lifting_fun(args.inputFastq, args.outputFastq)

#!/usr/bin/env python
"""Split a FASTQ file into chunks and run heavy_lifting on the chunks in parallel."""
import multiprocessing

numcores = 4
fastq_F = "fastq_F.fastq"


def count_lines(path):
    """Return the number of lines in *path*, reading it one line at a time."""
    with open(path, "r") as infile:
        return sum(1 for _ in infile)


def compute_split_size(n_lines, n_chunks, record_lines=4):
    """Return the lines per chunk: an even share rounded up to whole records.

    The result is always a positive multiple of *record_lines* (4 by default,
    matching the original ``split_size % 4`` rounding). Integer arithmetic is
    used throughout, so the computation terminates for every input.

    Args:
        n_lines: Total number of lines in the input.
        n_chunks: Maximum number of chunks wanted.
        record_lines: Number of lines that must stay together in one chunk.
    """
    per_chunk = -(-n_lines // n_chunks)  # ceiling division
    aligned = -(-per_chunk // record_lines) * record_lines
    return max(record_lines, aligned)


def split_fastq(path, split_size, prefix="./split_fastq_F_"):
    """Write consecutive runs of *split_size* lines of *path* to numbered files.

    Each line is stripped of surrounding whitespace and written with a single
    trailing newline.

    Args:
        path: File to split.
        split_size: Maximum number of lines per chunk; must be at least 1.
        prefix: Chunk files are named ``prefix + "0"``, ``prefix + "1"``, ...

    Returns:
        The list of chunk file names, in input order. Empty for an empty input.

    Raises:
        ValueError: If *split_size* is smaller than 1.
    """
    if split_size < 1:
        raise ValueError("split_size must be at least 1")
    split_fastq_list = []
    outfile = None
    try:
        with open(path, "r") as infile:
            for counter, line in enumerate(infile):
                if counter % split_size == 0:
                    # Chunk boundary: finish the previous chunk, start the next.
                    if outfile is not None:
                        outfile.close()
                    filename = prefix + str(len(split_fastq_list))
                    split_fastq_list.append(filename)
                    # "w" so that a rerun does not append to stale chunks.
                    outfile = open(filename, "w")
                outfile.write(line.strip() + "\n")
    finally:
        # Also covers an empty input, where no chunk was ever opened.
        if outfile is not None:
            outfile.close()
    return split_fastq_list


def main():
    """Split ``fastq_F`` into at most ``numcores`` chunks and process them in parallel."""
    # Imported here so the splitting helpers can be used without joblib or
    # heavy_lifting being importable.
    import heavy_lifting
    from joblib import Parallel, delayed

    # Get input fastq file dimensions
    length_fastq = count_lines(fastq_F)
    print(length_fastq)
    split_size = compute_split_size(length_fastq, numcores)
    print(split_size)

    split_fastq_list = split_fastq(fastq_F, split_size)
    # NOTE(review): every job is given the same "output.fastq", so the order
    # of the chunks in the result depends on job scheduling - confirm this is
    # acceptable.
    Parallel(n_jobs=numcores)(
        delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq")
        for i in split_fastq_list
    )


if __name__ == '__main__':
    main()

#/usr/bin/env python 进口重型起重机从joblib并行导入，延迟导入多处理 numcores=4 fastq\u F=“fastq\u F.fastq” #创建一些逻辑，将输入fastq文件拆分为块，以便并行处理。 #获取输入fastq文件维度以开放式（fastq_F，“r”）作为填充：长度\u fastq=len（infle.readlines（））打印（长度\u fastq） lines=infle.readlines（）分割尺寸=长度快速Q/numcores 打印（拆分大小） #迭代输入fastq文件，将行写入BIN中的输出文件。计数器=0 拆分计数器=0 拆分快速列表=[] 以开放式（fastq_F，“r”）作为填充：对于填充中的线：如果计数器==0： filename=str（“./split\u fastq\u F\u”+str（split\u计数器）） split_fastq_list.append（文件名） outfile=open（文件名“a”）计数器+=1 elif计数器需要函数名，而不是文件/模块名因此，在heavy_-lifting 中，必须将代码放入函数中（使用参数而不是args ）然后你可以使用 Parallel(n_jobs=numcores)(delayed(heavy_lifting.my_function)(i, "output.fastq") for i in split_fastq_list) Parallel 需要函数名，而不是文件/模块名因此，在heavy_-lifting 中，必须将代码放入函数中（使用参数而不是args ）然后你可以使用 Parallel(n_jobs=numcores)(delayed(heavy_lifting.my_function)(i, "output.fastq") for i in split_fastq_list) 为了再现性，我将furas提供的答案应用到heavy_-lifting.py 和wrapper.py 脚本中。需要进行额外的编辑才能使代码运行，这就是我提供以下内容的原因重型吊装.py #!/usr/bin/env python import argparse # Read in arguments parser = argparse.ArgumentParser() parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') args = parser.parse_args() # Iterate through input file and append to output file with open(args.inputFastq, "r") as infile: with open(args.outputFastq, "a") as outfile: for line in infile: outfile.write("modified" + line) #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores print(split_size) # Iterate through input fastq file writing lines to outfile in bins. 
counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") counter += 1 elif counter <= split_size: outfile.write(line.strip()) counter += 1 else: counter = 0 split_counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list) #!/usr/bin/env python import argparse # Read in arguments #parser = argparse.ArgumentParser() #parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') #parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') #args = parser.parse_args() def heavy_lifting_fun(inputFastq, outputFastq): # Iterate through input file and append to output file outfile = open(outputFastq, "a") with open(inputFastq, "r") as infile: for line in infile: outfile.write("modified" + line.strip() + "\n") outfile.close() if __name__ == '__main__': heavy_lifting_fun() #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores while (split_size % 4 != 0): split_size += 1 print(split_size) # Iterate through input fastq file writing lines to outfile in bins. 
counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: print(counter) #if counter == 0 and line[0] != "@": # continue if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 elif counter < split_size: outfile.write(str(line.strip() + "\n")) counter += 1 else: counter = 0 split_counter += 1 outfile.close() filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list) wrapper.py #!/usr/bin/env python import argparse # Read in arguments parser = argparse.ArgumentParser() parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') args = parser.parse_args() # Iterate through input file and append to output file with open(args.inputFastq, "r") as infile: with open(args.outputFastq, "a") as outfile: for line in infile: outfile.write("modified" + line) #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores print(split_size) # Iterate through input fastq file writing lines to outfile in bins. 
counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") counter += 1 elif counter <= split_size: outfile.write(line.strip()) counter += 1 else: counter = 0 split_counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list) #!/usr/bin/env python import argparse # Read in arguments #parser = argparse.ArgumentParser() #parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') #parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') #args = parser.parse_args() def heavy_lifting_fun(inputFastq, outputFastq): # Iterate through input file and append to output file outfile = open(outputFastq, "a") with open(inputFastq, "r") as infile: for line in infile: outfile.write("modified" + line.strip() + "\n") outfile.close() if __name__ == '__main__': heavy_lifting_fun() #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores while (split_size % 4 != 0): split_size += 1 print(split_size) # Iterate through input fastq file writing lines to outfile in bins. 
counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: print(counter) #if counter == 0 and line[0] != "@": # continue if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 elif counter < split_size: outfile.write(str(line.strip() + "\n")) counter += 1 else: counter = 0 split_counter += 1 outfile.close() filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list) #/usr/bin/env python 进口重型起重机从joblib并行导入，延迟导入多处理 numcores=4 fastq\u F=“fastq\u F.fastq” #创建一些逻辑，将输入fastq文件拆分为块，以便并行处理。 #获取输入fastq文件维度以开放式（fastq_F，“r”）作为填充：长度\u fastq=len（infle.readlines（））打印（长度\u fastq） lines=infle.readlines（）分割尺寸=长度快速Q/numcores 而（拆分大小%4！=0）：拆分大小+=1 打印（拆分大小） #迭代输入fastq文件，将行写入BIN中的输出文件。计数器=0 拆分计数器=0 拆分快速列表=[] 以开放式（fastq_F，“r”）作为填充：对于填充中的线：打印（计数器） #如果计数器==0且行[0]！="@": #继续如果计数器==0： filename=str（“./split\u fastq\u F\u”+str（split\u计数器）） split_fastq_list.append（文件名） outfile=open（文件名“a”） outfile.write（str（line.strip（）+“\n”））计数器+=1 elif计数器<拆分大小： outfile.write（str（line.strip（）+“\n”））计数器+=1 其他：计数器=0 拆分计数器+=1 outfile.close（） filename=str（“./split\u fastq\u F\u”+str（split\u计数器）） split_fastq_list.append（文件名） outfile=open（文件名“a”） outfile.write（str（line.strip（）+“\n”））计数器+=1 outfile.close（）并行（n_jobs=numcores）（延迟（重载提升。重载提升乐趣）（i，“output.fastq”）用于拆分快速列表中的i）对于再现性，我将furas提供的答案应用到重载提升.py 和包装器.py 脚本中。需要进行额外的编辑才能使代码运行，这就是我提供以下内容的原因重型吊装.py #!/usr/bin/env python import argparse # Read in arguments parser = argparse.ArgumentParser() parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') args = 
parser.parse_args() # Iterate through input file and append to output file with open(args.inputFastq, "r") as infile: with open(args.outputFastq, "a") as outfile: for line in infile: outfile.write("modified" + line) #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores print(split_size) # Iterate through input fastq file writing lines to outfile in bins. counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") counter += 1 elif counter <= split_size: outfile.write(line.strip()) counter += 1 else: counter = 0 split_counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list) #!/usr/bin/env python import argparse # Read in arguments #parser = argparse.ArgumentParser() #parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') #parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') #args = parser.parse_args() def heavy_lifting_fun(inputFastq, outputFastq): # Iterate through input file and append to output file outfile = open(outputFastq, "a") with open(inputFastq, "r") as infile: for line in infile: outfile.write("modified" + line.strip() + "\n") outfile.close() if __name__ == '__main__': heavy_lifting_fun() #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for 
parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores while (split_size % 4 != 0): split_size += 1 print(split_size) # Iterate through input fastq file writing lines to outfile in bins. counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: print(counter) #if counter == 0 and line[0] != "@": # continue if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 elif counter < split_size: outfile.write(str(line.strip() + "\n")) counter += 1 else: counter = 0 split_counter += 1 outfile.close() filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list) wrapper.py #!/usr/bin/env python import argparse # Read in arguments parser = argparse.ArgumentParser() parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') args = parser.parse_args() # Iterate through input file and append to output file with open(args.inputFastq, "r") as infile: with open(args.outputFastq, "a") as outfile: for line in infile: outfile.write("modified" + line) #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. 
# Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores print(split_size) # Iterate through input fastq file writing lines to outfile in bins. counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") counter += 1 elif counter <= split_size: outfile.write(line.strip()) counter += 1 else: counter = 0 split_counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list) #!/usr/bin/env python import argparse # Read in arguments #parser = argparse.ArgumentParser() #parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq') #parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq') #args = parser.parse_args() def heavy_lifting_fun(inputFastq, outputFastq): # Iterate through input file and append to output file outfile = open(outputFastq, "a") with open(inputFastq, "r") as infile: for line in infile: outfile.write("modified" + line.strip() + "\n") outfile.close() if __name__ == '__main__': heavy_lifting_fun() #!/usr/bin/env python import heavy_lifting from joblib import Parallel, delayed import multiprocessing numcores = 4 fastq_F = "fastq_F.fastq" #Create some logic to split the input fastq file into chunks for parallel processing. # Get input fastq file dimensions with open(fastq_F, "r") as infile: length_fastq = len(infile.readlines()) print(length_fastq) lines = infile.readlines() split_size = length_fastq / numcores while (split_size % 4 != 0): split_size += 1 print(split_size) # Iterate through input fastq file writing lines to outfile in bins. 
counter = 0 split_counter = 0 split_fastq_list = [] with open(fastq_F, "r") as infile: for line in infile: print(counter) #if counter == 0 and line[0] != "@": # continue if counter == 0: filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 elif counter < split_size: outfile.write(str(line.strip() + "\n")) counter += 1 else: counter = 0 split_counter += 1 outfile.close() filename = str("./split_fastq_F_" + str(split_counter)) split_fastq_list.append(filename) outfile = open(filename, "a") outfile.write(str(line.strip() + "\n")) counter += 1 outfile.close() Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list) #/usr/bin/env python 进口重型起重机从joblib并行导入，延迟导入多处理 numcores=4 fastq\u F=“fastq\u F.fastq” #创建一些逻辑，将输入fastq文件拆分为块，以便并行处理。 #获取输入fastq文件维度以开放式（fastq_F，“r”）作为填充：长度\u fastq=len（infle.readlines（））打印（长度\u fastq） lines=infle.readlines（）分割尺寸=长度快速Q/numcores 而（拆分大小%4！=0）：拆分大小+=1 打印（拆分大小） #迭代输入fastq文件，将行写入BIN中的输出文件。计数器=0 拆分计数器=0 拆分快速列表=[] 以开放式（fastq_F，“r”）作为填充：对于填充中的线：打印（计数器） #如果计数器==0且行[0]！="@": #继续如果计数器==0： filename=str（“./split\u fastq\u F\u”+str（split\u计数器）） split_fastq_list.append（文件名） outfile=open（文件名“a”） outfile.write（str（line.strip（）+“\n”））计数器+=1 elif计数器<拆分大小： outfile.write（str（line.strip（）+“\n”））计数器+=1 其他：计数器=0 拆分计数器+=1 outfile.close（） filename=str（“./split\u fastq\u F\u”+str（split\u计数器）） split_fastq_list.append（文件名）输出文件