使用python包装器并行化python脚本
标签:python、parallel-processing、bioinformatics、joblib、fastq
我有一个 Python 脚本 heavy_lifting.py,并通过 bash 包装脚本 wrapper.sh 调用 GNU Parallel 对其进行了并行化。
我用它来处理 fastq 格式的文件(参见下面的 example.fastq)。
虽然这种做法可行,但同时依赖两种解释器和两套依赖项并不优雅。我想用 Python 重写这个 bash 包装脚本,并实现同样的并行化。
example.fastq
这是一个需要处理的输入文件示例。实际的输入文件通常非常长(约 500,000,000 行)。
wrapper.sh
#!/bin/bash
# Split a FASTQ file into NUMCORES pieces, run heavy_lifting.py on every piece
# with GNU Parallel, and collect the results in output.fastq.
set -euo pipefail

NUMCORES="4"
FASTQ_F="./fastq_F.fastq"

# Always remove the intermediate split files, even when a step below fails.
trap 'rm -f split_fastq_*' EXIT

# Split the input fastq for parallel processing. One split fastq file is
# created for each core; the "l/N" form never breaks a line in the middle.
# NOTE(review): "l/N" balances pieces by size, not by 4-line FASTQ record --
# confirm that records cannot straddle two pieces for your data.
split --number="l/$NUMCORES" "$FASTQ_F" split_fastq_F_

# Feed the split fastq files to GNU Parallel to invoke parallel executions of
# `heavy_lifting.py`. The file names are passed directly via `:::` instead of
# being recovered from `ls` output, which is fragile with unusual file names.
parallel "python heavy_lifting.py -i {} -o output.fastq" ::: split_fastq_F_*
要执行这些脚本,我使用命令 bash wrapper.sh。
运行后会生成结果文件 output.fastq,其中包含修改后的 fastq 内容。
下面是我尝试用 Python 包装脚本 wrapper.py 实现并行处理的代码。
wrapper.py
#!/usr/bin/env python
"""Prefix every line of a FASTQ file with "modified" and append it to an output file.

Usage:
    python heavy_lifting.py -i INPUT.fastq -o OUTPUT.fastq
"""
import argparse

# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
args = parser.parse_args()

# Iterate through the input file and append each modified line to the output
# file. The output is opened in append mode, so repeated invocations with the
# same -o path accumulate into one file instead of overwriting it.
with open(args.inputFastq, "r") as infile:
    with open(args.outputFastq, "a") as outfile:
        for line in infile:
            # `line` still carries its trailing newline, so none is added here.
            outfile.write("modified" + line)
#!/usr/bin/env python
# wrapper.py, first attempt: split fastq_F.fastq into chunk files and hand each
# chunk to joblib.Parallel.
# NOTE(review): the indentation of this snippet was lost, so the nesting of the
# statements below has to be inferred -- confirm against the original post
# before running it.
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
# Counts the lines by reading the whole file into a list.
length_fastq = len(infile.readlines())
print(length_fastq)
# NOTE(review): the handle was already read to the end by the call above, and
# `lines` is not used anywhere else in this snippet.
lines = infile.readlines()
# NOTE(review): `/` yields a float under Python 3 -- confirm the interpreter.
split_size = length_fastq / numcores
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
# counter == 0 marks the start of a new chunk file.
# NOTE(review): this branch opens the chunk file but never writes `line`.
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
counter += 1
elif counter <= split_size:
# NOTE(review): strip() removes the trailing newline and nothing re-adds it.
outfile.write(line.strip())
counter += 1
else:
# Chunk is full: reset so that the next line starts a new chunk file.
# NOTE(review): `line` is not written in this branch either.
counter = 0
split_counter += 1
outfile.close()
# NOTE(review): `heavy_lifting` is the imported module object, not a function;
# delayed() is being handed the module itself here.
Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list)
#!/usr/bin/env python
"""Prefix every line of a FASTQ file with "modified".

The work is exposed as ``heavy_lifting_fun`` so that a wrapper can import this
module and call the function directly; running the file as a script keeps the
original ``-i`` / ``-o`` command-line interface.
"""
import argparse


def heavy_lifting_fun(inputFastq, outputFastq):
    """Append a modified copy of every line of *inputFastq* to *outputFastq*.

    Each input line is stripped of surrounding whitespace, prefixed with
    ``"modified"`` and written followed by a single newline.

    Args:
        inputFastq: Path of the file to read.
        outputFastq: Path of the file to append to; created if missing.

    Raises:
        IOError/OSError: If either file cannot be opened.
    """
    # The output is opened first and in append mode, as before; the `with`
    # statement guarantees both handles are closed even if a write fails.
    with open(outputFastq, "a") as outfile, open(inputFastq, "r") as infile:
        for line in infile:
            outfile.write("modified" + line.strip() + "\n")


if __name__ == '__main__':
    # Read in arguments. The function takes two required parameters, so the
    # script entry point has to supply them from the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
    parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
    args = parser.parse_args()
    heavy_lifting_fun(args.inputFastq, args.outputFastq)
#!/usr/bin/env python
"""Split a FASTQ file into chunks and process the chunks in parallel.

The input is cut into at most ``numcores`` chunk files whose line counts are
multiples of four, so a 4-line FASTQ record is never split across two chunks.
Each chunk is then passed to ``heavy_lifting.heavy_lifting_fun`` through
joblib.
"""
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing

numcores = 4
fastq_F = "fastq_F.fastq"


def _count_lines(path):
    """Return the number of lines in *path* without loading it into memory."""
    with open(path, "r") as infile:
        return sum(1 for _ in infile)


def _chunk_size(total_lines, n_chunks, multiple=4):
    """Return ceil(total_lines / n_chunks), rounded up to a multiple of *multiple*.

    Integer arithmetic is used throughout: a float quotient can never become
    divisible by four by adding one, which made the previous rounding loop
    spin forever on Python 3.
    """
    per_chunk = -(-total_lines // n_chunks)  # ceiling division
    return -(-per_chunk // multiple) * multiple


def _split_fastq(path, lines_per_chunk, prefix="./split_fastq_F_"):
    """Write *path* out as numbered chunk files and return their names in order."""
    chunk_names = []
    outfile = None
    try:
        with open(path, "r") as infile:
            for index, line in enumerate(infile):
                # Every `lines_per_chunk` lines, move on to the next chunk file.
                if index % lines_per_chunk == 0:
                    if outfile is not None:
                        outfile.close()
                    name = prefix + str(len(chunk_names))
                    chunk_names.append(name)
                    # "w" rather than "a": chunks left over from an earlier
                    # run must not be duplicated into this run's data.
                    outfile = open(name, "w")
                outfile.write(line.strip() + "\n")
    finally:
        if outfile is not None:
            outfile.close()
    return chunk_names


def main():
    """Split the input, then run the heavy lifting on every chunk in parallel."""
    # Get input fastq file dimensions.
    length_fastq = _count_lines(fastq_F)
    print(length_fastq)
    split_size = _chunk_size(length_fastq, numcores)
    print(split_size)

    # Write the input lines to the chunk files in bins of `split_size` lines.
    split_fastq_list = _split_fastq(fastq_F, split_size)

    # NOTE(review): every worker appends to the same output.fastq, so the
    # relative order of chunks in the result is not fixed -- confirm that this
    # is acceptable for downstream tools.
    Parallel(n_jobs=numcores)(
        delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq")
        for i in split_fastq_list
    )


if __name__ == "__main__":
    main()
#/usr/bin/env python
进口重型起重机
从joblib并行导入,延迟
导入多处理
numcores=4
fastq\u F=“fastq\u F.fastq”
#创建一些逻辑,将输入fastq文件拆分为块,以便并行处理。
#获取输入fastq文件维度
以开放式(fastq_F,“r”)作为填充:
长度\u fastq=len(infle.readlines())
打印(长度\u fastq)
lines=infle.readlines()
分割尺寸=长度快速Q/numcores
打印(拆分大小)
#迭代输入fastq文件,将行写入BIN中的输出文件。
计数器=0
拆分计数器=0
拆分快速列表=[]
以开放式(fastq_F,“r”)作为填充:
对于填充中的线:
如果计数器==0:
filename=str(“./split\u fastq\u F\u”+str(split\u计数器))
split_fastq_list.append(文件名)
outfile=open(文件名“a”)
计数器+=1
elif计数器
Parallel 需要的是函数名,而不是文件/模块名。
因此,在 heavy_lifting 中必须把代码放进一个函数里(用函数参数代替 args),
然后就可以这样调用:
Parallel(n_jobs=numcores)(delayed(heavy_lifting.my_function)(i, "output.fastq") for i in split_fastq_list)
Parallel 需要的是函数名,而不是文件/模块名。
因此,在 heavy_lifting 中必须把代码放进一个函数里(用函数参数代替 args),
然后就可以这样调用:
Parallel(n_jobs=numcores)(delayed(heavy_lifting.my_function)(i, "output.fastq") for i in split_fastq_list)
为了便于复现,我把 furas 给出的答案应用到了 heavy_lifting.py 和 wrapper.py 两个脚本中。
要让代码真正运行起来还需要一些额外的修改,因此我在下面给出完整的代码。
heavy_lifting.py
#!/usr/bin/env python
import argparse
# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
args = parser.parse_args()
# Iterate through input file and append to output file
with open(args.inputFastq, "r") as infile:
with open(args.outputFastq, "a") as outfile:
for line in infile:
outfile.write("modified" + line)
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
counter += 1
elif counter <= split_size:
outfile.write(line.strip())
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list)
#!/usr/bin/env python
import argparse
# Read in arguments
#parser = argparse.ArgumentParser()
#parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
#parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
#args = parser.parse_args()
def heavy_lifting_fun(inputFastq, outputFastq):
# Iterate through input file and append to output file
outfile = open(outputFastq, "a")
with open(inputFastq, "r") as infile:
for line in infile:
outfile.write("modified" + line.strip() + "\n")
outfile.close()
if __name__ == '__main__':
heavy_lifting_fun()
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
while (split_size % 4 != 0):
split_size += 1
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
print(counter)
#if counter == 0 and line[0] != "@":
# continue
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
elif counter < split_size:
outfile.write(str(line.strip() + "\n"))
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list)
wrapper.py
#!/usr/bin/env python
import argparse
# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
args = parser.parse_args()
# Iterate through input file and append to output file
with open(args.inputFastq, "r") as infile:
with open(args.outputFastq, "a") as outfile:
for line in infile:
outfile.write("modified" + line)
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
counter += 1
elif counter <= split_size:
outfile.write(line.strip())
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list)
#!/usr/bin/env python
import argparse
# Read in arguments
#parser = argparse.ArgumentParser()
#parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
#parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
#args = parser.parse_args()
def heavy_lifting_fun(inputFastq, outputFastq):
# Iterate through input file and append to output file
outfile = open(outputFastq, "a")
with open(inputFastq, "r") as infile:
for line in infile:
outfile.write("modified" + line.strip() + "\n")
outfile.close()
if __name__ == '__main__':
heavy_lifting_fun()
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
while (split_size % 4 != 0):
split_size += 1
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
print(counter)
#if counter == 0 and line[0] != "@":
# continue
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
elif counter < split_size:
outfile.write(str(line.strip() + "\n"))
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list)
#/usr/bin/env python
进口重型起重机
从joblib并行导入,延迟
导入多处理
numcores=4
fastq\u F=“fastq\u F.fastq”
#创建一些逻辑,将输入fastq文件拆分为块,以便并行处理。
#获取输入fastq文件维度
以开放式(fastq_F,“r”)作为填充:
长度\u fastq=len(infle.readlines())
打印(长度\u fastq)
lines=infle.readlines()
分割尺寸=长度快速Q/numcores
而(拆分大小%4!=0):
拆分大小+=1
打印(拆分大小)
#迭代输入fastq文件,将行写入BIN中的输出文件。
计数器=0
拆分计数器=0
拆分快速列表=[]
以开放式(fastq_F,“r”)作为填充:
对于填充中的线:
打印(计数器)
#如果计数器==0且行[0]!="@":
#继续
如果计数器==0:
filename=str(“./split\u fastq\u F\u”+str(split\u计数器))
split_fastq_list.append(文件名)
outfile=open(文件名“a”)
outfile.write(str(line.strip()+“\n”))
计数器+=1
elif计数器<拆分大小:
outfile.write(str(line.strip()+“\n”))
计数器+=1
其他:
计数器=0
拆分计数器+=1
outfile.close()
filename=str(“./split\u fastq\u F\u”+str(split\u计数器))
split_fastq_list.append(文件名)
outfile=open(文件名“a”)
outfile.write(str(line.strip()+“\n”))
计数器+=1
outfile.close()
并行(n_jobs=numcores)(延迟(重载提升。重载提升乐趣)(i,“output.fastq”)用于拆分快速列表中的i)
为了便于复现,我把 furas 给出的答案应用到了 heavy_lifting.py 和 wrapper.py 两个脚本中。
要让代码真正运行起来还需要一些额外的修改,因此我在下面给出完整的代码。
heavy_lifting.py
#!/usr/bin/env python
import argparse
# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
args = parser.parse_args()
# Iterate through input file and append to output file
with open(args.inputFastq, "r") as infile:
with open(args.outputFastq, "a") as outfile:
for line in infile:
outfile.write("modified" + line)
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
counter += 1
elif counter <= split_size:
outfile.write(line.strip())
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list)
#!/usr/bin/env python
import argparse
# Read in arguments
#parser = argparse.ArgumentParser()
#parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
#parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
#args = parser.parse_args()
def heavy_lifting_fun(inputFastq, outputFastq):
# Iterate through input file and append to output file
outfile = open(outputFastq, "a")
with open(inputFastq, "r") as infile:
for line in infile:
outfile.write("modified" + line.strip() + "\n")
outfile.close()
if __name__ == '__main__':
heavy_lifting_fun()
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
while (split_size % 4 != 0):
split_size += 1
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
print(counter)
#if counter == 0 and line[0] != "@":
# continue
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
elif counter < split_size:
outfile.write(str(line.strip() + "\n"))
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list)
wrapper.py
#!/usr/bin/env python
import argparse
# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
args = parser.parse_args()
# Iterate through input file and append to output file
with open(args.inputFastq, "r") as infile:
with open(args.outputFastq, "a") as outfile:
for line in infile:
outfile.write("modified" + line)
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
counter += 1
elif counter <= split_size:
outfile.write(line.strip())
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting)(i, "output.fastq") for i in split_fastq_list)
#!/usr/bin/env python
import argparse
# Read in arguments
#parser = argparse.ArgumentParser()
#parser.add_argument('-i', '--inputFastq', required=True, help='forward .fastq')
#parser.add_argument('-o', '--outputFastq', required=True, help='output .fastq')
#args = parser.parse_args()
def heavy_lifting_fun(inputFastq, outputFastq):
# Iterate through input file and append to output file
outfile = open(outputFastq, "a")
with open(inputFastq, "r") as infile:
for line in infile:
outfile.write("modified" + line.strip() + "\n")
outfile.close()
if __name__ == '__main__':
heavy_lifting_fun()
#!/usr/bin/env python
import heavy_lifting
from joblib import Parallel, delayed
import multiprocessing
numcores = 4
fastq_F = "fastq_F.fastq"
#Create some logic to split the input fastq file into chunks for parallel processing.
# Get input fastq file dimensions
with open(fastq_F, "r") as infile:
length_fastq = len(infile.readlines())
print(length_fastq)
lines = infile.readlines()
split_size = length_fastq / numcores
while (split_size % 4 != 0):
split_size += 1
print(split_size)
# Iterate through input fastq file writing lines to outfile in bins.
counter = 0
split_counter = 0
split_fastq_list = []
with open(fastq_F, "r") as infile:
for line in infile:
print(counter)
#if counter == 0 and line[0] != "@":
# continue
if counter == 0:
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
elif counter < split_size:
outfile.write(str(line.strip() + "\n"))
counter += 1
else:
counter = 0
split_counter += 1
outfile.close()
filename = str("./split_fastq_F_" + str(split_counter))
split_fastq_list.append(filename)
outfile = open(filename, "a")
outfile.write(str(line.strip() + "\n"))
counter += 1
outfile.close()
Parallel(n_jobs=numcores)(delayed(heavy_lifting.heavy_lifting_fun)(i, "output.fastq") for i in split_fastq_list)
#/usr/bin/env python
进口重型起重机
从joblib并行导入,延迟
导入多处理
numcores=4
fastq\u F=“fastq\u F.fastq”
#创建一些逻辑,将输入fastq文件拆分为块,以便并行处理。
#获取输入fastq文件维度
以开放式(fastq_F,“r”)作为填充:
长度\u fastq=len(infle.readlines())
打印(长度\u fastq)
lines=infle.readlines()
分割尺寸=长度快速Q/numcores
而(拆分大小%4!=0):
拆分大小+=1
打印(拆分大小)
#迭代输入fastq文件,将行写入BIN中的输出文件。
计数器=0
拆分计数器=0
拆分快速列表=[]
以开放式(fastq_F,“r”)作为填充:
对于填充中的线:
打印(计数器)
#如果计数器==0且行[0]!="@":
#继续
如果计数器==0:
filename=str(“./split\u fastq\u F\u”+str(split\u计数器))
split_fastq_list.append(文件名)
outfile=open(文件名“a”)
outfile.write(str(line.strip()+“\n”))
计数器+=1
elif计数器<拆分大小:
outfile.write(str(line.strip()+“\n”))
计数器+=1
其他:
计数器=0
拆分计数器+=1
outfile.close()
filename=str(“./split\u fastq\u F\u”+str(split\u计数器))
split_fastq_list.append(文件名)
输出文件