Python 2.7 要从fasta文件中提取dna序列，请在另一个位置使用基因ID_Python 2.7_Biopython

Python 2.7 要从fasta文件中提取dna序列，请在另一个位置使用基因ID

python-2.7

Python 2.7 要从fasta文件中提取dna序列，请在另一个位置使用基因ID,python-2.7,biopython,Python 2.7,Biopython,我创建了一个小程序，从fasta文件中提取选定的ids+序列。感兴趣的ID是包含该基因多个序列的文件名。以下是节目： import glob, sys, os from Bio import SeqIO, SearchIO from Bio.SeqRecord import SeqRecord import argparse def help_function(): print """ usage: to_extract_seq_and_id.py [-h] [-i i

我创建了一个小程序，从fasta文件中提取选定的ids+序列。感兴趣的ID是包含该基因多个序列的文件名。以下是节目：

import  glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse




def help_function():
    print """
    usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data]  [-r reference file: path_to_file ]
    [-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')

opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    else:
        print file_description + " detected"
def record_extraction(geneID,reference,output):

    records=list(SeqIO.parse(opts.reference,'fasta'))
    new_reference=output + '/new_reference_common_genes.fa'
    output_handle=open(new_reference, 'a')
    with open (opts.reference, 'rU') as input_handle:
        for record in records:
            recordID=record.id 

            if recordID == geneID:              
                SeqIO.write(record, output_handle, 'fasta')
            else:
                continue
            return new_reference
def main():
    if len(sys.argv) <=2:
        parser.print_help()
        sys.exit()
    else:
        check_file_exists(opts.input_files, 'input_files')
        check_file_exists(opts.reference, 'reference_file')
        check_file_exists(opts.output_directory, 'output_directory')

        files=(glob.glob(opts.input_files + '/*.fa'))

        for f in files:
            database_files=glob.glob(f)[0]
            file_name=os.path.basename(f)
            gene_id=file_name.split('.')
            gene_name=gene_id[0].split('_')
            geneID=gene_name[1] + '_' + gene_name[2]
        print 'The new reference fasta file has been create'




        new_reference=record_extraction(geneID,opts.reference,opts.output_directory)
main()

导入全局、系统、操作系统
来自Bio import SeqIO、SearchIO
从Bio.SeqRecord导入SeqRecord
导入argparse
def help_函数（）：
打印“”
用法：to_extract_seq_和_id.py[-h][-i输入_文件：数据路径][-r参考文件：路径_到_文件]
[-o输出目录：存储新文件的路径]“”“
parser=argparse.ArgumentParser（）
parser.add_参数（'-input_files'，'-i'，type=str，help='path_to_data'）
parser.add_参数（'-reference'，'-r'，type=str，help='path_到_fasta_reference_文件'）
parser.add_参数（'-output_directory'，'-o'，type=str，help='path_to_store_new_file'）
opts=parser.parse_args（）
#第一个用于检查文件是否存在的函数。
def check_file_存在（文件路径、文件描述）：
如果不存在os.path（文件路径）：
打印（“文件描述+”（“+filepath+”）不存在）
系统出口（1）
其他：
打印文件\u说明+“检测到”
def记录提取（geneID、参考、输出）：
记录=列表（SeqIO.parse（opts.reference，'fasta'））
new_reference=output+'/new_reference_common_genes.fa'
输出\句柄=打开（新\引用'a'）
以open（opts.reference，'rU'）作为输入\u句柄：
记录中的记录：
recordID=record.id
如果recordID==geneID：
SeqIO.write（记录、输出\句柄'fasta'）
其他：
持续
返回新的\u引用
def main（）：
如果len（sys.argv）这个问题有点不清楚，但可能这就是您要寻找的：
results = []
for record in records:
    recordID=record.id  #.split('_')

    if recordID == geneID:
        results.append(record)
    else:
        continue           
SeqIO.write(" ".join(text for text in results), output_handle, 'fasta')
return new_reference

如果这不是你要找的。请详细说明您需要的问题和解决方案。
我遇到的问题是缩进问题。如果查看上面的代码，可以看到主函数（def main（））中的for循环中没有调用def record\u extraction（））。我已经改变了这个缩进，现在它确实工作得很好。
请参见上面的新脚本：
import  glob, sys, os
from Bio import SeqIO, SearchIO
from Bio.SeqRecord import SeqRecord
import argparse

def help_function():
    print """
    usage: to_extract_seq_and_id.py [-h] [-i input_file:path to data]  [-r reference file: path_to_file ]
    [-o output_directory: path_to_store_new_file] """
parser = argparse.ArgumentParser()
parser.add_argument('-input_files', '-i',type=str,help='path_to_data')
parser.add_argument('-reference', '-r',type=str,help='path_to_the_fasta_reference_file')
parser.add_argument('-output_directory','-o', type=str, help='path_to_store_new_file')

opts = parser.parse_args()
#first function to check if the files exits.
def check_file_exists(filepath, file_description):
    if not os.path.exists(filepath):
        print("The " + file_description + " (" + filepath + ") does not exist")
        sys.exit(1)
    else:
        print file_description + " detected"
def record_extraction(geneID,reference,output,genelist):

    records=list(SeqIO.parse(opts.reference,'fasta'))
    new_reference=output + '/new_reference_common_genes.fa'
    output_handle=open(new_reference, 'a')
    with open (opts.reference, 'rU') as input_handle:
        for record in records:
            recordid=record.id.split('_')
            recordID=recordid[0]+'_'+recordid[1]                
            if recordID in genelist: 

                SeqIO.write(record, output_handle, 'fasta')
            else:
                continue
        return new_reference    
def main():
    if len(sys.argv) <=2:
        parser.print_help()
        sys.exit()
    else:
        check_file_exists(opts.input_files, 'input_files')
        check_file_exists(opts.reference, 'reference_file')
        check_file_exists(opts.output_directory, 'output_directory')
        #run the programme
        files=(glob.glob(opts.input_files + '/*.fa'))

        for f in files:
            database_files=glob.glob(f)[0]
            file_name=os.path.basename(f)
            gene_id=file_name.split('.')
            gene_name=gene_id[0].split('_')
            geneID=gene_name[1]+'_'+gene_name[2]
            genelist=[]
            if geneID not in genelist:
                genelist.append(geneID)
            new_reference=record_extraction(geneID,opts.reference,opts.output_directory,genelist)

    print 'The new reference fasta file has been create'        

main()

导入全局、系统、操作系统
来自Bio import SeqIO、SearchIO
从Bio.SeqRecord导入SeqRecord
导入argparse
def help_函数（）：
打印“”
用法：to_extract_seq_和_id.py[-h][-i输入_文件：数据路径][-r参考文件：路径_到_文件]
[-o输出目录：存储新文件的路径]“”“
parser=argparse.ArgumentParser（）
parser.add_参数（'-input_files'，'-i'，type=str，help='path_to_data'）
parser.add_参数（'-reference'，'-r'，type=str，help='path_到_fasta_reference_文件'）
parser.add_参数（'-output_directory'，'-o'，type=str，help='path_to_store_new_file'）
opts=parser.parse_args（）
#第一个用于检查文件是否存在的函数。
def check_file_存在（文件路径、文件描述）：
如果不存在os.path（文件路径）：
打印（“文件描述+”（“+filepath+”）不存在）
系统出口（1）
其他：
打印文件\u说明+“检测到”
def记录提取（geneID、引用、输出、genelist）：
记录=列表（SeqIO.parse（opts.reference，'fasta'））
new_reference=output+'/new_reference_common_genes.fa'
输出\句柄=打开（新\引用'a'）
以open（opts.reference，'rU'）作为输入\u句柄：
记录中的记录：
recordid=record.id.split（“”）
recordID=recordID[0]+''.'+recordID[1]
如果基因列表中有recordID：
SeqIO.write（记录、输出\句柄'fasta'）
其他：
持续
返回新的\u引用
def main（）：
如果len（sys.argv）允许再现您的问题，那就好了。我们没有你的文件，所以我们怎么知道你的数据结构是什么。你甚至在发布之前都没有清理代码。