如何将NCBI数据库中的条目输出到R中的表中

如何将NCBI数据库中的条目输出到R中的表中,r,bioinformatics,R,Bioinformatics,谢谢你的阅读和帮助。 我从NCBI下载了一个genebank平面文件,其中包含许多条目。我想从每个基因中提取三个条目,并将它们组成一个表。如何实现?非常感谢您。--> 我的朋友用python为我写的: ================================================================================ import os import pandas as pd from tqdm import tqdm import sys def

谢谢你的阅读和帮助。
我从NCBI下载了一个GenBank平面文件(flat file),其中包含许多条目。我想从每个基因中提取三项信息,并将它们整理成一个表格。如何实现?非常感谢您。

下面是我的朋友用Python为我写的代码:

================================================================================

import os
import pandas as pd
from tqdm import tqdm
import sys

def search_line(gene_dict,gene_name,target,info,mode,l):
    """Consume one line of a quoted qualifier value and record it when complete.

    Args:
        gene_dict: Mapping of gene name -> {section name -> list of
            '<target> = <value>' strings}; updated in place.
        gene_name: Key of the gene currently being parsed.
        target: Qualifier name being collected (e.g. 'product').
        info: Fragments collected so far for a value that spans several lines.
        mode: Section name under which a finished value is stored.
        l: The current line of text.

    Returns:
        A tuple ``(gene_dict, info, keep_read)``. ``keep_read`` is 0 once the
        value is complete (``info`` is then a fresh empty list); otherwise it
        is ``target_list.index(target)`` and ``info`` holds the fragments seen
        so far.
    """
    label = '{} = '.format(target)

    if '/{}='.format(target) in l:
        pieces = l.split('"')
        if len(pieces) == 3:
            # Opening and closing quote on the same line: the value is complete.
            gene_dict[gene_name][mode].append(label + pieces[1].strip('\n'))
            return gene_dict, [], 0
        # Value continues on following lines: start a new fragment buffer.
        return gene_dict, [pieces[1].strip('\n')], target_list.index(target)

    if '"' not in l:
        # Middle line of a multi-line value.
        info.append(l.strip())
        return gene_dict, info, target_list.index(target)

    # Closing line of a multi-line value: join the fragments and store the
    # result unless an identical entry is already present.
    info.append(l.strip().strip('"\n'))
    entry = label + ' '.join(info)
    if entry not in gene_dict[gene_name][mode]:
        gene_dict[gene_name][mode].append(entry)
    return gene_dict, [], 0

def init_frame_dict(gene_dict,ids,mode):
    """Return a fresh output row for gene ``ids`` with every target blank.

    The row carries the gene's name under 'gene', ``mode`` under 'source',
    and one empty-string entry for each name in ``target_list`` after the
    first element.
    """
    return {
        'gene': gene_dict[ids]['gene'],
        'source': mode,
        **dict.fromkeys(target_list[1:], ''),
    }

def gen_frame(gene_dict,flat):
    """Flatten the parsed qualifiers into rows and write them to a CSV file.

    For every gene and every section listed in ``extract_list``, the stored
    '<target> = <value>' strings are distributed over row dictionaries. A new
    row is started whenever a target that is already filled in the current
    row shows up again, so repeated qualifiers end up on separate rows.

    Args:
        gene_dict: Mapping of gene name -> {section name -> list of
            '<target> = <value>' strings}.
        flat: Name of the input file; the output is named after it with the
            last five characters removed and '.csv' appended, and is written
            relative to the current working directory.
    """
    frame = []
    for ids, sections in gene_dict.items():
        for mode, data in sections.items():
            if mode not in extract_list:
                continue
            frame_dict = init_frame_dict(gene_dict, ids, mode)
            for target_data in data:
                for target in target_list[1:]:
                    prefix = '{} = '.format(target)
                    # Match on the leading label only. A substring test would
                    # misfile a value whose own text contains another label
                    # (e.g. a note mentioning "product = ...").
                    if not target_data.startswith(prefix):
                        continue
                    if frame_dict[target] != '':
                        # Target already present in this row: flush and start over.
                        frame.append(frame_dict)
                        frame_dict = init_frame_dict(gene_dict, ids, mode)

                    # Slice instead of split so the value is kept intact even
                    # when it contains the label text again.
                    frame_dict[target] = target_data[len(prefix):]
                    break
            frame.append(frame_dict)

    pd.DataFrame(frame).to_csv('{}.csv'.format(flat[:-5]))

def main():
    """Parse every '*flat' file under ``path_root`` and write one CSV per file.

    Each file is read whole and split on '/gene='; every chunk after the
    first is treated as one gene. Within a chunk, lines containing a feature
    key marker switch the current section, and the qualifiers product,
    protein_id, note and transcript_id are collected (including values that
    span several lines) through ``search_line``. The collected data is then
    handed to ``gen_frame``.

    Reads the module-level names ``path_root`` and ``target_list``.
    """
    # Marker substring -> section name, tested in this order; the first hit wins.
    feature_markers = (
        ('     mRNA', 'mRNA'),
        ('     ncRNA', 'ncRNA'),
        ('     CDS', 'CDS'),
        ('     misc_RNA', 'misc_RNA'),
        ('     exon', 'exon'),
    )
    # Qualifiers to collect, in the order they are tested on each line.
    qualifiers = ('product', 'protein_id', 'note', 'transcript_id')

    for flat in os.listdir(path_root):
        gene_dict = {}
        if flat[-4:] != 'flat':
            continue
        with open (os.path.join(path_root,flat)) as f:
            text = f.read()

        genes = text.split('/gene=')
        skip = False
        for gene in tqdm(genes[1:]):
            if skip:
                break
            chunk_lines = gene.split('\n')
            gene_name = chunk_lines[0].split('"')[1]
            # Per-chunk parser state.
            mode = 'init'
            target = 'none'
            read_mode = 0
            info = []
            # A gene name can occur in several chunks; create its entry only once.
            if gene_name not in gene_dict:
                gene_dict[gene_name] = {'gene':gene_name,'mRNA':[],'ncRNA':[],'CDS':[],'misc_RNA':[],'exon':[],}

            for l in chunk_lines:
                if 'ORIGIN' in l:
                    # Stop at the first line containing ORIGIN: nothing after
                    # it in this file is parsed.
                    skip = True
                    break
                for marker, section in feature_markers:
                    if marker in l:
                        mode = section
                        break

                for qualifier in qualifiers:
                    # A qualifier is handled either when it starts on this
                    # line (inside a known section) or when a multi-line value
                    # of that same qualifier is still being read. The original
                    # transcript_id continuation test compared target against
                    # 'note', so it could never fire; it now checks its own
                    # qualifier like the others.
                    if ('/{}='.format(qualifier) in l and mode != 'init') or (
                            target == qualifier
                            and read_mode == target_list.index(qualifier)):
                        target = qualifier
                        gene_dict,info,read_mode = search_line(gene_dict, gene_name, target, info, mode, l)
        gen_frame(gene_dict,flat)

if __name__ == '__main__':
    # Module-level settings read by the functions above.
    path_root = 'flats'
    extract_list = ['mRNA']
    target_list = ['none', 'product', 'transcript_id','protein_id','note']

    # Bail out early when the input directory is missing or empty.
    if not os.path.exists(path_root):
        print('Please put your flat files in flats/ directory !')
        sys.exit()
    if not os.listdir(path_root):
        print('No files found in flats/ directory.')
        sys.exit()

    main()

欢迎来到Stack Overflow。图片中的内容无法复制,请考虑更新您的问题,以文本形式提供输入和预期输出的示例。