如何将NCBI数据库中的条目输出到R中的表中
谢谢你的阅读和帮助。如何将NCBI数据库中的条目输出到R中的表中,r,bioinformatics,R,Bioinformatics,谢谢你的阅读和帮助。 我从NCBI下载了一个genebank平面文件,其中包含许多条目。我想从每个基因中提取三个条目,并将它们组成一个表。如何实现?非常感谢您。--> 我的朋友用python为我写的: ================================================================================ import os import pandas as pd from tqdm import tqdm import sys def
我从NCBI下载了一个genebank平面文件,其中包含许多条目。我想从每个基因中提取三个条目,并将它们组成一个表。如何实现?非常感谢您。--> 我的朋友用python为我写的: ================================================================================
import os
import pandas as pd
from tqdm import tqdm
import sys
def search_line(gene_dict,gene_name,target,info,mode,l):
if '/{}='.format(target) in l:
if len(l.split('"')) == 3:
gene_dict[gene_name][mode].append('{} = '.format(target) + l.split('"')[1].strip('\n'))
keep_read = 0
info = []
else:
info = [l.split('"')[1].strip('\n')]
keep_read = target_list.index(target)
else:
if '"' in l:
info.append(l.strip().strip('"\n'))
if '{} = '.format(target) + ' '.join(info) not in gene_dict[gene_name][mode]:
gene_dict[gene_name][mode].append('{} = '.format(target) + ' '.join(info))
keep_read = 0
info = []
else:
info.append(l.strip())
keep_read = target_list.index(target)
return gene_dict,info,keep_read
def init_frame_dict(gene_dict,ids,mode):
frame_dict = {'gene': gene_dict[ids]['gene'], 'source': mode}
for target in target_list[1:]:
frame_dict[target] = ''
return frame_dict
def gen_frame(gene_dict,flat):
frame = []
for ids in gene_dict.keys():
for mode in gene_dict[ids].keys():
if mode not in extract_list:
continue
# print(mode)
data = gene_dict[ids][mode]
frame_dict = init_frame_dict(gene_dict, ids, mode)
for target_data in data:
for target in target_list[1:]:
if '{} = '.format(target) in target_data:
if frame_dict[target] != '':
frame.append(frame_dict)
# print(frame_dict)
frame_dict = init_frame_dict(gene_dict, ids, mode)
frame_dict[target] = target_data.split('{} = '.format(target))[1]
frame.append(frame_dict)
pd.DataFrame(frame).to_csv('{}.csv'.format(flat[:-5]))
def main():
for flat in os.listdir(path_root):
gene_dict = {}
if flat[-4:] != 'flat':
continue
with open (os.path.join(path_root,flat)) as f:
lines = f.read()
genes = lines.split('/gene=')
skip = False
for gene in tqdm(genes[1:]):
if skip:
break
lines = gene.split('\n')
gene_name = lines[0].split('"')[1]
#init paras
mode = 'init'
target = 'none'
read_mode = 0
info = []
#init dict
if gene_name not in gene_dict:
gene_dict[gene_name] = {'gene':gene_name,'mRNA':[],'ncRNA':[],'CDS':[],'misc_RNA':[],'exon':[],}
#proc lines
for l in lines:
if 'ORIGIN' in l:
skip = True
break
if ' mRNA' in l:
mode = 'mRNA'
elif ' ncRNA' in l:
mode = 'ncRNA'
elif ' CDS' in l:
mode = 'CDS'
elif ' misc_RNA' in l:
mode = 'misc_RNA'
elif ' exon' in l:
mode = 'exon'
# search_line(gene_dict, gene_name, target, info, mode, l)
if '/product=' in l and mode != 'init' or (target == 'product' and read_mode == target_list.index('product')):
target = 'product'
gene_dict,info,read_mode = search_line(gene_dict, gene_name, target, info, mode, l)
if '/protein_id=' in l and mode != 'init' or (target == 'protein_id' and read_mode == target_list.index('protein_id')):
target = 'protein_id'
gene_dict,info,read_mode = search_line(gene_dict, gene_name, target, info, mode, l)
if '/note=' in l and mode != 'init' or (target == 'note' and read_mode == target_list.index('note')):
target = 'note'
gene_dict,info,read_mode = search_line(gene_dict, gene_name, target, info, mode, l)
if '/transcript_id=' in l and mode != 'init' or (target == 'note' and read_mode == target_list.index('transcript_id')):
target = 'transcript_id'
gene_dict,info,read_mode = search_line(gene_dict, gene_name, target, info, mode, l)
gen_frame(gene_dict,flat)
if __name__ == '__main__':
target_list = ['none', 'product', 'transcript_id','protein_id','note']
extract_list = ['mRNA']
path_root = 'flats'
if not os.path.exists(path_root):
print('Please put your flat files in flats/ directory !')
sys.exit()
if len(os.listdir(path_root)) == 0:
print('No files found in flats/ directory.')
sys.exit()
main()
欢迎来到SO。图片不可复制,请考虑更新您的问题,包括输入/输出信息。