Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/330.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 反转录_Python_Genetic - Fatal编程技术网

Python 反转录

Python 反转录,python,genetic,Python,Genetic,目的是在给定完整的mRNA序列和氨基酸序列的情况下,得到mRNA的编码序列。然后把所有这些都放在密码子格式中。 我觉得我已经找到了可能的密码子列表。我只是不知道如何系统地匹配给定的mRNA序列。这就是我到目前为止的情况 xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP' xmRNA = 'GUUCCCCG

目的是在给定完整的mRNA序列和氨基酸序列的情况下,得到mRNA的编码序列。然后把所有这些都放在密码子格式中。 我觉得我已经找到了可能的密码子列表。我只是不知道如何系统地匹配给定的mRNA序列。这就是我到目前为止的情况

import pandas  # pandas.DataFrame is used below but was never imported in the snippet

# Amino-acid sequence of the protein, one letter per residue.
xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'

# Complete mRNA sequence in which the coding region has to be located.
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'

# Codon table stored as two parallel lists: d['mRNA'][k] is paired with
# d['AA'][k]; '_' marks the stop codons.
# NOTE(review): 'AUA' (index 34) is paired with 'M' here, which is why the
# output for M lists ['AUA', 'AUG']. The answer further down this page changes
# that entry to 'I'. The data are left as posted so the printed output below
# still matches this code.
d = {'mRNA': ['UUU','UUC','UUA','UUG','UCU','UCC','UCA','UCG','UAU','UAC','UAA','UAG','UGU','UGC','UGA','UGG','CUU','CUC','CUA','CUG','CCU','CCC','CCA','CCG','CAU','CAC','CAA','CAG','CGU','CGC','CGA','CGG','AUU','AUC','AUA','AUG','ACU','ACC','ACA','ACG','AAU','AAC','AAA','AAG','AGU','AGC','AGA','AGG','GUU','GUC','GUA','GUG','GCU','GCC','GCA','GCG','GAU','GAC','GAA','GAG','GGU','GGC','GGA','GGG'], 'AA': ['F','F','L','L','S','S','S','S','Y','Y','_','_','C','C','_','W','L','L','L','L','P','P','P','P','H','H','Q','Q','R','R','R','R','I','I','M','M','T','T','T','T','N','N','K','K','S','S','R','R','V','V','V','V','A','A','A','A','D','D','E','E','G','G','G','G']}

# One row per codon, with columns 'mRNA' and 'AA'.
AA = pandas.DataFrame(data=d)
for i in xAA:
    # Every codon whose 'AA' entry equals the current residue.
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    # print(...) with a single argument produces the same text on Python 2
    # and Python 3, unlike the Python-2-only `print codons` statement.
    print(codons)
这是输出:

['AUA', 'AUG']
['GAU', 'GAC']
['UUU', 'UUC']
['UUU', 'UUC']
['GCU', 'GCC', 'GCA', 'GCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CCU', 'CCC', 'CCA', 'CCG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GUU', 'GUC', 'GUA', 'GUG']
['ACU', 'ACC', 'ACA', 'ACG']
['GAA', 'GAG']
['GAA', 'GAG']
['ACU', 'ACC', 'ACA', 'ACG']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GGU', 'GGC', 'GGA', 'GGG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAA', 'GAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GUU', 'GUC', 'GUA', 'GUG']
['GCU', 'GCC', 'GCA', 'GCG']
['AUA', 'AUG']
['AUU', 'AUC']
['AAA', 'AAG']
['GAA', 'GAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['GAU', 'GAC']
['ACU', 'ACC', 'ACA', 'ACG']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['AUU', 'AUC']
['CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG']
['CCU', 'CCC', 'CCA', 'CCG']
['ACU', 'ACC', 'ACA', 'ACG']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['GGU', 'GGC', 'GGA', 'GGG']
['GAU', 'GAC']
['GUU', 'GUC', 'GUA', 'GUG']
['AUU', 'AUC']
['UAU', 'UAC']
['AAA', 'AAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UUU', 'UUC']
['GAA', 'GAG']
['GAU', 'GAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['GUU', 'GUC', 'GUA', 'GUG']
['CAA', 'CAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['GGU', 'GGC', 'GGA', 'GGG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['ACU', 'ACC', 'ACA', 'ACG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UGU', 'UGC']
['CCU', 'CCC', 'CCA', 'CCG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['AUU', 'AUC']
['AUU', 'AUC']
['ACU', 'ACC', 'ACA', 'ACG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['AAA', 'AAG']
['AAU', 'AAC']
['GGU', 'GGC', 'GGA', 'GGG']
['AUU', 'AUC']
['CAA', 'CAG']
['AAU', 'AAC']
['AUA', 'AUG']
['UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG']
['CAA', 'CAG']
['UUU', 'UUC']
['UAU', 'UAC']
['AUU', 'AUC']
['CCU', 'CCC', 'CCA', 'CCG']
['GAA', 'GAG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['GGU', 'GGC', 'GGA', 'GGG']
['GUU', 'GUC', 'GUA', 'GUG']
['GAA', 'GAG']
['CAA', 'CAG']
['GUU', 'GUC', 'GUA', 'GUG']
['AUA', 'AUG']
['GAU', 'GAC']
['GAU', 'GAC']
['GAA', 'GAG']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['GAU', 'GAC']
['GAA', 'GAG']
['AAA', 'AAG']
['GAA', 'GAG']
['GCU', 'GCC', 'GCA', 'GCG']
['AAU', 'AAC']
['UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC']
['CCU', 'CCC', 'CCA', 'CCG']
如果我添加这里显示的for c循环,我得到

# Accumulates codons for the whole protein; starts empty.
codingseq = ""
for i in xAA:
    # Every codon that the table pairs with the current residue.
    codons = list(AA.mRNA.loc[AA['AA'] == i])
    for c in codons:
        # NOTE(review): the position returned by find() is discarded, so this
        # lookup in xmRNA has no influence on what is appended below.
        xmRNA.find(c)
        # All synonymous codons are appended, not one chosen codon per residue.
        codingseq+= c
这就给了每个组合一个比较分析的方法,来找出哪一个最像完整的mRNA序列

AUAAUG
AUAAUGGAUGAC
AUAAUGGAUGACUUUUUC
AUAAUGGAUGACUUUUUCUUUUUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUG
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUAC
AUAAUGGAUGACUUUUUCUUUUUCGCUGCCGCAGCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGUUAUUGCUUCUCCUACUGCCUCCCCCACCGUUAUUGCUUCUCCUACUGGUUGUCGUAGUGACUACCACAACGGAAGAGGAAGAGACUACCACAACGCCUCCCCCACCGUCUUCCUCAUCGAGUAGCGGUGGCGGAGGGGAAGAGGCUGCCGCAGCGGGUGGCGGAGGGUCUUCCUCAUCGAGUAGCGAAGAGGAAGAGGAUGACGAUGACGAAGAGGUUGUCGUAGUGGUUGUCGUAGUGGCUGCCGCAGCGAUAAUGAUUAUCAAAAAGGAAGAGUUAUUGCUUCUCCUACUGUUAUUGCUUCUCCUACUGGAUGACACUACCACAACGCGUCGCCGACGGAGAAGGAUUAUCCGUCGCCGACGGAGAAGGCCUCCCCCACCGACUACCACAACGGUUGUCGUAGUGCAACAGGAAGAGGAUGACGGUGGCGGAGGGGGUGGCGGAGGGGAUGACGUUGUCGUAGUGAUUAUCUAUUACAAAAAG
请注意,由于超出字符限制,这里并未显示全部结果。
这方面的任何帮助都会很棒

这段代码在xmRNA中找到一个与xAA匹配的密码子序列。注:d[“AA”]数据在索引34处进行了更正(“M”替换为“I”),以匹配所使用的翻译表。我没有使用pandas,只用了纯Python。我只做了一个简单的测试,尝试找出xAA在xmRNA中的位置(以及实际使用了哪些密码子)。即使是很长的序列,它也应该足够快(即使是100000个碱基的RNA也应该瞬间完成)。


这是一个相当紧凑的解决方案。创建反向词典时不会发生任何冲突,因此可以执行以下操作:

# Genetic code keyed by one-letter amino acid; '_' stands for the stop codons.
codon_dict = {'F': ('UUU', 'UUC'),
              'L': ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'),
              'S': ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'),
              'Y': ('UAU', 'UAC'),
              '_': ('UAA', 'UAG', 'UGA'),
              'C': ('UGU', 'UGC'),
              'W': ('UGG',),
              'P': ('CCU', 'CCC', 'CCA', 'CCG'),
              'H': ('CAU', 'CAC'),
              'Q': ('CAA', 'CAG'),
              'R': ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
              'I': ('AUU', 'AUC', 'AUA'),
              'M': ('AUG',),
              'T': ('ACU', 'ACC', 'ACA', 'ACG'),
              'N': ('AAU', 'AAC'),
              'K': ('AAA', 'AAG'),
              'V': ('GUU', 'GUC', 'GUA', 'GUG'),
              'A': ('GCU', 'GCC', 'GCA', 'GCG'),
              'D': ('GAU', 'GAC'),
              'E': ('GAA', 'GAG'),
              'G': ('GGU', 'GGC', 'GGA', 'GGG')
             }

# Reverse lookup: codon -> amino acid. Each codon appears under exactly one
# key above, so no entry is overwritten while inverting.
rna_dict = {}
for k, v in codon_dict.items():
    for val in v:
        rna_dict[val] = k

xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'


def translate_frame(rna, frame=0, table=rna_dict):
    """Translate `rna` codon by codon, starting at nucleotide offset `frame`.

    Only whole codons are read, so a trailing 1- or 2-nucleotide remainder is
    ignored instead of raising KeyError. A triplet that is missing from
    `table` is rendered as '?', which never matches a residue letter.

    Returns a list with one amino-acid letter per codon.
    """
    # A codon starting at i needs i + 3 <= len(rna), i.e. i < len(rna) - 2.
    stop = len(rna) - 2
    return [table.get(rna[i:i + 3], '?') for i in range(frame, stop, 3)]


def find_coding_start(rna, protein, table=rna_dict):
    """Return the 0-based nucleotide index in `rna` where `protein` is encoded.

    The three reading frames are tried in the order 0, 1, 2 and the first
    frame that contains a match wins. Returns -1 when no frame encodes
    `protein`.
    """
    for frame in range(3):
        hit = "".join(translate_frame(rna, frame, table)).find(protein)
        if hit != -1:
            # Convert the residue index back to a nucleotide index.
            return frame + 3 * hit
    return -1


# Frame-0 translation of the whole transcript, kept under its original name.
mapped = translate_frame(xmRNA)

xmRNA_index = find_coding_start(xmRNA, xAA)
print(xmRNA_index)

这将返回486,即与xAA对应的密码子序列在xmRNA中的起始位置(从0开始的核苷酸索引)。还可以加入对字符串的边界检查来进一步完善;如果您只需要最接近的匹配,可以对子字符串进行组合匹配(我仍然会先做反向映射,再在氨基酸空间中搜索,因为这样更快)。我也不清楚缺失字符或无法映射的三联体有多常见,因此可能需要针对这些情况做相应修改。

目标不是很明确。我的猜测是d['mRNA']和d['AA'](如果成对)代表密码子和AAs之间的映射(可能有多个密码子编码相同的AA)。xmRNA和xAA之间的关系还不清楚——是不是在xmRNA的某个地方有一个已知的密码子序列与整个xAA相匹配?(你只需要找出它在哪里)密码子排序重要吗?AUAAUG-GAUGAC有效,但GAUGAC-AUAAUG无效吗?@LeoK xAA是xmRNA翻译时产生的氨基酸序列(该过程是生物过程),但该过程已被映射,这意味着当我们看到一个mRNA序列时,我们可以将其按每3个碱基一组(密码子)划分,然后将其翻译为蛋白质。然而,在某些情况下,一个AA(氨基酸)可能有多个密码子。数据帧和for循环查看xmRNA并找到所有这些可能性。例如,第一个输出中的第1行显示AUA和AUG,这两个都生成M(蛋氨酸),但重要的是要知道哪一个是真正被使用的。@Saedeas 是的,从另一个角度看,顺序很重要。你在制造一种完全不同的蛋白质。最后一个问题。为了使后面的字符串相关,是否必须显示前面的字符串?我是在检查输出的每个可能的子序列,还是AUAAUG总是必须作为开始?也就是说,如果我发现一系列密码子出现在输出的末尾(比如一行中的最后20个密码子或类似的东西),这是否有意义?你可以用:
r2a=dict(zip(d['mRNA'],d['AA']))
@rioV8:实际上,我会更进一步,将repr(zip(d['mRNA'],d['AA']))的输出作为映射数据直接放在源代码中(即(密码子, AA)的元组),而不是两个数组。这将使什么映射到什么更清晰。
# Genetic code keyed by one-letter amino acid; '_' stands for the stop codons.
codon_dict = {'F': ('UUU', 'UUC'),
              'L': ('UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG'),
              'S': ('UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC'),
              'Y': ('UAU', 'UAC'),
              '_': ('UAA', 'UAG', 'UGA'),
              'C': ('UGU', 'UGC'),
              'W': ('UGG',),
              'P': ('CCU', 'CCC', 'CCA', 'CCG'),
              'H': ('CAU', 'CAC'),
              'Q': ('CAA', 'CAG'),
              'R': ('CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'),
              'I': ('AUU', 'AUC', 'AUA'),
              'M': ('AUG',),
              'T': ('ACU', 'ACC', 'ACA', 'ACG'),
              'N': ('AAU', 'AAC'),
              'K': ('AAA', 'AAG'),
              'V': ('GUU', 'GUC', 'GUA', 'GUG'),
              'A': ('GCU', 'GCC', 'GCA', 'GCG'),
              'D': ('GAU', 'GAC'),
              'E': ('GAA', 'GAG'),
              'G': ('GGU', 'GGC', 'GGA', 'GGG')
             }

# Reverse lookup: codon -> amino acid. Each codon appears under exactly one
# key above, so no entry is overwritten while inverting.
rna_dict = {}
for k, v in codon_dict.items():
    for val in v:
        rna_dict[val] = k

xAA = 'MDFFASGLPLVTEETPSGEAGSEEDDEVVAMIKELLDTRIRPTVQEDGGDVIYKGFEDGIVQLKLQGSCTSCPSSIITLKNGIQNMLQFYIPEVEGVEQVMDDESDEKEANSP'
xmRNA = 'GUUCCCCGGCCUCUCUUGGUCAGGGUGACGCAGUAGCCUGCAAACCUCGGCGCGUAGGCCACCGCACUUAUCCGCAGCAGGACCGCCCGCAGCCGGUAGGGUGGGCUCUUCCCAGUGCCCGCCCAGCUACCGGCCAGCCUGCGGCUGCGCAGAUCUUUCGUGGUUCUGUCAGGGAGACCCUUAGGCACUCCGGACUAAGAUGGCGGCGACGGCCAGGCGGGGCUGGGGAGCUGCGGCUGUUGCCGCCGGGCUGCGCAGGCGGUUCUGUCAUAUGUUGAAGAAUCCAUACACCAUUAAGAAACAGCCUCUGCAUCAGUUUGUACAAAGACCACUUUUCCCACUACCUGCAGCCUUUUAUCACCCAGGCAGUUAUUUAGGAUUGAAGGAGUAAAAAGUGUCUUCUUUGGACCAGAUUUCAUCACUGUCACAAAGGAAAAUGAAGAAUUAGACUGGAAUUUACUGAAACCAGAUAUUUAUGCAACAAUCAUGGACUUCUUUGCAUCUGGCUUACCCCUGGUUACUGAGGAAACACCUUCAGGAGAAGCAGGAUCUGAAGAAGAUGAUGAAGUUGUGGCAAUGAUUAAGGAAUUGUUAGAUACUAGAAUACGGCCAACUGUGCAGGAAGAUGGAGGGGAUGUAAUCUACAAAGGCUUUGAAGAUGGCAUUGUACAGCUGAAACUCCAGGGUUCUUGUACCAGCUGCCCUAGUUCAAUCAUUACUCUGAAAAAUGGAAUUCAGAACAUGCUGCAGUUUUAUAUUCCGGAGGUAGAAGGCGUAGAACAGGUUAUGGAUGAUGAAUCAGAUGAAAAAGAAGCAAACUCACCUUAAAAUAAUCUGGAUUUUCUUUGGGCAUAACAGUCAGACUUGUUGAUAAUAUAUAUCAAGUUUUUAUUAUUAAUAUGCUGAGGAACUUGAAGAUUAAUAAAAUAUGCUCUUCAGAGAAUGAUAUAUAAAA'


def translate_frame(rna, frame=0, table=rna_dict):
    """Translate `rna` codon by codon, starting at nucleotide offset `frame`.

    Only whole codons are read, so a trailing 1- or 2-nucleotide remainder is
    ignored instead of raising KeyError. A triplet that is missing from
    `table` is rendered as '?', which never matches a residue letter.

    Returns a list with one amino-acid letter per codon.
    """
    # A codon starting at i needs i + 3 <= len(rna), i.e. i < len(rna) - 2.
    stop = len(rna) - 2
    return [table.get(rna[i:i + 3], '?') for i in range(frame, stop, 3)]


def find_coding_start(rna, protein, table=rna_dict):
    """Return the 0-based nucleotide index in `rna` where `protein` is encoded.

    The three reading frames are tried in the order 0, 1, 2 and the first
    frame that contains a match wins. Returns -1 when no frame encodes
    `protein`.
    """
    for frame in range(3):
        hit = "".join(translate_frame(rna, frame, table)).find(protein)
        if hit != -1:
            # Convert the residue index back to a nucleotide index.
            return frame + 3 * hit
    return -1


# Frame-0 translation of the whole transcript, kept under its original name.
mapped = translate_frame(xmRNA)

xmRNA_index = find_coding_start(xmRNA, xAA)
print(xmRNA_index)