Python 计算DNA序列中的三联体_Python_Count_Bioinformatics_Biopython

Python 计算DNA序列中的三联体

python

Python 计算DNA序列中的三联体,python,count,bioinformatics,biopython,Python,Count,Bioinformatics,Biopython,我想做一个代码，计算序列中的所有三元组。到目前为止，我已经读了很多帖子，但没有一篇能帮到我这是我的代码： def cnt(seq): mydict = {} if len(seq) % 3 == 0: a = [x for x in seq] for i in range(len(seq)//3): b = ''.join(a[(0+3*i):(3+3*i)]) for base1 in ['A',

我想做一个代码，计算序列中的所有三元组。到目前为止，我已经读了很多帖子，但没有一篇能帮到我

这是我的代码：

def cnt(seq):
    """Print the codons found in ``seq``, one ``codon: value`` line each.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. When its length is not a multiple of three, only "Error"
    is printed. Nothing is returned.
    """
    mydict = {}
    if len(seq) % 3 == 0:
        # Character list of the sequence; sliced three items at a time below.
        a = [x for x in seq]
        for i in range(len(seq)//3):
            # The i-th non-overlapping triplet.
            b = ''.join(a[(0+3*i):(3+3*i)])
            # Enumerate all 64 triplets over A/T/G/C; a triplet containing
            # any other character never matches and is silently skipped.
            for base1 in ['A', 'T', 'G', 'C']:
                for base2 in ['A', 'T', 'G', 'C']:
                    for base3 in ['A', 'T', 'G', 'C']:
                        triplet = base1 + base2 + base3
                        if b == triplet:
                            # NOTE: this assigns rather than increments, so a
                            # codon seen several times is still stored as 1.
                            mydict[b] = 1
        # Report in alphabetical order of the codons.
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")

Biopython是否提供了解决此问题的功能

编辑：

注意：例如，在序列“ATGAAG”中，“TGA”或“GAA”不是“有效的”三联体，只有“ATG”和“AAG”才是，因为在生物学和生物信息学中，我们是按“ATG”和“AAG”来读取序列的，这才是翻译它或做其他处理所需要的信息。

你可以把它想象成一串单词，例如“Hello world”。我们读它的方式是“Hello”和“world”，而不是“Hell”、“ello”、“llo w”……

你可以这样做：

from itertools import product

seq = 'ATGATG'

# Slide a three-base window over the sequence one position at a time;
# the last full window starts at index len(seq) - 3.
all_triplets = [seq[start:start + 3] for start in range(len(seq) - 2)]
# this gives ['ATG', 'TGA', 'GAT', 'ATG']

# add more valid_triplets here
valid_triplets = ['ATG']

# Number of windows that equal one of the valid triplets
# (the value is only displayed when run interactively).
len([window for wanted, window in product(valid_triplets, all_triplets) if wanted == window])

from itertools import product


seq='ATGATG'
all_triplets = [seq[i:i+3] for i in range(len(seq)) if i <= len(seq)-3]
目前尚不清楚预期的输出是什么。这里我们使用一个函数（下文代码中的 mit.windowed）来构建相邻的三元组，即“密码子”。
通过调用 len 计算密码子的数量：
len(triplets)
# 2


为了进行更详细的分析，可以考虑将问题分解为更小的函数：（1）提取密码子，（2）统计每个密码子的出现次数。
代码
import collections as ct


def split_codons(seq):
    """Yield the consecutive, non-overlapping codons of a sequence.

    Args:
        seq: Iterable of single-character strings, e.g. a DNA string.

    Yields:
        Each successive group of three items joined into one string.

    Raises:
        ValueError: When the sequence ends with a group of fewer than three
            characters. The error is raised lazily, after every complete
            codon before it has been yielded.
    """
    # islice-based chunking keeps this helper on the standard library only.
    from itertools import islice

    bases = iter(seq)
    while True:
        part = "".join(islice(bases, 3))
        if not part:
            # Input exhausted exactly on a codon boundary.
            return
        if len(part) < 3:
            raise ValueError(f"Sequence not divisible by 3.  Got extra '{part}'.")
        yield part


def count_codons(codons):
    """Tally codons and report each one's share of the total.

    Args:
        codons: Iterable of hashable codons; it is consumed exactly once.

    Returns:
        Dict mapping each distinct codon, in order of first appearance, to
        a ``(count, percentage)`` tuple, where the percentage is
        ``100 * count / total``.
    """
    # Feeding a generator (rather than ``codons`` itself) makes Counter
    # always iterate the items, even if a mapping is passed in.
    tally = ct.Counter(codon for codon in codons)
    total = sum(tally.values())
    return {codon: (seen, 100 * seen / total) for codon, seen in tally.items()}

我花了一段时间才明白，您不是想计算密码子的总数，而是要计算每个密码子的出现频率。您的标题在这方面有点误导。无论如何，您可以使用 collections.Counter 来完成这项任务：
from collections import Counter

def cnt(seq):
    """Print every codon of ``seq`` with its count and percentage share.

    The sequence is split into consecutive, non-overlapping triplets.
    One line per distinct codon is printed in sorted order. When the
    length of ``seq`` is not a multiple of three, only "Error" is printed.
    Nothing is returned.
    """
    if len(seq) % 3 != 0:
        print("Error")
        return

    # Triplets start at offsets 0, 3, 6, ...
    codon_freq = Counter(seq[start:start + 3] for start in range(0, len(seq), 3))
    # Total number of codons, i.e. len(seq) // 3.
    total = sum(codon_freq.values())
    for codon, count in sorted(codon_freq.items()):
        print("{}: {} = {:5.2f}%".format(codon, count, count * 100 / total))

# Example input: 21 bases, i.e. seven complete codons.
seq = "ATCGCAGAAATCCGCAGAATC"

# Prints one "codon: count = percent%" line per distinct codon.
cnt(seq)

样本输出：
AGA: 1 = 14.29%
ATC: 3 = 42.86%
CGC: 1 = 14.29%
GAA: 1 = 14.29%
GCA: 1 = 14.29%

正如其他答案中所建议的那样，您可以使用巧妙的技巧，但我将从您的代码出发构建一个解决方案，因为您的代码已经几乎可行了。您的问题在于：每次执行 mydict[b] = 1 时，您都会将 b 的计数重置为 1。
最小修复
您可以通过测试键是否存在来解决这个问题，如果不存在，则在dict中创建条目，然后增加值，但是python中有更方便的工具
对代码的最小更改是使用 defaultdict(int) 而不是 dict。每当遇到新键时，defaultdict(int) 都会为该键关联 int 的默认值 0。因此，您可以直接递增该值，而不是重置它：
from collections import defaultdict

def cnt(seq):
     """Print how many times each A/T/G/C codon occurs in ``seq``.

     The sequence is read as consecutive, non-overlapping groups of three
     characters. When its length is not a multiple of three, only "Error"
     is printed. Nothing is returned.
     """
     # instantiate a defaultdict that creates ints when necessary
     mydict = defaultdict(int)
     if len(seq) % 3 == 0:
         # Character list of the sequence; sliced three items at a time below.
         a = [x for x in seq]
         for i in range(len(seq)//3):
             # The i-th non-overlapping triplet.
             b = ''.join(a[(0+3*i):(3+3*i)])
             # Enumerate all 64 triplets over A/T/G/C; a triplet containing
             # any other character never matches and is not counted.
             for base1 in ['A', 'T', 'G', 'C']:
                 for base2 in ['A', 'T', 'G', 'C']:
                     for base3 in ['A', 'T', 'G', 'C']:
                         triplet = base1 + base2 + base3
                         if b == triplet:
                             # increment the existing count (or the default 0 value)
                             mydict[b] += 1
         # Report in alphabetical order of the codons.
         for key in sorted(mydict):
             print("%s: %s" % (key, mydict[key]))
     else:
         print("Error")

它可以根据需要工作：
cnt("ACTGGCACT")
ACT: 2
GGC: 1

一些可能的改进
现在让我们试着改进一下您的代码
首先，正如我在评论中所写，让我们避免不必要地将序列转换为列表，并为当前计数的密码子使用更好的变量名：
from collections import defaultdict

def cnt(seq):
    """Print how many times each A/T/G/C codon occurs in ``seq``.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. Triplets containing anything other than A, T, G or C are
    not counted. When the length of ``seq`` is not a multiple of three,
    only "Error" is printed. Nothing is returned.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        # The sequence is sliced directly: no intermediate list of
        # characters is needed to take three-base chunks.
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            # Compare against every possible A/T/G/C triplet.
            for base1 in ['A', 'T', 'G', 'C']:
                for base2 in ['A', 'T', 'G', 'C']:
                    for base3 in ['A', 'T', 'G', 'C']:
                        triplet = base1 + base2 + base3
                        if codon == triplet:
                            mydict[codon] += 1
        # Report in alphabetical order of the codons.
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")

现在，让我们通过提前生成一组可能的密码子来简化嵌套循环部分，尝试所有可能的密码子：
from collections import defaultdict
from itertools import product

# All 64 three-letter words over the DNA alphabet, built once so that
# checking a triplet is a single set-membership test.
codons = {"".join(bases) for bases in product("ACGT", repeat=3)}


def cnt(seq):
    """Print how many times each valid codon occurs in ``seq``.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. Triplets that are not in ``codons`` are silently ignored.
    When the length of ``seq`` is not a multiple of three, only "Error"
    is printed. Nothing is returned.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        # The sequence is sliced directly; no character list is needed.
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            if codon in codons:
                mydict[codon] += 1
        # Report in alphabetical order of the codons.
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")

现在，您的代码只是忽略了无效密码子的三元组。也许您应该发出警告：
from collections import defaultdict
from itertools import product

# All 64 three-letter words over the DNA alphabet, built once so that
# valid and invalid triplets can be told apart with set operations.
codons = {"".join(bases) for bases in product("ACGT", repeat=3)}


def cnt(seq):
    """Print the count of every valid codon, then warn about invalid triplets.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. One line is printed for each of the 64 codons (0 when
    absent), followed by a warning listing any triplets that are not valid
    codons. When the length of ``seq`` is not a multiple of three, only
    "Error" is printed. Nothing is returned.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        # The sequence is sliced directly; no character list is needed.
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            # We count even invalid triplets
            mydict[codon] += 1
        # We display counts only for valid triplets; looking up an absent
        # codon in the defaultdict gives 0.
        for codon in sorted(codons):
            print("%s: %s" % (codon, mydict[codon]))
        # We compute the set of invalid triplets:
        # the keys that are not codons.
        invalid = mydict.keys() - codons
        # An empty set has value False in a test.
        # We issue a warning if the set is not empty.
        if invalid:
            print("Warning! There are invalid triplets:")
            print(", ".join(sorted(invalid)))
    else:
        print("Error")

更奇特的解决方案
现在有一个更奇特的解决方案，使用 cytoolz（可能需要安装，因为它不是普通 Python 发行版的一部分；如果您使用的是 pip，请运行 pip3 install cytoolz）：
希望这能有所帮助。
1. 你知道你的 seq 是否与有效的三联体对齐，还是在开头有一两个“悬挂”的碱基？2. 像“ACTTTC”这样的有效序列呢？你希望统计所有可能的三联体（“ACT”、“CTT”、“TTT”和“TTC”），还是只统计“ACT”和“TTC”？3. 为什么“GAT”是无效的？
您不需要将 seq 转换为列表 a：如果 seq 是一个字符串，它的行为与列表类似，您可以直接执行 b = seq[(0+3*i):(3+3*i)]。此外，最好使用比 b 更有意义的变量名，如 codon 或类似的名称。
from collections import defaultdict
from itertools import product

# All 64 three-letter words over the DNA alphabet, built once so that
# checking a triplet is a single set-membership test.
codons = {"".join(bases) for bases in product("ACGT", repeat=3)}


def cnt(seq):
    """Print how many times each valid codon occurs in ``seq``.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. Triplets that are not in ``codons`` are silently ignored.
    When the length of ``seq`` is not a multiple of three, only "Error"
    is printed. Nothing is returned.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        # The sequence is sliced directly; no character list is needed.
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            if codon in codons:
                mydict[codon] += 1
        # Report in alphabetical order of the codons.
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")

from collections import defaultdict
from itertools import product

# All 64 three-letter words over the DNA alphabet, built once so that
# valid and invalid triplets can be told apart with set operations.
codons = {"".join(bases) for bases in product("ACGT", repeat=3)}


def cnt(seq):
    """Print the count of every valid codon, then warn about invalid triplets.

    The sequence is read as consecutive, non-overlapping groups of three
    characters. One line is printed for each of the 64 codons (0 when
    absent), followed by a warning listing any triplets that are not valid
    codons. When the length of ``seq`` is not a multiple of three, only
    "Error" is printed. Nothing is returned.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        # The sequence is sliced directly; no character list is needed.
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            # We count even invalid triplets
            mydict[codon] += 1
        # We display counts only for valid triplets; looking up an absent
        # codon in the defaultdict gives 0.
        for codon in sorted(codons):
            print("%s: %s" % (codon, mydict[codon]))
        # We compute the set of invalid triplets:
        # the keys that are not codons.
        invalid = mydict.keys() - codons
        # An empty set has value False in a test.
        # We issue a warning if the set is not empty.
        if invalid:
            print("Warning! There are invalid triplets:")
            print(", ".join(sorted(invalid)))
    else:
        print("Error")

from collections import Counter
from itertools import product, repeat
from cytoolz import groupby, keymap, partition 

# Shorthand for concatenating an iterable of strings into one string
CAT = "".join

# repeat("ACGT", 3) supplies the alphabet three times; the star unpacks
# those into three separate arguments for product
codons = {CAT(bases) for bases in product(*repeat("ACGT", 3))}

def cnt(seq):
    """Print counts of the valid codons in ``seq``, then warn about invalid ones.

    One ``codon: count`` line is printed, in sorted order, for each
    encountered triplet that is a member of ``codons``. Any other
    encountered triplets are listed after a warning line. Nothing is
    returned.
    """
    # Cut the sequence into tuples of three letters; per the original
    # author's note, pad="-" completes a short final triplet with "-",
    # which makes it an invalid one.
    triplet_tuples = partition(3, seq, pad="-")
    # Count the tuples, then turn each tuple key into a string.
    codon_counts = keymap(CAT, Counter(triplet_tuples))

    # Bucket the encountered triplets under True (valid) / False (invalid).
    by_validity = groupby(codons.__contains__, codon_counts.keys())

    # .get supplies an empty list when a bucket is missing.
    for codon in sorted(by_validity.get(True, [])):
        print("%s: %s" % (codon, codon_counts[codon]))

    invalids = by_validity.get(False, [])
    if invalids:
        print("Warning! There are invalid triplets:")
        print(", ".join(sorted(invalids)))