Python Needleman-Wunsch算法动态规划实现中的回溯

Python Needleman-Wunsch算法动态规划实现中的回溯,python,algorithm,dynamic-programming,bioinformatics,Python,Algorithm,Dynamic Programming,Bioinformatics,我几乎让我的needleman wunsch实现工作,但我对如何处理特定案例的回溯感到困惑 其思想是为了重新构建序列(最长路径),我们重新计算以确定得分来自的矩阵。我遇到的边缘情况是,右下角的分数不在匹配矩阵中,而是在插入列矩阵中(这意味着生成的追溯序列应该有一个插入) 这些序列是以a2m格式记录的,序列中的插入被记录为小写字符。因此在最终输出中,ZZ到AAAC的对齐应该是AAAC。当我用手回溯时,我以AAAC结束,因为我只访问了一次Ic矩阵。是我的白板。正如你所看到的,我有三个黑色箭头和一个绿

我几乎让我的needleman wunsch实现工作,但我对如何处理特定案例的回溯感到困惑

其思想是为了重新构建序列(最长路径),我们重新计算以确定得分来自的矩阵。我遇到的边缘情况是,右下角的分数不在匹配矩阵中,而是在插入列矩阵中(这意味着生成的追溯序列应该有一个插入)

这些序列是以a2m格式记录的,序列中的插入被记录为小写字符。因此在最终输出中,
ZZ
AAAC
的对齐应该是
AAAC
。当我用手回溯时,我以
AAAC
结束,因为我只访问了一次Ic矩阵。是我的白板。正如你所看到的,我有三个黑色箭头和一个绿色箭头,这就是为什么我的回溯给我
AAAc
。我应该数一数第一个单元格,然后停在位置1,1吗?我不确定如何改变我实现这一点的方式

请注意,此处使用的替换矩阵为BLOSUM62。递归关系为

M(i,j) = max(Ic(i-1,j-1)+subst, M(i-1,j-1)+subst, Ir(i-1,j-1)+subst)
Ic(i,j) = max(Ic(i,j-1)-extend, M(i,j-1)-open, Ir(i,j-1)-double)
Ir(i,j) = max(Ic(i-1,j)-double, M(i-1,j)-open, Ir(i-1,j)-extend)
编辑:这是重新编写为更干净的traceback_col_seq函数。请注意,score_cell现在返回thisM、thisC、thisR,而不是它们的最大值。此版本将对齐方式作为AaAc进行评分,仍然存在相同的问题,现在还有另一个问题,即为什么它在1,2处再次进入Ic。然而,此版本更清晰

def traceback_col_seq(self):
    i, j = self.maxI-1, self.maxJ-1
    self.traceback = list()
    matrixDict = {0:'M',1:'Ic',2:'Ir',3:'M',4:'Ic',5:'Ir',6:'M',7:'Ic',8:'Ir'}
    while i > 0 or j > 0:
        chars = self.col_seq[j-1] + self.row_seq[i-1]
        thisM, thisC, thisR = self.score_cell(i, j, chars)
        cell = thisM + thisC + thisR
        prevMatrix = matrixDict[cell.index(max(cell))]
        print(cell, prevMatrix,i,j)
        if prevMatrix == 'M':
            i -= 1; j -= 1
            self.traceback.append(self.col_seq[j])
        elif prevMatrix == 'Ic':
            j -= 1
            self.traceback.append(self.col_seq[j].lower())
        elif prevMatrix == 'Ir':
            i -= 1
            self.traceback.append('-')
    return ''.join(self.traceback[::-1])
下面是python类,它生成动态编程矩阵并追溯对齐方式。还有一个score函数用于检查生成的对齐方式是否正确

class global_aligner():
    def __init__(self, subst, open=12, extend=1, double=3):
        self.extend, self.open, self.double, self.subst = extend, open, double, subst
    def __call__(self, row_seq, col_seq):
        #add alphabet error checking?
        score_align(row_seq, col_seq)
        return traceback_col_seq()
    def init_array(self):
        """initialize three numpy arrays, set values in 1st column and row"""
        self.M = zeros((self.maxI, self.maxJ), float)
        self.Ic = zeros((self.maxI, self.maxJ), float)
        self.Ir = zeros((self.maxI, self.maxJ), float)
        for i in xrange(1,self.maxI):
            self.M[i][0], self.Ic[i][0], self.Ir[i][0] = \
                    -float('inf'), -float('inf'), -(self.open+self.extend*(i-1))
        for j in xrange(1,self.maxJ):
            self.M[0][j], self.Ir[0][j], self.Ic[0][j] = \
                    -float('inf'), -float('inf'), -(self.open+self.extend*(j-1))
        self.Ic[0][0] = self.Ir[0][0] = -float('inf')
    def score_cell(self, i, j, chars):
        """score a matrix cell based on the 9 total neighbors (3 each direction)"""
        thisM = [self.M[i-1][j-1]+self.subst[chars], self.Ic[i-1][j-1]+ \
                self.subst[chars], self.Ir[i-1][j-1]+self.subst[chars]]
        thisC = [self.M[i][j-1]-self.open, self.Ic[i][j-1]-self.extend, \
                        self.Ir[i][j-1]-self.double]
        thisR = [self.M[i-1][j]-self.open, self.Ic[i-1][j]-self.double, \
                self.Ir[i-1][j]-self.extend]
        return max(thisM), max(thisC), max(thisR)
    def score_align(self, row_seq, col_seq):
        """build dynamic programming matrices to align two sequences"""
        self.row_seq, self.col_seq = list(row_seq), list(col_seq)
        self.maxI, self.maxJ = len(self.row_seq)+1, len(self.col_seq)+1
        self.init_array() #initialize arrays
        for i in xrange(1, self.maxI):
            row_char = self.row_seq[i-1]
            for j in xrange(1, self.maxJ):
                chars = row_char+self.col_seq[j-1]
                self.M[i][j], self.Ic[i][j], self.Ir[i][j] = self.score_cell(i, j, chars)
    def traceback_col_seq(self):
        """trace back column sequence in matrices in a2m format"""
        i, j = self.maxI-1, self.maxJ-1
        self.traceback = list()
        #find which matrix to start in
        #THIS IS WHERE THE PROBLEM LIES I THINK
        cell = (self.M[i][j], self.Ic[i][j], self.Ir[i][j])
        prevMatrix = cell.index(max(cell))
        while i > 1 and j > 1:
            if prevMatrix == 0: #M matrix
                i -= 1; j -= 1 #step up diagonally
                prevChars = self.row_seq[i-1]+self.col_seq[j-1]
                diag = self.score_cell(i, j, prevChars) #re-score diagonal cell
                prevMatrix = diag.index(max(diag)) #determine which matrix that was
                self.traceback.append(self.col_seq[j])
            elif prevMatrix == 1: #Ic matrix
                j -= 1 
                prevChars = self.row_seq[i-1]+self.col_seq[j-1]
                left = self.score_cell(i, j, prevChars)
                prevMatrix = left.index(max(left))
                self.traceback.append(self.col_seq[j].lower())
            elif prevMatrix == 2: #Ir matrix
                i -= 1
                prevChars = self.row_seq[i-1]+self.col_seq[j-1]
                up = self.score_cell(i, j, prevChars)
                prevMatrix = up.index(max(up))
                self.traceback.append('-')
        for j in xrange(j,0,-1): #hit top of matrix before ending, add chars
            self.traceback.append(self.col_seq[j-1])
        for i in xrange(i,0,-1): #hit left of matrix before ending, add gaps
            self.traceback.append('-')
        print(''.join(self.row[::-1]))
        return ''.join(self.traceback[::-1])
    def score_a2m(self, s1, s2):
        """scores an a2m alignment of two sequences. I believe this function correctly
        scores alignments and is used to test my alignments. The value produced by this
        function should be the same as the largest value in the bottom right of the three
        matrices"""
        s1, s2 = list(s1.strip('.')), list(s2.strip('.'))
        s1_pos, s2_pos = len(s1)-1, len(s2)-1
        score, gap = 0, False
        while s1_pos >= 0 and s2_pos >= 0:
            if s1[s1_pos].islower() and gap is False:
                score -= self.open; s1_pos -= 1; gap = True
            elif s1[s1_pos].islower() and gap is True:
                score -= self.extend; s1_pos -= 1
            elif s2[s2_pos].islower() and gap is False:
                score -= self.open; s2_pos -= 1; gap = True
            elif s2[s2_pos].islower() and gap is True:
                score -= self.extend; s2_pos -= 1
            elif s1[s1_pos] == '-' and gap is False:
                score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
            elif s1[s1_pos] == '-' and gap is True:
                score -= self.extend; s1_pos -= 1; s2_pos -= 1
            elif s2[s2_pos] == '-' and gap is False:
                score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
            elif s2[s2_pos] == '-' and gap is True:
                score -= self.extend; s1_pos -= 1; s2_pos -= 1
            elif gap is True:
                score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
                s1_pos -= 1; s2_pos -= 1; gap = False
            else:
                score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
                s1_pos -= 1; s2_pos -= 1
        if s1_pos >= 0 and gap is True:
            score -= self.extend*s1_pos
        elif s1_pos >= 0 and gap is False:
            score -= self.open+s1_pos*self.extend
        if s2_pos >= 0 and gap is True:
            score -= self.extend*s2_pos
        elif s2_pos >= 0 and gap is False:
            score -= self.open+s2_pos*self.extend
        return score


test = global_aligner(blosumMatrix)
s1,s2 = 'ZZ','AAAC'
test.score_align(s1, s2)
align = test.traceback_col_seq()
print('This score: ', test.score_a2m(s1,align)
print('Correct score: ', test.score_a2m(s1,'AAac'))
Blosum解析器

def parse_blosum(blosumFile):
    blosumMatrix, commentFlag = dict(), False
    for line in blosumFile:
        if not line.startswith('#') and not commentFlag:
            alphabet = line.rstrip().split()
            commentFlag = True
        elif commentFlag:
            line = line.rstrip().split()
            thisChar, line = line[0], line[1:]
            for x in xrange(len(line)):
                alphaChar, thisValue = alphabet[x], line[x]
                blosumMatrix[thisChar+alphaChar] = int(thisValue)
    return blosumMatrix

正如您在关于箭头颜色的评论中提到的,基本问题是,第二个水平箭头被标记为M中的移动(黑色箭头),而实际上它是Ic中的移动(绿色箭头)。这似乎是因为
prevMatrix
变量指示(i-1,j-1),(i-1,j)处的最佳矩阵或者(i,j-1)。从正向角度看,这是前一个单元格中的矩阵。因为此时您正在执行trace*back*,所以您在跟踪中“来自”的单元格实际上是(i,j)-您当前所在的一个。它确定您移动的方向,从而确定您是否正在对齐间隙。您最好的选择可能是使用一个变量,在循环迭代之间跟踪当前矩阵,并使用此变量确定要附加到对齐字符串的字符。在e在每次迭代中,您可以在确定下一步要使用哪个单元格后更新它


我认为进行这些更改将澄清重新编写的traceback\u col\u sequence函数遇到的问题,但初步看来,您没有考虑通过追溯获得的信息。您正在重用
score\u cell()
,它看起来是为计算向前的分数而设计的。当您回溯时,您已经知道了结尾,因此您不想为上一个单元格选择最大分数。您想选择可能导致您当前所在矩阵中位置的最大分数。例如,如果您知道矩阵你在你的回溯中是M,然后你知道你没有通过扩大差距而到达那里,所以你不应该考虑那些选项。

谢谢你的回复。事实上,我从这篇文章中找到了答案。你是正确的,主要的问题是在什么时候回顾。重新使用相同的评分函数是很好的。当您在正确的时间查看单元格并添加正确的字母时。请参阅我的新答案,了解traceback_col_seq()函数的实现。感谢本文和我的另一篇文章中的帮助。啊,是的,好的,这很有意义。很高兴您找到了它!
def traceback_col_seq(self):
    """
    Traces back the column sequence to determine final global alignment.
    Recalculates the score using score_cell. 
    """
    i, j = self.maxI-1, self.maxJ-1
    self.traceback = list() #stores final sequence
    matrixDict = {0:'M',1:'Ic',2:'Ir'} #mapping between numeric value and matrix
    chars = self.col_seq[j-1] + self.row_seq[i-1] #store first characters
    thisM, thisC, thisR = self.score_cell(i,j, chars) 
    cell = max(thisM), max(thisC), max(thisR) #determine where to start
    prevMatrix = matrixDict[cell.index(max(cell))] #store where to go first
    while i > 0 or j > 0:
        #loop until the top left corner of the matrix is reached
        if prevMatrix == 'M':
            self.traceback.append(self.col_seq[j-1])
            i -= 1; j -= 1
            prevMatrix = matrixDict[thisM.index(max(thisM))]
        elif prevMatrix == 'Ic':
            self.traceback.append(self.col_seq[j-1].lower())
            j -= 1
            prevMatrix = matrixDict[thisC.index(max(thisC))]
        elif prevMatrix == 'Ir':
            self.traceback.append('-')
            i -= 1
            prevMatrix = matrixDict[thisR.index(max(thisR))]
        chars = self.col_seq[j-1] + self.row_seq[i-1] #store next characters
        thisM, thisC, thisR = self.score_cell(i,j, chars) #rescore next cell
    return ''.join(self.traceback[::-1])