Python中生成矩阵的Smith-Waterman算法_Python_Matrix_Alignment_Bioinformatics

Python中生成矩阵的Smith-Waterman算法

python matrix

Python中生成矩阵的Smith-Waterman算法,python,matrix,alignment,bioinformatics,Python,Matrix,Alignment,Bioinformatics,我正在使用Python生成一个动态编程矩阵以下是我目前掌握的情况： def score(base1,base2): base1=base1.upper() base2=base2.upper() if base1 not in 'ACTG' or base2 not in 'ACTG': print 'Not DNA base!' sys.exit() elif base1==base2: return 3

我正在使用Python生成一个动态编程矩阵

以下是我目前掌握的情况：

def score(base1,base2):
    base1=base1.upper()
    base2=base2.upper()
    if base1 not in 'ACTG' or base2 not in 'ACTG':
        print 'Not DNA base!'
        sys.exit()
    elif base1==base2:
        return 3
    elif base1+base2=='AG' or base1+base2=='GA':
        return -1
    elif base1+base2=='CT' or base1+base2=='TC':
        return -1
    else:
        return -2
import sys

seq1 = sys.argv[1]
seq2 = sys.argv[2]
mRows = len(seq1)
nCols = len(seq2)
gap = int(sys.argv[3])
matrix = []

# generate empty matrix
for x in range(mRows + 1):
    matrix.append([])
    for y in range(nCols + 1):
        matrix[x].append(0)

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1):
        dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])
        vscore = matrix[i-1][j] + gap
        hscore = matrix[i][j-1] + gap
        matrix[i][j]=max(0, vscore, hscore, dscore)

输入： sw.py ATGCAT账户-1

我得到矩阵输出：

0       0       0       0       0
0       0       0       0       0
0       0       0       0       3
0       0       0       0       2
0       0       0       0       1
0       0       0       0       0
0       0       0       0       3

通过一些故障排除，我可以看到在嵌套for循环中，只有使用最终值j（对于这个特定输入，4）的分数存储在矩阵中，即仅存储在最后一列

我的问题是，为什么会发生这种情况，我该如何应对？为什么for循环跳回，而不是继续变量分数

我的一些疑难解答：

for i in range(1, mRows + 1): 
    for j in range(1, nCols + 1):
        print 'this is i', i
        print 'this is j', j
        print 'seq1', seq1[i-1], 'seq2', seq2[j-1]
        dscore = matrix[i-1][j-1] + score(seq1[i-1], seq2[j-1])
        vscore = matrix[i-1][j] + gap
        hscore = matrix[i][j-1] + gap
        matrix[i][j]=max(0, vscore, hscore, dscore)
        print 'Vscore = ', vscore
        print 'Hscore = ', hscore
        print 'Dscore = ', dscore
        print '\n'

给出：

this is i 1
this is j 1
seq1 A seq2 A
this is i 1
this is j 2
seq1 A seq2 C
this is i 1
this is j 3
seq1 A seq2 C
this is i 1
this is j 4
seq1 A seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  -2


this is i 2
this is j 1
seq1 T seq2 A
this is i 2
this is j 2
seq1 T seq2 C
this is i 2
this is j 3
seq1 T seq2 C
this is i 2
this is j 4
seq1 T seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  3


this is i 3
this is j 1
seq1 G seq2 A
this is i 3
this is j 2
seq1 G seq2 C
this is i 3
this is j 3
seq1 G seq2 C
this is i 3
this is j 4
seq1 G seq2 T
Vscore =  2
Hscore =  -1
Dscore =  -2


this is i 4
this is j 1
seq1 C seq2 A
this is i 4
this is j 2
seq1 C seq2 C
this is i 4
this is j 3
seq1 C seq2 C
this is i 4
this is j 4
seq1 C seq2 T
Vscore =  1
Hscore =  -1
Dscore =  -1


this is i 5
this is j 1
seq1 A seq2 A
this is i 5
this is j 2
seq1 A seq2 C
this is i 5
this is j 3
seq1 A seq2 C
this is i 5
this is j 4
seq1 A seq2 T
Vscore =  0
Hscore =  -1
Dscore =  -2


this is i 6
this is j 1
seq1 T seq2 A
this is i 6
this is j 2
seq1 T seq2 C
this is i 6
this is j 3
seq1 T seq2 C
this is i 6
this is j 4
seq1 T seq2 T
Vscore =  -1
Hscore =  -1
Dscore =  3

谢谢

我为Smith-Waterman算法编写了一个类，您可能会发现它很有用。我建议在这个任务中使用numpy数组，因为对于越来越大的序列，它们通常比列表更有效

class LocalAligner(object):
#Constructor
def __init__(self, match=None, mismatch=None, gap=None, fileP=None, fileQ=None):
    #Default args
    #StringQ = GCTGGAAGGCAT
    #StringP = GCAGAGCACG
    #Match Score = +5, Mismatch Score = -4, Gap Penalty = -4
    if match is None and mismatch is None and gap is None and fileP is None and fileQ is None:
        #string1
        self.q = "GCTGGAAGGCAT"
        self.stringQName = "Sequence Q"

        #string2
        self.p = "GCAGAGCACG"
        self.stringPName = "Sequence P"

        #Scoring parameter
        self.gapPen = -4
        self.mismatchPen = -4
        self.matchScore = 5

    #User has given sequences and scoring arguments to the object
    elif match is not None and mismatch is not None and gap is not None and fileP is not None and fileQ is not None:
        #Default string name if one is not present in the file
        self.stringQName = "String Q"
        self.q = self.parseFile(fileQ, 1)

        #Default string name if one is not present in the file
        self.stringQName = "String P"
        self.p = self.parseFile(fileP, 2)

        #Scoring parameters given at the command line
        self.gapPen = int(gap)
        self.mismatchPen = int(mismatch)
        self.matchScore = int(match)


    #Final sequence alignments
    self.finalQ = ""
    self.finalP = ""

    #Create a table and initialize to zero
    #We will use numpy arrays as they are generally more efficient than lists for large amounts of data (ie sequences)
    self.MatrixA = np.empty(shape=[len(self.p)+1,len(self.q)+1])

    #Create b table
    self.MatrixB = np.empty(shape=[len(self.p)+1,len(self.q)+1])

    #Store max score and location
    self.maxScore = 0
    self.maxI = None
    self.maxJ =None

#Populates the A and B tables
#A table holds the scores and the B table holds the direction of the optimal solution for each sub problem
def calcTables(self):
    #insert initial blank string 1
    try:
        self.q = '-' + self.q
    except IOError:
        print("Error with sequence 1")

    #insert initial blank string 2
    try:
        self.p = '-' + self.p
    except IOError:
        print("Error with sequence 2")

    #Initialize row and column 0 for A and B tables
    self.MatrixA[:,0] = 0
    self.MatrixA[0,:] = 0
    self.MatrixB[:,0] = 0
    self.MatrixB[0,:] = 0

    for i in range(1,len(self.p)):
        for j in range(1, len(self.q)):

            #Look for match
            if self.p[i] == self.q[j]:
                #Match found
                self.MatrixA[i][j] = self.MatrixA[i-1][j-1] + self.matchScore
                #3 == "diagonal" for traversing solution
                self.MatrixB[i][j] = 3

                #Check for max score
                if self.MatrixA[i][j] > self.maxScore:
                    self.maxScore = self.MatrixA[i][j]
                    self.maxI = i
                    self.maxJ = j

            #Match not found
            else:
                self.MatrixA[i][j] = self.findMaxScore(i,j)


#Finds the maximum score either in the north or west neighbor in the A table
#Due to the ordering, gaps are checked first
def findMaxScore(self, i, j):

    #North score
    qDelet = self.MatrixA[i-1][j] + self.gapPen
    #West score
    pDelet = self.MatrixA[i][j-1] + self.gapPen
    #Diagonal Score
    mismatch = self.MatrixA[i-1][j-1] + self.mismatchPen

    #Determine the max score
    maxScore = max(qDelet, pDelet, mismatch)

    #Set B table
    if qDelet == maxScore:
        self.MatrixB[i][j] = 2 #2 == "up" for traversing solution

    elif pDelet == maxScore:
        self.MatrixB[i][j] = 1 #1 == "left" for traversing solution

    elif mismatch == maxScore:
        self.MatrixB[i][j] = 3 #3 == "diagonal" for traversing solution

    return maxScore

#Calculate the alignment with the highest score by tracing back the highest scoring local solution
#Integers:
#3 -> "DIAGONAL" -> match
#2 -> "UP" -> gap in string q
#1 -> "LEFT" -> gap in string p
#were used in the B table
def calcAlignemnt(self, i=None, j=None):

    #Default arguments to the maximum score in the A table
    if i is None and j is None:
        i = self.maxI
        j = self.maxJ

    #Base case, end of the local alignment
    if i == 0 or j == 0:
        return

    #Optimal solution "DIAGONAL"
    if self.MatrixB[i][j] == 3:
        self.calcAlignemnt(i-1 , j-1)
        self.finalQ += self.q[j]
        self.finalP += self.p[i]

    else:
        #Optimal solution "UP"
        if self.MatrixB[i][j] == 2:
            self.calcAlignemnt(i-1, j)
            self.finalQ += '-'
            self.finalP += self.p[i]

        else:
            #Optimal solution "LEFT"
            self.calcAlignemnt(i, j-1)
            self.finalP += '-'
            self.finalQ += self.p[j]

#Parse the input sequence file for string
#Assumes fasta format
def parseFile(self, filePath, stringNumber):
    #Empty sequence
    seq = ""
    #Parse the file
    with open(filePath) as f:
        for line in f:
            #Remove new line characters
            line = line.replace('\r',"")   #Windows
            line = line.replace('\n', "")

            #Header encountered
            if line.startswith(">"):
                if stringNumber == 2:
                    self.stringQName = line.replace('>',"")
                    continue
                elif stringNumber == 1:
                    self.stringPName = line.replace('>',"")
                    continue
                else:
                    continue

            #Append line
            seq += line
        f.close()
    return seq

谢谢不管是什么问题，我都会随意解决。你的解决方案肯定比我的更优雅：