C# 用一组字符代替一个字符的序列比对算法

C# 用一组字符代替一个字符的序列比对算法,c#,algorithm,bioinformatics,sequence-alignment,C#,Algorithm,Bioinformatics,Sequence Alignment,摘要: 我从一些关于对齐算法的细节开始,最后,我问我的问题。如果您了解对齐算法,请从开头开始 假设我们有两个字符串,如: ACCGAATCGA ACCGGTATTAAC 有一些算法,如:或,将这两个序列对齐并创建一个矩阵。请查看以下部分中的结果: Smith-Waterman Matrix § § A C C G A A T C G A § 0 0 0 0 0 0 0 0 0 0 0 A 0

摘要:

我从一些关于对齐算法的细节开始,最后,我问我的问题。如果您了解对齐算法,请从开头开始

假设我们有两个字符串,如:

ACCGAATCGA
ACCGGTATTAAC
有一些算法,如 Smith-Waterman 或 Needleman-Wunsch,可以将这两个序列对齐并创建一个矩阵。请查看以下部分中的结果:

Smith-Waterman Matrix
§   §   A   C   C   G   A   A   T   C   G   A   
§   0   0   0   0   0   0   0   0   0   0   0   
A   0   4   0   0   0   4   4   0   0   0   4   
C   0   0   13  9   4   0   4   3   9   4   0   
C   0   0   9   22  17  12  7   3   12  7   4   
G   0   0   4   17  28  23  18  13  8   18  13  
G   0   0   0   12  23  28  23  18  13  14  18  
T   0   0   0   7   18  23  28  28  23  18  14  
A   0   4   0   2   13  22  27  28  28  23  22  
T   0   0   3   0   8   17  22  32  27  26  23  
T   0   0   0   2   3   12  17  27  31  26  26  
A   0   4   0   0   2   7   16  22  27  31  30  
A   0   4   4   0   0   6   11  17  22  27  35  
C   0   0   13  13  8   3   6   12  26  22  30  

Optimal Alignments
A   C   C   G   A   -   A   T   C   G   A   
A   C   C   G   G   A   A   T   T   A   A   
问题:

我的问题很简单,但答案可能不像看上去那么简单。我想将一组字符用作单个字符,如:
[A0][C0][A1][B1]
。但在这些算法中,我们必须使用单个字符。我们如何才能做到这一点

P.P.S. 假设我们有这样的序列:`#read #write #add #write`。然后我把它转换成这样:`#read` 转换为 A,`#write` 转换为 B,`#add` 转换为 C。然后我的序列变成:
ABCB
。但是我有很多以 `#` 开头的不同单词,而且 ASCII 表不足以编码所有这些单词,所以我需要更多的字符。唯一的方法是使用类似于
[A0]。。。[Z9]
用于每个单词。或者使用数字

附言:史密斯·沃特曼的一些示例代码存在于本文档中

旁白:有人想要这样的东西,但我想要的是不同的。在这个问题中,我们有一组字符,以
[
开头,以
]
结尾。无需使用语义,如
ee
等于
i

我改编了(以 GPL 第 3 版许可的)Smith-Waterman 和 Needleman-Wunsch 算法,以支持由多字符组构成的序列:

#This software is a free software. Thus, it is licensed under GNU General Public License.
#Python implementation to Smith-Waterman Algorithm for Homework 1 of Bioinformatics class.
#Forrest Bao, Sept. 26 <http://fsbao.net> <forrest.bao aT gmail.com>

# zeros() was originally from NumPy.
# This version is implemented by alevchuk 2011-04-10
def zeros(shape):
    """Build a two-dimensional list filled with integer zeros.

    ``shape`` is indexed as ``shape[0]`` (number of rows) and ``shape[1]``
    (number of columns).  Each row is its own list object, so assigning to
    a cell in one row never changes another row.
    """
    return [[0 for _col in range(shape[1])] for _row in range(shape[0])]

# Scoring scheme shared by the global (needle) and local (water) aligners.
# A "symbol" is one whole group of characters, e.g. '[A0]'.
match_award      = 10
mismatch_penalty = -5
gap_penalty      = -5 # both for opening and extending
# Placeholder symbol inserted into an alignment where one sequence has no
# counterpart for the other sequence's symbol.
gap = '----' # should be as long as your group of characters
# Blank symbol used for non-matching columns of the match line.
space = '    ' # should be as long as your group of characters

def match_score(alpha, beta):
    """Score one aligned column made of the symbols ``alpha`` and ``beta``.

    Equal symbols earn ``match_award`` (this check comes first, so two gap
    symbols also count as a match).  Otherwise a column containing the gap
    symbol costs ``gap_penalty`` and any other pair costs
    ``mismatch_penalty``.
    """
    if alpha == beta:
        return match_award
    involves_gap = alpha == gap or beta == gap
    return gap_penalty if involves_gap else mismatch_penalty

def finalize(align1, align2):
    """Print identity, score and the two aligned sequences.

    ``align1`` and ``align2`` are equal-length lists of symbols in
    traceback order (last column first); they are reversed here before
    being scored and printed.  Nothing is returned.

    An empty alignment is reported with an identity of 0 percent instead
    of raising ``ZeroDivisionError``.
    """
    align1 = align1[::-1]    # traceback order -> reading order
    align2 = align2[::-1]

    score = 0
    identity = 0
    for left, right in zip(align1, align2):
        if left == right:
            # identical symbols count towards the identity percentage
            identity += 1
            score += match_score(left, right)
        elif left == gap or right == gap:
            # exactly one side is a gap
            score += gap_penalty
        else:
            # two different real symbols
            score += match_score(left, right)

    # Guard the division: a local alignment can legitimately be empty.
    if align1:
        identity = float(identity) / len(align1) * 100
    else:
        identity = 0.0

    # Each print receives a single pre-formatted string, so the output is
    # the same whether ``print`` is a statement (Python 2) or a function.
    print('Identity = %3.3f percent' % identity)
    print('Score = %s' % (score,))
    print(''.join(align1))
    print(''.join(align2))


def needle(seq1, seq2):
    """Globally align ``seq1`` with ``seq2`` (Needleman-Wunsch) and report it.

    Both arguments are indexable sequences of symbols.  The alignment is
    collected in traceback order and handed to ``finalize``, which prints
    the result; nothing is returned.
    """
    rows, cols = len(seq1), len(seq2)

    # DP table with one extra row/column for the empty prefix.
    table = zeros((rows + 1, cols + 1))

    # Aligning a prefix against nothing costs one gap per symbol.
    for r in range(rows + 1):
        table[r][0] = gap_penalty * r
    for c in range(cols + 1):
        table[0][c] = gap_penalty * c

    # Fill the interior: best of diagonal (pair the two symbols),
    # vertical (seq1 symbol against a gap) and horizontal (seq2 symbol
    # against a gap).
    for r in range(1, rows + 1):
        for c in range(1, cols + 1):
            table[r][c] = max(
                table[r - 1][c - 1] + match_score(seq1[r - 1], seq2[c - 1]),
                table[r - 1][c] + gap_penalty,
                table[r][c - 1] + gap_penalty,
            )

    # Walk back from the bottom-right cell until an edge is reached.
    # Ties are resolved in the order: diagonal, then (r - 1, c), then
    # (r, c - 1).
    top, bottom = [], []
    r, c = rows, cols
    while r > 0 and c > 0:
        here = table[r][c]
        if here == table[r - 1][c - 1] + match_score(seq1[r - 1], seq2[c - 1]):
            top.append(seq1[r - 1])
            bottom.append(seq2[c - 1])
            r -= 1
            c -= 1
        elif here == table[r - 1][c] + gap_penalty:
            top.append(seq1[r - 1])
            bottom.append(gap)
            r -= 1
        elif here == table[r][c - 1] + gap_penalty:
            top.append(gap)
            bottom.append(seq2[c - 1])
            c -= 1

    # Whatever is left of either sequence is aligned against gaps.
    for k in range(r, 0, -1):
        top.append(seq1[k - 1])
        bottom.append(gap)
    for k in range(c, 0, -1):
        top.append(gap)
        bottom.append(seq2[k - 1])

    finalize(top, bottom)

def water(seq1, seq2):
    """Locally align ``seq1`` with ``seq2`` (Smith-Waterman) and report it.

    Both arguments are indexable sequences of symbols.  The best-scoring
    local alignment is collected in traceback order and handed to
    ``finalize``, which prints the result; nothing is returned.

    Pointer codes stored per cell:
        0 -- stop (the cell's score is zero)
        1 -- step to (i - 1, j): seq1[i - 1] is aligned against a gap
        2 -- step to (i, j - 1): seq2[j - 1] is aligned against a gap
        3 -- step to (i - 1, j - 1): the two symbols are paired
    """
    m, n = len(seq1), len(seq2)  # length of two sequences

    # Generate DP table and traceback path pointer matrix
    score = zeros((m+1, n+1))      # the DP table
    pointer = zeros((m+1, n+1))    # to store the traceback path

    max_score = 0        # initial maximum score in DP table
    # Start at the origin so that an empty input sequence (the loops below
    # never run) yields an empty alignment instead of an unbound name.
    max_i, max_j = 0, 0

    # Calculate DP table and mark pointers
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            score_diagonal = score[i-1][j-1] + match_score(seq1[i-1], seq2[j-1])
            score_up = score[i][j-1] + gap_penalty
            score_left = score[i-1][j] + gap_penalty
            best = max(0, score_left, score_up, score_diagonal)
            score[i][j] = best

            # A zero score always ends the path, even when one of the
            # candidate moves also happens to evaluate to zero; otherwise
            # the traceback would run through zero-score cells.  For
            # positive scores the preference is diagonal, then code 2,
            # then code 1.
            if best == 0:
                pointer[i][j] = 0
            elif best == score_diagonal:
                pointer[i][j] = 3
            elif best == score_up:
                pointer[i][j] = 2
            else:
                pointer[i][j] = 1

            # ">=" keeps the last cell that reaches the maximum.
            if best >= max_score:
                max_i = i
                max_j = j
                max_score = best

    align1, align2 = [], []    # initial sequences

    i,j = max_i,max_j    # indices of path starting point

    #traceback, follow pointers
    while pointer[i][j] != 0:
        if pointer[i][j] == 3:
            align1.append(seq1[i-1])
            align2.append(seq2[j-1])
            i -= 1
            j -= 1
        elif pointer[i][j] == 2:
            align1.append(gap)
            align2.append(seq2[j-1])
            j -= 1
        elif pointer[i][j] == 1:
            align1.append(seq1[i-1])
            align2.append(gap)
            i -= 1

    finalize(align1, align2)
我们得到这个输出:

Needleman-Wunsch
Identity = 60.000 percent
Score = 20
[A0][C0][A1][B1]----
[A0]----[A1][B1][C1]

Smith-Waterman
Identity = 75.000 percent
Score = 25
[A0][C0][A1][B1]
[A0]----[A1][B1]

有关我所做的具体更改,请参见:。

假设我们有一个按字母顺序排列的日志文件。正如你所说的,我将序列转换为
A0A1…
。例如,如果有一个类似于
#read#write#add#write
的序列,它将转换为
A0A1A2A1
。每次,我读两个字符并比较它们,但像以前一样保留分数矩阵。这是我用C#编写的字符串对齐代码。
请注意,
Cell
是一个用户定义的类

// Aligns every pair (i, j) with i < j of the sequences exposed by logFile
// and stores DynamicProgramming.intMaxScore for that pair in
// scoreMatrix[i, j].  Only the upper triangle of scoreMatrix is written.
private void alignment()
{
    scoreMatrix = new int[Log.Length, Log.Length];

    // Buffers that receive the aligned characters of the current pair,
    // in traceback (reversed) order.
    List<char> SeqAlign1 = new List<char>();
    List<char> SeqAlign2 = new List<char>();

    for (int i = 0; i < Log.Length; i++)
    {
        for (int j = i + 1; j < Log.Length; j++)
        {
            // Traceback_Step only appends, so the buffers must be emptied
            // here; otherwise every pair's alignment would be tacked on to
            // the alignments of all previous pairs.
            SeqAlign1.Clear();
            SeqAlign2.Clear();

            // Prepend one two-character placeholder group.  The matrix is
            // sized as Length / 2, so this group occupies index 0.
            string strSeq1 = "--" + logFile.Sequence(i);
            string strSeq2 = "--" + logFile.Sequence(j);

            // Prepare the matrix for computing the optimal alignment.
            Cell[,] Matrix = DynamicProgramming.Intialization_Step(strSeq1, strSeq2, intSim, intNonsim, intGap);

            // Trace back from the cell that holds the maximum score.
            DynamicProgramming.Traceback_Step(Matrix, strSeq1, strSeq2, SeqAlign1, SeqAlign2);

            this.scoreMatrix[i, j] = DynamicProgramming.intMaxScore;

            // The buffers are in traceback order; reversing gives the
            // alignment in reading order.
            // NOTE(review): these strings are not consumed in the visible code.
            string strTemp1 = Reverse(string.Join("", SeqAlign1));
            string strTemp2 = Reverse(string.Join("", SeqAlign2));
        }
    }
}

// Local alignment over sequences whose symbols are groups of two
// characters: symbol k of a string occupies characters 2k and 2k + 1.
//
// NOTE(review): the static members MaxScore (a Cell) and intMaxScore (an
// int) are used below but their declarations are not visible here.
class DynamicProgramming
{
    // Builds and fills the score matrix for Seq1 (columns) and Seq2 (rows).
    // Sim / NonSimilar are added on the diagonal for equal / different
    // symbols and Gap is added for a vertical or horizontal move.
    public  static Cell[,] Intialization_Step(string Seq1, string Seq2,int Sim,int NonSimilar,int Gap)
    {
        int M = Seq1.Length / 2;   // number of two-character symbols in Seq1
        int N = Seq2.Length / 2;   // number of two-character symbols in Seq2

        Cell[,] Matrix = new Cell[N, M];

        // Start every alignment from a fresh zero-score cell so that a
        // maximum left over from an earlier call cannot be mistaken for a
        // cell of this matrix.
        MaxScore = new Cell(0, 0, 0);

        // Nothing to fill when either sequence has no complete symbol.
        if (M == 0 || N == 0)
        {
            return Matrix;
        }

        // First row: score zero.
        for (int i = 0; i < M; i++)
        {
            Matrix[0, i] = new Cell(0, i, 0);
        }

        // First column: score zero.
        for (int i = 0; i < N; i++)
        {
            Matrix[i, 0] = new Cell(i, 0, 0);
        }

        // Every remaining cell takes the result of Get_Max.
        for (int j = 1; j < N; j++)
        {
            for (int i = 1; i < M; i++)
            {
                Matrix[j, i] = Get_Max(i, j, Seq1, Seq2, Matrix, Sim, NonSimilar, Gap);
            }
        }

        return Matrix;
    }

    // Computes the cell at row j, column i from its three neighbours and
    // records it in MaxScore when its score is at least the current maximum.
    public  static Cell Get_Max(int i, int j, string Seq1, string Seq2, Cell[,] Matrix,int Similar,int NonSimilar,int GapPenality)
    {
        // Compare the two characters of each symbol individually.  Adding
        // the chars (char + char is an int sum) would treat different
        // groups with the same sum, e.g. "A1" and "B0", as equal.
        bool sameSymbol = Seq1[i * 2] == Seq2[j * 2]
                       && Seq1[i * 2 + 1] == Seq2[j * 2 + 1];

        int intDiagonal_score = Matrix[j - 1, i - 1].CellScore + (sameSymbol ? Similar : NonSimilar);

        // Gap scores.
        int intUp_Score = Matrix[j - 1, i].CellScore + GapPenality;
        int intLeft_Score = Matrix[j, i - 1].CellScore + GapPenality;

        // No positive candidate: the cell scores zero and has no predecessor.
        if (intDiagonal_score <= 0 && intUp_Score <= 0 && intLeft_Score <= 0)
        {
            return new Cell(j, i, 0);
        }

        // Pick the best candidate (ties: diagonal, then above, then left)
        // and store that candidate's own score in the cell.
        Cell Temp;
        if (intDiagonal_score >= intUp_Score && intDiagonal_score >= intLeft_Score)
        {
            Temp = new Cell(j, i, intDiagonal_score, Matrix[j - 1, i - 1], Cell.PrevcellType.Diagonal);
        }
        else if (intUp_Score >= intLeft_Score)
        {
            Temp = new Cell(j, i, intUp_Score, Matrix[j - 1, i], Cell.PrevcellType.Above);
        }
        else
        {
            Temp = new Cell(j, i, intLeft_Score, Matrix[j, i - 1], Cell.PrevcellType.Left);
        }

        // "<=" keeps the last cell that reaches the maximum.
        if (MaxScore.CellScore <= Temp.CellScore)
        {
            MaxScore = Temp;
        }

        return Temp;
    }

    // Publishes the maximum score in intMaxScore and follows the cell
    // pointers from MaxScore, appending the aligned characters to Seq1 and
    // Seq2 in reverse order (second character of each symbol first), so
    // that reversing the joined lists yields the alignment.  MaxScore is
    // advanced along the path and ends on the cell without a predecessor.
    public static void Traceback_Step(Cell[,] Matrix, string Sq1, string Sq2, List<char> Seq1, List<char> Seq2)
    {
        intMaxScore = MaxScore.CellScore;

        while (MaxScore.CellPointer != null)
        {
            int col = MaxScore.CellColumn * 2;   // first character of the Sq1 symbol
            int row = MaxScore.CellRow * 2;      // first character of the Sq2 symbol

            switch (MaxScore.Type)
            {
                case Cell.PrevcellType.Diagonal:
                    Seq1.Add(Sq1[col + 1]);
                    Seq1.Add(Sq1[col]);
                    Seq2.Add(Sq2[row + 1]);
                    Seq2.Add(Sq2[row]);
                    break;

                case Cell.PrevcellType.Left:
                    Seq1.Add(Sq1[col + 1]);
                    Seq1.Add(Sq1[col]);
                    // A gap is as wide as a symbol (two characters) so the
                    // two output strings stay column-aligned.
                    Seq2.Add('-');
                    Seq2.Add('-');
                    break;

                case Cell.PrevcellType.Above:
                    Seq1.Add('-');
                    Seq1.Add('-');
                    Seq2.Add(Sq2[row + 1]);
                    Seq2.Add(Sq2[row]);
                    break;
            }

            MaxScore = MaxScore.CellPointer;
        }
    }
}
private void alignment()
{
string strSeq1;
string strSeq2;
string strTemp1;
string strTemp2;
scoreMatrix = new int[Log.Length, Log.Length];
// Lists That Holds Alignments
List<char> SeqAlign1 = new List<char>();
List<char> SeqAlign2 = new List<char>();

对于(int i=0;iI)我发现很难理解这个问题。也许你可以试着描述一下为什么要引入这些分组字符。市场上几乎所有的序列比对工具都专注于生物序列(核苷酸或肽).然而,在我的例子中,序列由数百个不同的元素组成,它们不能被编码为ASCII字符串。多亏了:)这些算法不限于ascii字符串。你可以将它们用于任意字母表。你只需要用合适的字母表来表示你的输入序列。假设我们有这个序列:#读#写#加。然后我把它转换成这样的东西:#读到A…#写到B…#加到C。然后我的序列变成:ABC。但我有很多如果是以
#
开头的不同单词,并且ASCII表不足以转换所有单词。那么我需要更多字符。唯一的方法是使用类似
[A0]…[Z9]的内容
对于每个单词。或者使用数字。谢谢亲爱的,但我想要C#语法。理解python语法需要时间。无论如何谢谢你。我希望你能多给我一点。
// Aligns every pair (i, j) with i < j of the sequences exposed by logFile
// and stores DynamicProgramming.intMaxScore for that pair in
// scoreMatrix[i, j].  Only the upper triangle of scoreMatrix is written.
private void alignment()
{
    scoreMatrix = new int[Log.Length, Log.Length];

    // Buffers that receive the aligned characters of the current pair,
    // in traceback (reversed) order.
    List<char> SeqAlign1 = new List<char>();
    List<char> SeqAlign2 = new List<char>();

    for (int i = 0; i < Log.Length; i++)
    {
        for (int j = i + 1; j < Log.Length; j++)
        {
            // Traceback_Step only appends, so the buffers must be emptied
            // here; otherwise every pair's alignment would be tacked on to
            // the alignments of all previous pairs.
            SeqAlign1.Clear();
            SeqAlign2.Clear();

            // Prepend one two-character placeholder group.  The matrix is
            // sized as Length / 2, so this group occupies index 0.
            string strSeq1 = "--" + logFile.Sequence(i);
            string strSeq2 = "--" + logFile.Sequence(j);

            // Prepare the matrix for computing the optimal alignment.
            Cell[,] Matrix = DynamicProgramming.Intialization_Step(strSeq1, strSeq2, intSim, intNonsim, intGap);

            // Trace back from the cell that holds the maximum score.
            DynamicProgramming.Traceback_Step(Matrix, strSeq1, strSeq2, SeqAlign1, SeqAlign2);

            this.scoreMatrix[i, j] = DynamicProgramming.intMaxScore;

            // The buffers are in traceback order; reversing gives the
            // alignment in reading order.
            // NOTE(review): these strings are not consumed in the visible code.
            string strTemp1 = Reverse(string.Join("", SeqAlign1));
            string strTemp2 = Reverse(string.Join("", SeqAlign2));
        }
    }
}

// Local alignment over sequences whose symbols are groups of two
// characters: symbol k of a string occupies characters 2k and 2k + 1.
//
// NOTE(review): the static members MaxScore (a Cell) and intMaxScore (an
// int) are used below but their declarations are not visible here.
class DynamicProgramming
{
    // Builds and fills the score matrix for Seq1 (columns) and Seq2 (rows).
    // Sim / NonSimilar are added on the diagonal for equal / different
    // symbols and Gap is added for a vertical or horizontal move.
    public  static Cell[,] Intialization_Step(string Seq1, string Seq2,int Sim,int NonSimilar,int Gap)
    {
        int M = Seq1.Length / 2;   // number of two-character symbols in Seq1
        int N = Seq2.Length / 2;   // number of two-character symbols in Seq2

        Cell[,] Matrix = new Cell[N, M];

        // Start every alignment from a fresh zero-score cell so that a
        // maximum left over from an earlier call cannot be mistaken for a
        // cell of this matrix.
        MaxScore = new Cell(0, 0, 0);

        // Nothing to fill when either sequence has no complete symbol.
        if (M == 0 || N == 0)
        {
            return Matrix;
        }

        // First row: score zero.
        for (int i = 0; i < M; i++)
        {
            Matrix[0, i] = new Cell(0, i, 0);
        }

        // First column: score zero.
        for (int i = 0; i < N; i++)
        {
            Matrix[i, 0] = new Cell(i, 0, 0);
        }

        // Every remaining cell takes the result of Get_Max.
        for (int j = 1; j < N; j++)
        {
            for (int i = 1; i < M; i++)
            {
                Matrix[j, i] = Get_Max(i, j, Seq1, Seq2, Matrix, Sim, NonSimilar, Gap);
            }
        }

        return Matrix;
    }

    // Computes the cell at row j, column i from its three neighbours and
    // records it in MaxScore when its score is at least the current maximum.
    public  static Cell Get_Max(int i, int j, string Seq1, string Seq2, Cell[,] Matrix,int Similar,int NonSimilar,int GapPenality)
    {
        // Compare the two characters of each symbol individually.  Adding
        // the chars (char + char is an int sum) would treat different
        // groups with the same sum, e.g. "A1" and "B0", as equal.
        bool sameSymbol = Seq1[i * 2] == Seq2[j * 2]
                       && Seq1[i * 2 + 1] == Seq2[j * 2 + 1];

        int intDiagonal_score = Matrix[j - 1, i - 1].CellScore + (sameSymbol ? Similar : NonSimilar);

        // Gap scores.
        int intUp_Score = Matrix[j - 1, i].CellScore + GapPenality;
        int intLeft_Score = Matrix[j, i - 1].CellScore + GapPenality;

        // No positive candidate: the cell scores zero and has no predecessor.
        if (intDiagonal_score <= 0 && intUp_Score <= 0 && intLeft_Score <= 0)
        {
            return new Cell(j, i, 0);
        }

        // Pick the best candidate (ties: diagonal, then above, then left)
        // and store that candidate's own score in the cell.
        Cell Temp;
        if (intDiagonal_score >= intUp_Score && intDiagonal_score >= intLeft_Score)
        {
            Temp = new Cell(j, i, intDiagonal_score, Matrix[j - 1, i - 1], Cell.PrevcellType.Diagonal);
        }
        else if (intUp_Score >= intLeft_Score)
        {
            Temp = new Cell(j, i, intUp_Score, Matrix[j - 1, i], Cell.PrevcellType.Above);
        }
        else
        {
            Temp = new Cell(j, i, intLeft_Score, Matrix[j, i - 1], Cell.PrevcellType.Left);
        }

        // "<=" keeps the last cell that reaches the maximum.
        if (MaxScore.CellScore <= Temp.CellScore)
        {
            MaxScore = Temp;
        }

        return Temp;
    }

    // Publishes the maximum score in intMaxScore and follows the cell
    // pointers from MaxScore, appending the aligned characters to Seq1 and
    // Seq2 in reverse order (second character of each symbol first), so
    // that reversing the joined lists yields the alignment.  MaxScore is
    // advanced along the path and ends on the cell without a predecessor.
    public static void Traceback_Step(Cell[,] Matrix, string Sq1, string Sq2, List<char> Seq1, List<char> Seq2)
    {
        intMaxScore = MaxScore.CellScore;

        while (MaxScore.CellPointer != null)
        {
            int col = MaxScore.CellColumn * 2;   // first character of the Sq1 symbol
            int row = MaxScore.CellRow * 2;      // first character of the Sq2 symbol

            switch (MaxScore.Type)
            {
                case Cell.PrevcellType.Diagonal:
                    Seq1.Add(Sq1[col + 1]);
                    Seq1.Add(Sq1[col]);
                    Seq2.Add(Sq2[row + 1]);
                    Seq2.Add(Sq2[row]);
                    break;

                case Cell.PrevcellType.Left:
                    Seq1.Add(Sq1[col + 1]);
                    Seq1.Add(Sq1[col]);
                    // A gap is as wide as a symbol (two characters) so the
                    // two output strings stay column-aligned.
                    Seq2.Add('-');
                    Seq2.Add('-');
                    break;

                case Cell.PrevcellType.Above:
                    Seq1.Add('-');
                    Seq1.Add('-');
                    Seq2.Add(Sq2[row + 1]);
                    Seq2.Add(Sq2[row]);
                    break;
            }

            MaxScore = MaxScore.CellPointer;
        }
    }
}