Python 字符串中的模糊模式搜索:具有d-不匹配的最常见模式

Python 字符串中的模糊模式搜索:具有d-不匹配的最常见模式,python,bioinformatics,Python,Bioinformatics,我想找到所有的模式是 1) 字符串中最频繁的 2) 最多有d-不匹配 对于这个给定的任务,我已经实现了一个函数,该函数计算给定模式在具有d-不匹配的字符串中出现的次数。该算法的思想基于使用字符串子模式的位掩码和给定模式的位掩码的卷积。它产生正确的结果。下面是此算法的代码: def create_bit_mask(letter, text): buf_array=[] for c in text: if c==letter: buf_array.append(1)

我想找出满足以下条件的所有模式:1) 在字符串中出现得最频繁;2) 每次出现最多允许 d 个不匹配。

对于这个给定的任务,我已经实现了一个函数,该函数计算给定模式在具有d-不匹配的字符串中出现的次数。该算法的思想基于使用字符串子模式的位掩码和给定模式的位掩码的卷积。它产生正确的结果。下面是此算法的代码:

def create_bit_mask(letter, text):
    """Return a list of 0/1 flags marking where `letter` occurs in `text`.

    Element i of the result is 1 when text[i] == letter and 0 otherwise,
    so the result always has the same length as `text`.
    """
    return [1 if c == letter else 0 for c in text]

def convolution(bit_mask1, bit_mask2):
    """Return the sum of element-wise products of two bit masks.

    For 0/1 masks this is the number of positions where both masks are 1.
    The masks are paired with zip(), so when their lengths differ the
    trailing elements of the longer mask are ignored.
    """
    return sum(b * q for b, q in zip(bit_mask1, bit_mask2))

def number_of_occurances_with_at_most_hamming_distance(genome, pattern, hamming_distance):
    """Count the windows of `genome` that are within `hamming_distance` of `pattern`.

    Every window of len(pattern) consecutive characters of `genome` is
    compared with `pattern`. The number of agreeing positions is obtained by
    summing, over the symbols A, C, G and T, the convolution of the pattern's
    bit mask with the matching slice of the genome's bit mask. A window is
    counted when len(pattern) minus that number is <= `hamming_distance`.

    Characters outside A/C/G/T are set in no mask, so a position holding such
    a character is always treated as a mismatch.

    Returns the number of counted windows (0 when `pattern` is longer than
    `genome`, because there is then no window to inspect).
    """
    alphabet = ["A", "C", "G", "T"]
    pattern_length = len(pattern)

    # One 0/1 mask per alphabet symbol, for the pattern and for the genome.
    pattern_masks = [create_bit_mask(symbol, pattern) for symbol in alphabet]
    genome_masks = [create_bit_mask(symbol, genome) for symbol in alphabet]

    occurrences = 0
    # "+ 1" so that the last full-length window is inspected as well.
    for i in range(len(genome) - pattern_length + 1):
        matching_positions = sum(
            convolution(pattern_mask, genome_mask[i:i + pattern_length])
            for pattern_mask, genome_mask in zip(pattern_masks, genome_masks)
        )
        if pattern_length - matching_positions <= hamming_distance:
            occurrences += 1

    return occurrences
但是,上面的代码找不到以下子字符串:

GCACACAGAC
你能给我一点提示,告诉我为什么吗?我不希望您直接发布正确的代码,而是希望您指出错误在哪里(我猜错误可能在第二个函数中)


另外,我知道这是我在Stepic在线课程中必须自己解决的任务,但由于课程学习小组的在线社区没有给出任何反馈,我才把代码发布到了StackOverflow上

我在pyparsing列表中遇到过类似的基因组解析问题,我提出了这个CloseMatch解析器类。它将您的大部分字符串遍历和测试代码封装在pyparsing自己的字符串解析框架中,但这仍然可以让您对自己的代码有一些了解:

# Input sequence that is scanned for approximate pattern occurrences.
genome = "CACAGTAGGCGCCGGCACACACAGCCCCGGGCCCCGGGCCGCCCCGGGCCGGCGGCCGCCGGCGCCGGCACACCGGCACAGCCGTACCGGCACAGTAGTACCGGCCGGCCGGCACACCGGCACACCGGGTACACACCGGGGCGCACACACAGGCGGGCGCCGGGCCCCGGGCCGTACCGGGCCGCCGGCGGCCCACAGGCGCCGGCACAGTACCGGCACACACAGTAGCCCACACACAGGCGGGCGGTAGCCGGCGCACACACACACAGTAGGCGCACAGCCGCCCACACACACCGGCCGGCCGGCACAGGCGGGCGGGCGCACACACACCGGCACAGTAGTAGGCGGCCGGCGCACAGCC"
# Number of characters in each candidate pattern sliced from the genome.
length=10
# Maximum number of mismatching positions tolerated in a close match.
hamming_distance=2

from pyparsing import Token, ParseException
# following from pyparsing.wikispaces.com Examples page
class CloseMatch(Token):
    """Token subclass that accepts approximate matches of a fixed string.

    A successful parse yields a tuple holding the text that was matched and
    the list of offsets (relative to the expected string) at which that text
    differs from the expected string.
    """

    def __init__(self, seq, maxMismatches=1):
        """Remember the expected string and the mismatch budget."""
        super(CloseMatch, self).__init__()
        self.name = seq
        self.sequence = seq
        self.maxMismatches = maxMismatches
        self.errmsg = "Expected " + self.sequence
        self.mayIndexError = False
        self.mayReturnEmpty = False

    def parseImpl(self, instring, loc, doActions=True):
        """Try to match the expected string at `loc`, tolerating mismatches.

        Returns the location just past the match together with the
        (matched text, mismatch offsets) tuple. Raises ParseException when
        the remaining input is shorter than the expected string or when
        more than `maxMismatches` positions differ.
        """
        start = loc
        end = start + len(self.sequence)

        # Not enough input left to hold the whole expected string.
        if end > len(instring):
            raise ParseException(instring, loc, self.errmsg)

        mismatches = []
        for offset, expected in enumerate(self.sequence):
            if instring[start + offset] != expected:
                mismatches.append(offset)
                if len(mismatches) > self.maxMismatches:
                    # Report the location just past the offending character.
                    raise ParseException(instring, start + offset + 1, self.errmsg)

        return end, (instring[start:end], mismatches)


# First walk the genome and collect every unique N-character pattern.
# The "+ 1" makes the range include the final window, genome[-length:],
# which a bound of len(genome) - length would skip.
patterns = set()
for i in range(len(genome) - length + 1):
    patterns.add(genome[i:i + length])
print(len(patterns))

# Use pyparsing's CloseMatch to find close matches - each match returns the
# substring and the list of mismatch locations.
matches = {}
for p in sorted(patterns):
    matcher = CloseMatch(p, hamming_distance)
    matches[p] = list(matcher.scanString(genome, overlap=True))

# List all patterns with their number of close matches, most frequent first.
# Ties are broken alphabetically so the order does not depend on dict
# ordering. For the top pattern only, dump every match: where it occurred
# and an annotated line showing the mismatch locations.
# print() is always called with a single argument so the output is identical
# whether print is a statement (Python 2) or a function (Python 3).
ranked = sorted(matches, key=lambda m: (-len(matches[m]), m))
for rank, p in enumerate(ranked):
    if rank == 0:
        for matchvalue, start, _ in matches[p]:
            substring, mismatches = matchvalue[0]
            print('  %s at %s' % (substring, start))
            if mismatches:
                print('  ' + ''.join('^' if i in mismatches else ' ' for i in range(length)))
            else:
                print('  ***EXACT***')
            print('')
    print('%s %s' % (p, len(matches[p])))

aaaaaand,这里是一个基于re的解决方案:

# Input sequence that is scanned for approximate pattern occurrences.
genome = "CACAGTAGGCGCCGGCACACACAGCCCCGGGCCCCGGGCCGCCCCGGGCCGGCGGCCGCCGGCGCCGGCACACCGGCACAGCCGTACCGGCACAGTAGTACCGGCCGGCCGGCACACCGGCACACCGGGTACACACCGGGGCGCACACACAGGCGGGCGCCGGGCCCCGGGCCGTACCGGGCCGCCGGCGGCCCACAGGCGCCGGCACAGTACCGGCACACACAGTAGCCCACACACAGGCGGGCGGTAGCCGGCGCACACACACACAGTAGGCGCACAGCCGCCCACACACACCGGCCGGCCGGCACAGGCGGGCGGGCGCACACACACCGGCACAGTAGTAGGCGGCCGGCGCACAGCC"
# Number of characters in each candidate pattern sliced from the genome.
length=10
# Maximum number of pattern positions that may be replaced by a wildcard.
hamming_distance=2


import re
from itertools import product

charset = "ACGT"

# First walk the genome and collect every unique N-character pattern.
# The "+ 1" makes the range include the final window, genome[-length:],
# which a bound of len(genome) - length would skip.
patterns = set()
for i in range(len(genome) - length + 1):
    patterns.add(genome[i:i + length])

# Loop invariants: the wildcard character class and every start offset at
# which a full-length window still fits inside the genome.
wildcard = "[" + charset + "]"
window_starts = range(len(genome) - length + 1)

# For each pattern, create regexes with all possible alternates: every
# position is either its literal character or the wildcard class.
allmatches = {}
for p in sorted(patterns):
    options = [[c, wildcard] for c in p]

    matches = set()
    for pr in product(*options):
        # Count how many elements of this product are wildcards (they start
        # with '['); only up to hamming_distance wildcards are wanted.
        numalts = sum(prpart.startswith('[') for prpart in pr)
        if numalts > hamming_distance:
            continue

        compiled_pattRE = re.compile(''.join(pr))
        for start in window_starts:
            match = compiled_pattRE.match(genome, start)
            if match:
                # A set, so a window hit by several regexes is stored once.
                matches.add((match.start(), match.group(0)))

    allmatches[p] = matches

# Report only the pattern with the most close matches. Ties are broken
# alphabetically so the winner does not depend on dict ordering.
# print() is always called with a single argument so the output is identical
# whether print is a statement (Python 2) or a function (Python 3).
ranked = sorted(allmatches.items(), key=lambda m: (-len(m[1]), m[0]))
for p, matches in ranked[:1]:
    print('%s %s' % (p, len(matches)))
    for m in sorted(matches):
        print(m)
    print('')
输出:

GCGCACACAC 20
(12, 'CGGCACACAC')
(14, 'GCACACACAG')
(126, 'GGGTACACAC')
(138, 'GGGCGCACAC')
(140, 'GCGCACACAC')
(142, 'GCACACACAG')
(213, 'CGGCACACAC')
(215, 'GCACACACAG')
(227, 'GCCCACACAC')
(253, 'GCGCACACAC')
(255, 'GCACACACAC')
(257, 'ACACACACAC')
(272, 'GCGCACAGCC')
(280, 'CCGCCCACAC')
(282, 'GCCCACACAC')
(284, 'CCACACACAC')
(316, 'GGGCGCACAC')
(318, 'GCGCACACAC')
(320, 'GCACACACAC')
(351, 'GCGCACAGCC')

我的理解是否正确:在最多允许两个不匹配的情况下,最常见的模式是
gcacacac
?如果是,那么我的程序和你的程序给出的结果相同,这可能表明我们的解决方案是正确的。是的,你理解正确。即使pyparsing代码不能直接应用或访问,您也可以借鉴我的输出代码,它会在匹配的子字符串下方标出不匹配的位置。正如你所说的,当不同的方法产生相同的结果时,这是令人鼓舞的——这会增强你的信心,说明你的答案很可能是正确的。这段代码中有一个“错误”:它只会转储按匹配数降序排列的模式列表中的第一个。如果有两种模式的匹配数并列最多,那么只会显示其中一种,并且不能保证哪种模式排在“第一”。Pyparsing不再托管在wikispaces.com上。好吧,我打赌存在以下概念上的错误(不是在你和我的代码中)。我正在寻找的模式(允许d个不匹配)本身可能并不出现在基因组中。因此,把给定的基因组切片成子模式、再对每个子模式执行“模糊”计数,得到的结果可能少于以下方法:a)根据给定的字母表生成具有给定长度的所有字符串的集合;b)对上述集合中的每个字符串执行允许d个不匹配的模糊计数。啊,好主意!我将修改代码以生成4**10种组合,并查看其中是否有任何组合的命中数高于从基因组中切出的任何一个长度为10的字符串。(也许存在一个数学证明,说明这样的解是否可能存在。)好吧,如果你实现暴力解决方案,它将非常耗时。我正在努力寻找更有效的算法来实现这个想法。哈!这确实可能发生!我实现了一个暴力解决方案,将长度改为6,在最多允许2个不匹配的情况下,最常见的模式是“CCCGCC”,而它在基因组中的任何地方都不存在。你是如何在给定的字母表上生成具有给定长度的所有字符串的?我正在使用以下代码来完成此操作:
alphabet = ["A", "C", "G", "T"]
buf_results = [''.join(i) for i in itertools.product(alphabet, repeat=length)]
# Input sequence that is scanned for approximate pattern occurrences.
genome = "CACAGTAGGCGCCGGCACACACAGCCCCGGGCCCCGGGCCGCCCCGGGCCGGCGGCCGCCGGCGCCGGCACACCGGCACAGCCGTACCGGCACAGTAGTACCGGCCGGCCGGCACACCGGCACACCGGGTACACACCGGGGCGCACACACAGGCGGGCGCCGGGCCCCGGGCCGTACCGGGCCGCCGGCGGCCCACAGGCGCCGGCACAGTACCGGCACACACAGTAGCCCACACACAGGCGGGCGGTAGCCGGCGCACACACACACAGTAGGCGCACAGCCGCCCACACACACCGGCCGGCCGGCACAGGCGGGCGGGCGCACACACACCGGCACAGTAGTAGGCGGCCGGCGCACAGCC"
# Number of characters in each candidate pattern sliced from the genome.
length=10
# Maximum number of pattern positions that may be replaced by a wildcard.
hamming_distance=2


import re
from itertools import product

charset = "ACGT"

# First walk the genome and collect every unique N-character pattern.
# The "+ 1" makes the range include the final window, genome[-length:],
# which a bound of len(genome) - length would skip.
patterns = set()
for i in range(len(genome) - length + 1):
    patterns.add(genome[i:i + length])

# Loop invariants: the wildcard character class and every start offset at
# which a full-length window still fits inside the genome.
wildcard = "[" + charset + "]"
window_starts = range(len(genome) - length + 1)

# For each pattern, create regexes with all possible alternates: every
# position is either its literal character or the wildcard class.
allmatches = {}
for p in sorted(patterns):
    options = [[c, wildcard] for c in p]

    matches = set()
    for pr in product(*options):
        # Count how many elements of this product are wildcards (they start
        # with '['); only up to hamming_distance wildcards are wanted.
        numalts = sum(prpart.startswith('[') for prpart in pr)
        if numalts > hamming_distance:
            continue

        compiled_pattRE = re.compile(''.join(pr))
        for start in window_starts:
            match = compiled_pattRE.match(genome, start)
            if match:
                # A set, so a window hit by several regexes is stored once.
                matches.add((match.start(), match.group(0)))

    allmatches[p] = matches

# Report only the pattern with the most close matches. Ties are broken
# alphabetically so the winner does not depend on dict ordering.
# print() is always called with a single argument so the output is identical
# whether print is a statement (Python 2) or a function (Python 3).
ranked = sorted(allmatches.items(), key=lambda m: (-len(m[1]), m[0]))
for p, matches in ranked[:1]:
    print('%s %s' % (p, len(matches)))
    for m in sorted(matches):
        print(m)
    print('')
GCGCACACAC 20
(12, 'CGGCACACAC')
(14, 'GCACACACAG')
(126, 'GGGTACACAC')
(138, 'GGGCGCACAC')
(140, 'GCGCACACAC')
(142, 'GCACACACAG')
(213, 'CGGCACACAC')
(215, 'GCACACACAG')
(227, 'GCCCACACAC')
(253, 'GCGCACACAC')
(255, 'GCACACACAC')
(257, 'ACACACACAC')
(272, 'GCGCACAGCC')
(280, 'CCGCCCACAC')
(282, 'GCCCACACAC')
(284, 'CCACACACAC')
(316, 'GGGCGCACAC')
(318, 'GCGCACACAC')
(320, 'GCACACACAC')
(351, 'GCGCACAGCC')