Python 在两个字符串之间查找公共子字符串_Python_String_Algorithm_Time Complexity_Dynamic Programming

Python 在两个字符串之间查找公共子字符串

python string algorithm time-complexity

Python 在两个字符串之间查找公共子字符串,python,string,algorithm,time-complexity,dynamic-programming,Python,String,Algorithm,Time Complexity,Dynamic Programming,我想比较两个字符串并保持匹配，在比较失败的地方分开如果我有两个字符串- string1 = apples string2 = appleses answer = apples 另一个例子，因为字符串可以有多个单词 string1 = apple pie available string2 = apple pies answer = apple pie 我确信有一种简单的Python方法可以做到这一点，但我无法解决它，任何帮助和解释都将不胜感激 def common_start(sa,

我想比较两个字符串并保持匹配，在比较失败的地方分开

如果我有两个字符串-

string1 = apples
string2 = appleses

answer = apples

另一个例子，因为字符串可以有多个单词

string1 = apple pie available
string2 = apple pies

answer = apple pie

我确信有一种简单的Python方法可以做到这一点，但我无法解决它，任何帮助和解释都将不胜感激

def common_start(sa, sb):
    """ returns the longest common substring from the beginning of sa and sb """
    def _iter():
        for a, b in zip(sa, sb):
            if a == b:
                yield a
            else:
                return

    return ''.join(_iter())

或者用一种稍微奇怪的方式：

def stop_iter():
    """An easy way to break out of a generator"""
    raise StopIteration

def common_start(sa, sb):
    return ''.join(a if a == b else stop_iter() for a, b in zip(sa, sb))

这可能会更容易理解

def terminating(cond):
    """An easy way to break out of a generator"""
    if cond:
        return True
    raise StopIteration

def common_start(sa, sb):
    return ''.join(a for a, b in zip(sa, sb) if terminating(a == b))

尝试：

它从两个字符串的开头进行比较。

这就是所谓的最长公共子字符串问题。在这里，我提出了一个简单易懂但效率低下的解决方案。为大字符串生成正确的输出需要很长时间，因为此算法的复杂性为O（N^2）

返回第一个最长的公共子字符串：

def compareTwoStrings(string1, string2):
    list1 = list(string1)
    list2 = list(string2)

    match = []
    output = ""
    length = 0

    for i in range(0, len(list1)):

        if list1[i] in list2:
            match.append(list1[i])

            for j in range(i + 1, len(list1)):

                if ''.join(list1[i:j]) in string2:
                    match.append(''.join(list1[i:j]))

                else:
                    continue
        else:
            continue

    for string in match:

        if length < len(list(string)):
            length = len(list(string))
            output = string

        else:
            continue

    return output

def comparethostrings（string1、string2）：
list1=列表（string1）
list2=列表（string2）
匹配=[]
output=“”
长度=0
对于范围（0，len（列表1））中的i：
如果列表2中的列表1[i]：
match.append（列表1[i]）
对于范围（i+1，len（列表1））中的j：
如果在string2中加入（列表1[i:j]）：
match.append（“”.join（list1[i:j]））
其他：
持续
其他：
持续
对于匹配中的字符串：
如果长度

这不是最有效的方法，但这是我能想到的方法，而且很有效。如果有人能改进它，请做。它所做的是生成一个矩阵，并将1放在字符匹配的位置。然后，它扫描矩阵，找出1的最长对角线，跟踪它的起点和终点。然后，它返回输入字符串的子字符串，并将起始位置和结束位置作为参数

注意：这只找到一个最长的公共子字符串。如果有多个，您可以创建一个数组来存储结果并返回结果，它是区分大小写的，因此（Apple pie，Apple pie）将返回Apple pie

def longestSubstringFinder(str1, str2):
answer = ""

if len(str1) == len(str2):
    if str1==str2:
        return str1
    else:
        longer=str1
        shorter=str2
elif (len(str1) == 0 or len(str2) == 0):
    return ""
elif len(str1)>len(str2):
    longer=str1
    shorter=str2
else:
    longer=str2
    shorter=str1

matrix = numpy.zeros((len(shorter), len(longer)))

for i in range(len(shorter)):
    for j in range(len(longer)):               
        if shorter[i]== longer[j]:
            matrix[i][j]=1

longest=0

start=[-1,-1]
end=[-1,-1]    
for i in range(len(shorter)-1, -1, -1):
    for j in range(len(longer)):
        count=0
        begin = [i,j]
        while matrix[i][j]==1:

            finish=[i,j]
            count=count+1 
            if j==len(longer)-1 or i==len(shorter)-1:
                break
            else:
                j=j+1
                i=i+1

        i = i-count
        if count>longest:
            longest=count
            start=begin
            end=finish
            break

answer=shorter[int(start[0]): int(end[0])+1]
return answer

与相同，但具有任意数量的要比较的字符串：

def common_start(*strings):
    """ Returns the longest common substring
        from the beginning of the `strings`
    """
    def _iter():
        for z in zip(*strings):
            if z.count(z[0]) == len(z):  # check all elements in `z` are the same
                yield z[0]
            else:
                return

    return ''.join(_iter())

首先是一个辅助函数，该函数根据生成的子字符串进行调整

import itertools
def n_wise(iterable, n = 2):
    '''n = 2 -> (s0,s1), (s1,s2), (s2, s3), ...

    n = 3 -> (s0,s1, s2), (s1,s2, s3), (s2, s3, s4), ...'''
    a = itertools.tee(iterable, n)
    for x, thing in enumerate(a[1:]):
        for _ in range(x+1):
            next(thing, None)
    return zip(*a)

然后函数在子字符串上迭代，最长优先，并测试成员资格。（不考虑效率）

为完整起见，标准库中的

difflib

提供了大量序列比较实用程序。例如，在字符串上使用时查找最长的公共子字符串。示例用法：

from difflib import SequenceMatcher

string1 = "apple pie available"
string2 = "come have some apple pies"

match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))

print(match)  # -> Match(a=0, b=15, size=9)
print(string1[match.a: match.a + match.size])  # -> apple pie
print(string2[match.b: match.b + match.size])  # -> apple pie

用第一个答案修复bug：

def longestSubstringFinder(string1, string2):
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        for j in range(len2):
            lcs_temp=0
            match=''
            while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
                match += string2[j+lcs_temp]
                lcs_temp+=1
            if (len(match) > len(answer)):
                answer = match
    return answer

print longestSubstringFinder("dd apple pie available", "apple pies")
print longestSubstringFinder("cov_basic_as_cov_x_gt_y_rna_genes_w1000000", "cov_rna15pcs_as_cov_x_gt_y_rna_genes_w1000000")
print longestSubstringFinder("bapples", "cappleses")
print longestSubstringFinder("apples", "apples")

def longestSubstringFinder（string1、string2）：
答案=”“
len1，len2=len（string1），len（string2）
对于范围内的i（len1）：
对于范围内的j（len2）：
lcs_温度=0
匹配=“”
而（（i+lcs_temp

def匹配管柱（x，y）：
匹配=“”
对于范围（0，len（x））中的i：
对于范围（0，len（y））内的j：
k=1
#现在应用while条件，直到找到一个子串匹配，子串的长度小于x和y的长度
而（i+kdef longeststring（s1，s2）：
左=0
右=len（s2）
同时，（左也可以考虑在字符上工作，因此可以用于任何字符串。
import os
common = os.path.commonprefix(['apple pie available', 'apple pies'])
assert common == 'apple pie'

正如函数名所示，这只考虑两个字符串的公共前缀。
这是一个称为“最长序列查找器”的课堂问题。我给出了一些简单的代码，这些代码对我有用，而且我的输入是序列列表，也可以是字符串：
def longest_substring(list1,list2):
    both=[]
    if len(list1)>len(list2):
        small=list2
        big=list1
    else:
        small=list1
        big=list2
    removes=0
    stop=0
    for i in small:
        for j in big:
            if i!=j:
                removes+=1
                if stop==1:
                    break
            elif i==j:
                both.append(i)
                for q in range(removes+1):
                    big.pop(0)
                stop=1
                break
        removes=0
    return both

Trie数据结构将比DP工作得更好。
这是代码
class TrieNode:
    def __init__(self):
        self.child = [None]*26
        self.endWord = False

class Trie:

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        return TrieNode()

    def insert(self,value):
        root = self.root


        for i,character in enumerate(value):
            index = ord(character) - ord('a')
            if not root.child[index]:
                root.child[index] = self.getNewNode()
            root = root.child[index]

        root.endWord = True


    def search(self,value):
        root = self.root

        for i,character in enumerate(value):
            index = ord(character) - ord('a')
            if not root.child[index]:
                return False
            root = root.child[index]
        return root.endWord

def main(): 

    # Input keys (use only 'a' through 'z' and lower case) 
    keys = ["the","anaswe"] 
    output = ["Not present in trie", 
            "Present in trie"] 

    # Trie object 
    t = Trie() 

    # Construct trie 
    for key in keys: 
        t.insert(key) 

    # Search for different keys 
    print("{} ---- {}".format("the",output[t.search("the")])) 
    print("{} ---- {}".format("these",output[t.search("these")])) 
    print("{} ---- {}".format("their",output[t.search("their")])) 
    print("{} ---- {}".format("thaw",output[t.search("thaw")])) 

if __name__ == '__main__': 
    main() 

如果有疑问，请告诉我。
如果我们有一个单词列表，我们需要找到所有常用的子字符串，我检查了上面的一些代码，最好的是，但是它有一些错误，例如'histhome'和'homehist'。在这种情况下，我们应该有'hist'和'home'lt.此外，如果参数的顺序发生变化，则会有所不同。因此，我更改代码以查找子字符串的每个块，并生成一组公共子字符串：
main = input().split(" ")    #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Find the longest matching word'''
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        for j in range(len2):
            lcs_temp=0
            match=''
            while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
                match += string2[j+lcs_temp]
                lcs_temp+=1         
            if (len(match) > len(answer)):
                answer = match              
    return answer

def listCheck(main):
    '''control the input for finding substring in a list of words'''
    string1 = main[0]
    result = []
    for i in range(1, len(main)):
        string2 = main[i]
        res1 = longestSubstringFinder(string1, string2)
        res2 = longestSubstringFinder(string2, string1)
        result.append(res1)
        result.append(res2)
    result.sort()
    return result

first_answer = listCheck(main)

final_answer  = []


for item1 in first_answer:    #to remove some incorrect match
    string1 = item1
    double_check = True
    for item2 in main:
        string2 = item2
        if longestSubstringFinder(string1, string2) != string1:
            double_check = False
    if double_check:
        final_answer.append(string1)

print(set(final_answer))

此脚本要求您提供最小公共子字符串长度，并在两个字符串中提供所有公共子字符串。此外，它还消除了较长子字符串已经包含的较短子字符串
def common_substrings(str1,str2):
    len1,len2=len(str1),len(str2)

    if len1 > len2:
        str1,str2=str2,str1
        len1,len2=len2,len1

    min_com = int(input('Please enter the minumum common substring length:'))
    
    cs_array=[]
    for i in range(len1,min_com-1,-1):
        for k in range(len1-i+1):
            if (str1[k:i+k] in str2):
                flag=1
                for m in range(len(cs_array)):
                    if str1[k:i+k] in cs_array[m]:
                    #print(str1[k:i+k])
                        flag=0
                        break
                if flag==1:
                    cs_array.append(str1[k:i+k])
    if len(cs_array):
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')

common_substrings('ciguliuana','ciguana')
common_substrings('apples','appleses')
common_substrings('apple pie available','apple pies')

如果这个问题没有足够的答案，这里还有另一个选择：
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    match = ""
    matches = defaultdict(list)
    str1, str2 = sorted([string1, string2], key=lambda x: len(x))

    for i in range(len(str1)):
        for k in range(i, len(str1)):
            cur = match + str1[k]
            if cur in str2:
                match = cur
            else:
                match = ""
            
            if match:
                matches[len(match)].append(match)
        
    if not matches:
        return ""

    longest_match = max(matches.keys())
        
    return matches[longest_match][0]


一些例子：
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'


def longeststring（s1，s2）：
如果len（s1）len（maxsub）：
返回s2[i:j]
我找到的最快方法是使用后缀树
包：
from suffix_trees import STree

a = ["xxxabcxxx", "adsaabc"]
st = STree.STree(a)
print(st.lcs()) # "abc"

如果string1=baples
和string2=cappleses
？。如果需要公共前缀：os.path.commonprefix（['apples'，'appleses'））->'apples'
`还可以查看wikibooks上的算法实现：问题的内容与标题中的内容不符。所描述的问题是最长的常见前缀。我现在希望python将其制作成。takewhile
语言功能：a表示a，b在zip中（string1，string2），而a==b
'.join（el[0]对于itertools.takewhile中的el（lambda t:t[0]==t[1]，zip（“ahello”，“hello”））
返回”
，这似乎不正确。正确的结果应该是“hello”@AndersonGreen:你说得对，这并不能完全回答问题，尽管他的例子只考虑了最初的出发点，我在回答中也指出了这一点
import os
common = os.path.commonprefix(['apple pie available', 'apple pies'])
assert common == 'apple pie'

def longest_substring(list1,list2):
    both=[]
    if len(list1)>len(list2):
        small=list2
        big=list1
    else:
        small=list1
        big=list2
    removes=0
    stop=0
    for i in small:
        for j in big:
            if i!=j:
                removes+=1
                if stop==1:
                    break
            elif i==j:
                both.append(i)
                for q in range(removes+1):
                    big.pop(0)
                stop=1
                break
        removes=0
    return both

**Return the comman longest substring** 
def longestSubString(str1, str2):
    longestString = ""
    maxLength = 0
    for i in range(0, len(str1)):
        if str1[i] in str2:
            for j in range(i + 1, len(str1)):
                if str1[i:j] in str2:
                    if(len(str1[i:j]) > maxLength):
                        maxLength = len(str1[i:j])
                        longestString =  str1[i:j]
return longestString

class TrieNode:
    def __init__(self):
        self.child = [None]*26
        self.endWord = False

class Trie:

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        return TrieNode()

    def insert(self,value):
        root = self.root


        for i,character in enumerate(value):
            index = ord(character) - ord('a')
            if not root.child[index]:
                root.child[index] = self.getNewNode()
            root = root.child[index]

        root.endWord = True


    def search(self,value):
        root = self.root

        for i,character in enumerate(value):
            index = ord(character) - ord('a')
            if not root.child[index]:
                return False
            root = root.child[index]
        return root.endWord

def main(): 

    # Input keys (use only 'a' through 'z' and lower case) 
    keys = ["the","anaswe"] 
    output = ["Not present in trie", 
            "Present in trie"] 

    # Trie object 
    t = Trie() 

    # Construct trie 
    for key in keys: 
        t.insert(key) 

    # Search for different keys 
    print("{} ---- {}".format("the",output[t.search("the")])) 
    print("{} ---- {}".format("these",output[t.search("these")])) 
    print("{} ---- {}".format("their",output[t.search("their")])) 
    print("{} ---- {}".format("thaw",output[t.search("thaw")])) 

if __name__ == '__main__': 
    main() 

main = input().split(" ")    #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Find the longest matching word'''
    answer = ""
    len1, len2 = len(string1), len(string2)
    for i in range(len1):
        for j in range(len2):
            lcs_temp=0
            match=''
            while ((i+lcs_temp < len1) and (j+lcs_temp<len2) and string1[i+lcs_temp] == string2[j+lcs_temp]):
                match += string2[j+lcs_temp]
                lcs_temp+=1         
            if (len(match) > len(answer)):
                answer = match              
    return answer

def listCheck(main):
    '''control the input for finding substring in a list of words'''
    string1 = main[0]
    result = []
    for i in range(1, len(main)):
        string2 = main[i]
        res1 = longestSubstringFinder(string1, string2)
        res2 = longestSubstringFinder(string2, string1)
        result.append(res1)
        result.append(res2)
    result.sort()
    return result

first_answer = listCheck(main)

final_answer  = []


for item1 in first_answer:    #to remove some incorrect match
    string1 = item1
    double_check = True
    for item2 in main:
        string2 = item2
        if longestSubstringFinder(string1, string2) != string1:
            double_check = False
    if double_check:
        final_answer.append(string1)

print(set(final_answer))

main = 'ABACDAQ BACDAQA ACDAQAW XYZCDAQ' #>>> {'CDAQ'}
main = 'homehist histhome' #>>> {'hist', 'home'}


def common_substrings(str1,str2):
    len1,len2=len(str1),len(str2)

    if len1 > len2:
        str1,str2=str2,str1
        len1,len2=len2,len1

    min_com = int(input('Please enter the minumum common substring length:'))
    
    cs_array=[]
    for i in range(len1,min_com-1,-1):
        for k in range(len1-i+1):
            if (str1[k:i+k] in str2):
                flag=1
                for m in range(len(cs_array)):
                    if str1[k:i+k] in cs_array[m]:
                    #print(str1[k:i+k])
                        flag=0
                        break
                if flag==1:
                    cs_array.append(str1[k:i+k])
    if len(cs_array):
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')

common_substrings('ciguliuana','ciguana')
common_substrings('apples','appleses')
common_substrings('apple pie available','apple pies')

from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    match = ""
    matches = defaultdict(list)
    str1, str2 = sorted([string1, string2], key=lambda x: len(x))

    for i in range(len(str1)):
        for k in range(i, len(str1)):
            cur = match + str1[k]
            if cur in str2:
                match = cur
            else:
                match = ""
            
            if match:
                matches[len(match)].append(match)
        
    if not matches:
        return ""

    longest_match = max(matches.keys())
        
    return matches[longest_match][0]


LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'


def LongestSubString(s1,s2):
    if len(s1)<len(s2) :
        s1,s2 = s2,s1  
    
    maxsub =''
    for i in range(len(s2)):
        for j in range(len(s2),i,-1):
            if s2[i:j] in s1 and j-i>len(maxsub):                
                return  s2[i:j]

from suffix_trees import STree

a = ["xxxabcxxx", "adsaabc"]
st = STree.STree(a)
print(st.lcs()) # "abc"