Python 在两个字符串之间查找公共子字符串
我想比较两个字符串并保持匹配,在比较失败的地方分开 如果我有两个字符串-Python 在两个字符串之间查找公共子字符串,python,string,algorithm,time-complexity,dynamic-programming,Python,String,Algorithm,Time Complexity,Dynamic Programming,我想比较两个字符串并保持匹配,在比较失败的地方分开 如果我有两个字符串- string1 = apples string2 = appleses answer = apples 另一个例子,因为字符串可以有多个单词 string1 = apple pie available string2 = apple pies answer = apple pie 我确信有一种简单的Python方法可以做到这一点,但我无法解决它,任何帮助和解释都将不胜感激 def common_start(sa,
string1 = apples
string2 = appleses
answer = apples
另一个例子,因为字符串可以有多个单词
string1 = apple pie available
string2 = apple pies
answer = apple pie
我确信有一种简单的Python方法可以做到这一点,但我无法解决它,任何帮助和解释都将不胜感激
def common_start(sa, sb):
    """Return the longest prefix that `sa` and `sb` have in common.

    The two sequences are walked in lockstep and collection stops at the
    first position where their items differ (or when the shorter one ends).
    """
    shared = []
    for left, right in zip(sa, sb):
        if left != right:
            break
        shared.append(left)
    return ''.join(shared)
或者用一种稍微奇怪的方式:
def stop_iter():
    """Raise ``StopIteration`` unconditionally.

    Kept for backward compatibility only. Since PEP 479 (Python 3.7+), a
    ``StopIteration`` that escapes from inside a generator or generator
    expression is converted to ``RuntimeError``, so this helper can no longer
    be used to end a generator early.
    """
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of `sa` and `sb` as a string.

    Args:
        sa: First string.
        sb: Second string.

    Returns:
        The leading characters shared by both inputs; ``''`` when the first
        characters already differ or either input is empty.
    """
    # Function-scope import keeps the snippet self-contained.
    from itertools import takewhile

    # takewhile stops cleanly at the first differing pair, which is what the
    # old "raise StopIteration inside the generator" trick tried to do.
    matching_pairs = takewhile(lambda pair: pair[0] == pair[1], zip(sa, sb))
    return ''.join(a for a, _ in matching_pairs)
这可能会更容易理解
def terminating(cond):
    """Return ``True`` when `cond` is truthy, otherwise raise ``StopIteration``.

    Kept for backward compatibility only. Since PEP 479 (Python 3.7+), a
    ``StopIteration`` raised inside a generator expression surfaces as
    ``RuntimeError``, so this must not be used as a generator filter.
    """
    if cond:
        return True
    raise StopIteration


def common_start(sa, sb):
    """Return the longest common prefix of `sa` and `sb` as a string.

    Args:
        sa: First string.
        sb: Second string.

    Returns:
        The leading characters shared by both inputs; ``''`` when there are
        none.
    """
    prefix = []
    for a, b in zip(sa, sb):
        # Stop with an ordinary break; raising StopIteration from inside a
        # generator expression is a RuntimeError on Python 3.7+.
        if a != b:
            break
        prefix.append(a)
    return ''.join(prefix)
尝试:
它从两个字符串的开头进行比较。这就是所谓的最长公共子字符串问题。在这里,我提出了一个简单易懂但效率低下的解决方案。为大字符串生成正确的输出需要很长时间,因为此算法的复杂性为O(N^2)
返回第一个最长的公共子字符串:
def compareTwoStrings(string1, string2):
    """Return the first longest substring of `string1` found in `string2`.

    Args:
        string1: String whose substrings are tried as candidates.
        string2: String that must contain the candidate.

    Returns:
        The longest common substring. When several share the maximum length,
        the one starting earliest in `string1` wins. ``""`` when the strings
        share no character.
    """
    output = ""
    size = len(string1)
    for start in range(size):
        if string1[start] not in string2:
            continue
        # Slice ends are exclusive, so `end` has to be allowed to reach
        # `size`; stopping at `size - 1` dropped the final character.
        # Only candidates strictly longer than the current best are tried.
        for end in range(start + len(output) + 1, size + 1):
            candidate = string1[start:end]
            if candidate not in string2:
                # A longer slice from the same start cannot match either.
                break
            output = candidate
    return output
def comparethostrings(string1、string2):
list1=列表(string1)
list2=列表(string2)
匹配=[]
output=“”
长度=0
对于范围(0,len(列表1))中的i:
如果列表2中的列表1[i]:
match.append(列表1[i])
对于范围(i+1,len(列表1))中的j:
如果在string2中加入(列表1[i:j]):
match.append(“”.join(list1[i:j]))
其他:
持续
其他:
持续
对于匹配中的字符串:
如果长度
这不是最有效的方法,但这是我能想到的方法,而且很有效。如果有人能改进它,请做。它所做的是生成一个矩阵,并将1放在字符匹配的位置。然后,它扫描矩阵,找出1的最长对角线,跟踪它的起点和终点。然后,它返回输入字符串的子字符串,并将起始位置和结束位置作为参数
注意:这只找到一个最长的公共子字符串。如果有多个,您可以创建一个数组来存储这些结果并返回该数组。另外,它区分大小写,因此 (Apple pie, apple pie) 将返回 pple pie
def longestSubstringFinder(str1, str2):
    """Return one longest common substring of `str1` and `str2`.

    Uses a dynamic-programming scan that keeps only two rows of run lengths,
    so it needs no third-party array library.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A longest substring present in both inputs, or ``""`` when they share
        no character or either one is empty. The comparison is
        case-sensitive. When several substrings tie for the maximum length,
        the one starting latest in the shorter input is returned (`str2`
        counts as the shorter input when the lengths are equal).
    """
    if len(str1) >= len(str2):
        longer, shorter = str1, str2
    else:
        longer, shorter = str2, str1
    if not shorter:
        return ""

    width = len(longer)
    best_len = 0
    best_start = 0
    # below[j] is the length of the matching run starting at
    # (row i + 1, column j); the extra trailing 0 is the out-of-range column.
    below = [0] * (width + 1)
    for i in range(len(shorter) - 1, -1, -1):
        current = [0] * (width + 1)
        for j in range(width):
            if shorter[i] == longer[j]:
                # A match extends the run that starts one step down-right.
                current[j] = below[j + 1] + 1
                if current[j] > best_len:
                    best_len = current[j]
                    best_start = i
        below = current
    return shorter[best_start:best_start + best_len]
与上面的 common_start 相同,但可以比较任意数量的字符串:
def common_start(*strings):
    """Return the prefix shared by every one of `strings`.

    The strings are read column by column; collection stops at the first
    column whose items are not all equal to that column's first item.
    """
    prefix = []
    for column in zip(*strings):
        head = column[0]
        # Every item equals the first one exactly when the count is the width.
        if column.count(head) != len(column):
            break
        prefix.append(head)
    return ''.join(prefix)
首先是一个辅助函数,改编自 itertools 的 pairwise 配方,用于生成子字符串
import itertools
def n_wise(iterable, n = 2):
    """Yield overlapping windows of `n` consecutive items.

    n = 2 -> (s0, s1), (s1, s2), (s2, s3), ...
    n = 3 -> (s0, s1, s2), (s1, s2, s3), (s2, s3, s4), ...
    """
    streams = itertools.tee(iterable, n)
    # The k-th copy is advanced by k items so that zipping the copies lines
    # up consecutive elements; the first copy (k = 0) is left untouched.
    for skip, stream in enumerate(streams):
        for _ in range(skip):
            next(stream, None)
    return zip(*streams)
然后函数在子字符串上迭代,最长优先,并测试成员资格。(不考虑效率)
为完整起见,标准库中的
difflib
提供了大量序列比较实用程序。例如 find_longest_match,在字符串上使用时可以查找最长的公共子字符串。示例用法:
from difflib import SequenceMatcher
# Two sample strings; the shared text sits at different offsets in each.
string1 = "apple pie available"
string2 = "come have some apple pies"

# Search the full extent of both strings for the longest matching block.
matcher = SequenceMatcher(None, string1, string2)
match = matcher.find_longest_match(0, len(string1), 0, len(string2))

print(match)  # -> Match(a=0, b=15, size=9)
# The same block, sliced out of each input in turn.
end_in_first = match.a + match.size
end_in_second = match.b + match.size
print(string1[match.a:end_in_first])  # -> apple pie
print(string2[match.b:end_in_second])  # -> apple pie
修复第一个答案中的bug:
def longestSubstringFinder(string1, string2):
    """Return the first longest run of characters shared by both strings.

    Every pair of start positions is tried; from each pair the run is
    extended while the characters agree. A run replaces the current answer
    only when it is strictly longer.
    """
    best = ""
    size1 = len(string1)
    size2 = len(string2)
    for i in range(size1):
        for j in range(size2):
            run = 0
            while (i + run < size1 and j + run < size2
                   and string1[i + run] == string2[j + run]):
                run += 1
            if run > len(best):
                best = string2[j:j + run]
    return best
# Demo calls: each line prints the longest common substring of one pair.
# Written with the print() function, since the Python 2 print statement is
# a SyntaxError on Python 3.
print(longestSubstringFinder("dd apple pie available", "apple pies"))
print(longestSubstringFinder("cov_basic_as_cov_x_gt_y_rna_genes_w1000000", "cov_rna15pcs_as_cov_x_gt_y_rna_genes_w1000000"))
print(longestSubstringFinder("bapples", "cappleses"))
print(longestSubstringFinder("apples", "apples"))
def longestSubstringFinder(string1、string2):
答案=”“
len1,len2=len(string1),len(string2)
对于范围内的i(len1):
对于范围内的j(len2):
lcs_温度=0
匹配=“”
而((i+lcs_temp
def匹配管柱(x,y):
匹配=“”
对于范围(0,len(x))中的i:
对于范围(0,len(y))内的j:
k=1
#现在应用while条件,直到找到一个子串匹配,子串的长度小于x和y的长度
而(i+kdef longeststring(s1,s2):
左=0
右=len(s2)
同时,(左也可以考虑在字符上工作,因此可以用于任何字符串。
import os
# commonprefix only ever looks at the shared *start* of its inputs.
common = os.path.commonprefix(('apple pie available', 'apple pies'))
assert common == 'apple pie'
正如函数名所示,这只考虑两个字符串的公共前缀。这是一个称为“最长序列查找器”的课堂问题。我给出了一些简单的代码,这些代码对我有用,而且我的输入是序列列表,也可以是字符串:
def longest_substring(list1,list2):
    """Collect a run of items from the shorter input that lines up in the longer one.

    Items of the shorter input are taken in order. Until a first match is
    found, each item is searched for anywhere in the longer input. After that
    first match, an item is kept only when it equals the element immediately
    after the previous match; an item that does not is skipped without
    advancing the position.

    Args:
        list1: A sequence (list or string).
        list2: A sequence (list or string).

    Returns:
        A list of the matched items, in order. Neither argument is modified.
    """
    both = []
    if len(list1) > len(list2):
        small, big = list2, list1
    else:
        small, big = list1, list2

    # An index replaces the old `big.pop(0)` calls: popping changed the
    # caller's list and failed on strings, which have no pop().
    cursor = 0
    anchored = False
    for item in small:
        if anchored:
            if cursor < len(big) and big[cursor] == item:
                both.append(item)
                cursor += 1
            continue
        for position, candidate in enumerate(big):
            if candidate == item:
                both.append(item)
                cursor = position + 1
                anchored = True
                break
    return both
Trie数据结构将比DP工作得更好。
这是代码
class TrieNode:
    """One trie node: 26 child slots (one per letter 'a'-'z') and a word marker."""

    def __init__(self):
        # child[k] is the node for the k-th lowercase letter, or None.
        self.child = [None]*26
        # True when an inserted word ends at this node.
        self.endWord = False


class Trie:
    """Trie over the lowercase ASCII letters 'a' through 'z'."""

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        """Return a fresh, empty node."""
        return TrieNode()

    @staticmethod
    def _slot(character):
        """Return the child index for `character`, or None outside 'a'-'z'."""
        index = ord(character) - ord('a')
        # Without this range check a negative index would silently wrap
        # around to an unrelated letter's slot.
        if 0 <= index < 26:
            return index
        return None

    def insert(self,value):
        """Add `value` to the trie.

        Raises:
            ValueError: if `value` contains a character outside 'a'-'z'.
                Nothing is inserted in that case.
        """
        # Validate everything first so a bad key never leaves a partial path.
        slots = [self._slot(character) for character in value]
        if None in slots:
            raise ValueError(
                "Trie keys may only contain the letters 'a' through 'z': {!r}".format(value)
            )
        node = self.root
        for index in slots:
            if not node.child[index]:
                node.child[index] = self.getNewNode()
            node = node.child[index]
        node.endWord = True

    def search(self,value):
        """Return True when `value` was inserted as a whole word, else False.

        A key containing characters outside 'a'-'z' can never have been
        inserted, so it is reported as absent.
        """
        node = self.root
        for character in value:
            index = self._slot(character)
            if index is None or not node.child[index]:
                return False
            node = node.child[index]
        return node.endWord
def main():
    """Build a small trie and print whether each sample word is stored in it."""
    # Input keys (use only 'a' through 'z' and lower case)
    keys = ["the","anaswe"]
    # Indexed by the boolean search result: False -> 0, True -> 1.
    output = ["Not present in trie",
              "Present in trie"]

    # Trie object, filled with the keys above
    t = Trie()
    for key in keys:
        t.insert(key)

    # Search for different keys
    for word in ("the", "these", "their", "thaw"):
        print("{} ---- {}".format(word, output[t.search(word)]))


if __name__ == '__main__':
    main()
如果有疑问,请告诉我。如果我们有一个单词列表,并且需要找到所有公共子字符串,我检查了上面的一些代码,其中最好的一个仍有一些错误,例如对于'histhome'和'homehist'。在这种情况下,我们应该得到'hist'和'home'两个结果。此外,如果参数的顺序发生变化,结果也会不同。因此,我更改代码以查找子字符串的每个块,并生成一组公共子字符串:
# Read one line from standard input and split it on single spaces; the
# resulting list of words is what the rest of the script compares.
main = input().split(" ") #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Find the first longest run of characters present in both strings.

    Every pair of start positions is tried; from each pair the run is
    extended while the characters agree. A run replaces the current answer
    only when it is strictly longer.
    '''
    best = ""
    size1 = len(string1)
    size2 = len(string2)
    for i in range(size1):
        for j in range(size2):
            run = 0
            while (i + run < size1 and j + run < size2
                   and string1[i + run] == string2[j + run]):
                run += 1
            if run > len(best):
                best = string2[j:j + run]
    return best
def listCheck(main):
    '''Compare the first word with every other word, in both argument orders.

    Returns the sorted list of all longest-common-substring results.
    '''
    reference = main[0]
    result = []
    for other in main[1:]:
        # The finder can favour a different match depending on argument
        # order, so both directions are recorded.
        result.append(longestSubstringFinder(reference, other))
        result.append(longestSubstringFinder(other, reference))
    return sorted(result)
first_answer = listCheck(main)
# Drop incorrect matches: a candidate survives only when it is found, in
# full, inside every one of the input words.
final_answer = [
    candidate
    for candidate in first_answer
    if all(longestSubstringFinder(candidate, word) == candidate for word in main)
]
print(set(final_answer))
此脚本要求您提供最小公共子字符串长度,并在两个字符串中提供所有公共子字符串。此外,它还消除了较长子字符串已经包含的较短子字符串
def common_substrings(str1,str2):
    """Print every common substring of at least a user-supplied length.

    The minimum length is read from standard input. Candidate slices of the
    shorter string are tried from longest to shortest; a slice is kept when
    it occurs in the longer string and is not already contained in a slice
    kept earlier. The kept list is printed, or a notice when it is empty.
    """
    if len(str1) > len(str2):
        shorter, longer = str2, str1
    else:
        shorter, longer = str1, str2

    min_com = int(input('Please enter the minumum common substring length:'))

    cs_array = []
    for size in range(len(shorter), min_com - 1, -1):
        for begin in range(len(shorter) - size + 1):
            piece = shorter[begin:begin + size]
            if piece not in longer:
                continue
            # Skip pieces already covered by a longer kept substring.
            if any(piece in kept for kept in cs_array):
                continue
            cs_array.append(piece)

    if cs_array:
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')
# Demo runs; each call prompts for its own minimum length.
for first, second in (
    ('ciguliuana', 'ciguana'),
    ('apples', 'appleses'),
    ('apple pie available', 'apple pies'),
):
    common_substrings(first, second)
如果这个问题没有足够的答案,这里还有另一个选择:
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    """Return the first longest substring shared by the two strings.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The longest substring of the shorter input that also occurs in the
        longer one. On ties the one starting earliest in the shorter input
        wins. ``""`` when nothing is shared.
    """
    # Candidates come from the shorter string. sorted() is stable, so
    # string1 is used when the lengths are equal.
    str1, str2 = sorted([string1, string2], key=len)

    longest = ""
    for start in range(len(str1)):
        # Each start position begins from scratch. Carrying a partial match
        # over from the previous start glued non-adjacent characters
        # together and could report text that is not in str1 at all.
        for end in range(start + len(longest) + 1, len(str1) + 1):
            candidate = str1[start:end]
            if candidate not in str2:
                # Longer slices from this start cannot match either.
                break
            longest = candidate
    return longest
一些例子:
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'
def longeststring(s1,s2):
如果len(s1)len(maxsub):
返回s2[i:j]
我找到的最快方法是使用后缀树
包:
from suffix_trees import STree
# Sample input: the two strings whose longest common substring is wanted.
a = ["xxxabcxxx", "adsaabc"]
# NOTE(review): STree comes from the third-party suffix_trees package;
# presumably this builds one suffix tree over all strings in `a` - confirm
# against the package documentation.
st = STree.STree(a)
print(st.lcs()) # "abc"
如果string1=bapples
和string2=cappleses
?。如果需要公共前缀:os.path.commonprefix(['apples', 'appleses']) -> 'apples'
`还可以查看wikibooks上的算法实现:问题的内容与标题中的内容不符。所描述的问题是最长的常见前缀。我现在希望python将其制作成。takewhile
语言功能:`a for a, b in zip(string1, string2) while a == b`
`''.join(el[0] for el in itertools.takewhile(lambda t: t[0] == t[1], zip("ahello", "hello")))`
返回 "",这似乎不正确。正确的结果应该是 "hello"
@AndersonGreen:你说得对,这并不能完全回答问题,尽管他的例子只考虑了最初的出发点,我在回答中也指出了这一点
import os
# commonprefix only ever looks at the shared *start* of its inputs.
common = os.path.commonprefix(('apple pie available', 'apple pies'))
assert common == 'apple pie'
def longest_substring(list1,list2):
    """Collect a run of items from the shorter input that lines up in the longer one.

    Items of the shorter input are taken in order. Until a first match is
    found, each item is searched for anywhere in the longer input. After that
    first match, an item is kept only when it equals the element immediately
    after the previous match; an item that does not is skipped without
    advancing the position.

    Args:
        list1: A sequence (list or string).
        list2: A sequence (list or string).

    Returns:
        A list of the matched items, in order. Neither argument is modified.
    """
    both = []
    if len(list1) > len(list2):
        small, big = list2, list1
    else:
        small, big = list1, list2

    # An index replaces the old `big.pop(0)` calls: popping changed the
    # caller's list and failed on strings, which have no pop().
    cursor = 0
    anchored = False
    for item in small:
        if anchored:
            if cursor < len(big) and big[cursor] == item:
                both.append(item)
                cursor += 1
            continue
        for position, candidate in enumerate(big):
            if candidate == item:
                both.append(item)
                cursor = position + 1
                anchored = True
                break
    return both
**Return the longest common substring**
def longestSubString(str1, str2):
    """Return the first longest substring of `str1` that also occurs in `str2`.

    Args:
        str1: String whose substrings are tried as candidates.
        str2: String that must contain the candidate.

    Returns:
        The longest common substring; on ties the one starting earliest in
        `str1`. ``""`` when the strings share no character.
    """
    longestString = ""
    size = len(str1)
    for i in range(size):
        if str1[i] not in str2:
            continue
        # Slice ends are exclusive, so `j` has to be allowed to reach `size`;
        # stopping at `size - 1` dropped the final character. Only
        # candidates strictly longer than the current best are tried.
        for j in range(i + len(longestString) + 1, size + 1):
            candidate = str1[i:j]
            if candidate not in str2:
                # A longer slice from the same start cannot match either.
                break
            longestString = candidate
    return longestString
class TrieNode:
    """One trie node: 26 child slots (one per letter 'a'-'z') and a word marker."""

    def __init__(self):
        # child[k] is the node for the k-th lowercase letter, or None.
        self.child = [None]*26
        # True when an inserted word ends at this node.
        self.endWord = False


class Trie:
    """Trie over the lowercase ASCII letters 'a' through 'z'."""

    def __init__(self):
        self.root = self.getNewNode()

    def getNewNode(self):
        """Return a fresh, empty node."""
        return TrieNode()

    @staticmethod
    def _slot(character):
        """Return the child index for `character`, or None outside 'a'-'z'."""
        index = ord(character) - ord('a')
        # Without this range check a negative index would silently wrap
        # around to an unrelated letter's slot.
        if 0 <= index < 26:
            return index
        return None

    def insert(self,value):
        """Add `value` to the trie.

        Raises:
            ValueError: if `value` contains a character outside 'a'-'z'.
                Nothing is inserted in that case.
        """
        # Validate everything first so a bad key never leaves a partial path.
        slots = [self._slot(character) for character in value]
        if None in slots:
            raise ValueError(
                "Trie keys may only contain the letters 'a' through 'z': {!r}".format(value)
            )
        node = self.root
        for index in slots:
            if not node.child[index]:
                node.child[index] = self.getNewNode()
            node = node.child[index]
        node.endWord = True

    def search(self,value):
        """Return True when `value` was inserted as a whole word, else False.

        A key containing characters outside 'a'-'z' can never have been
        inserted, so it is reported as absent.
        """
        node = self.root
        for character in value:
            index = self._slot(character)
            if index is None or not node.child[index]:
                return False
            node = node.child[index]
        return node.endWord
def main():
    """Build a small trie and print whether each sample word is stored in it."""
    # Input keys (use only 'a' through 'z' and lower case)
    keys = ["the","anaswe"]
    # Indexed by the boolean search result: False -> 0, True -> 1.
    output = ["Not present in trie",
              "Present in trie"]

    # Trie object, filled with the keys above
    t = Trie()
    for key in keys:
        t.insert(key)

    # Search for different keys
    for word in ("the", "these", "their", "thaw"):
        print("{} ---- {}".format(word, output[t.search(word)]))


if __name__ == '__main__':
    main()
# Read one line from standard input and split it on single spaces; the
# resulting list of words is what the rest of the script compares.
main = input().split(" ") #a string of words separated by space
def longestSubstringFinder(string1, string2):
    '''Find the first longest run of characters present in both strings.

    Every pair of start positions is tried; from each pair the run is
    extended while the characters agree. A run replaces the current answer
    only when it is strictly longer.
    '''
    best = ""
    size1 = len(string1)
    size2 = len(string2)
    for i in range(size1):
        for j in range(size2):
            run = 0
            while (i + run < size1 and j + run < size2
                   and string1[i + run] == string2[j + run]):
                run += 1
            if run > len(best):
                best = string2[j:j + run]
    return best
def listCheck(main):
    '''Compare the first word with every other word, in both argument orders.

    Returns the sorted list of all longest-common-substring results.
    '''
    reference = main[0]
    result = []
    for other in main[1:]:
        # The finder can favour a different match depending on argument
        # order, so both directions are recorded.
        result.append(longestSubstringFinder(reference, other))
        result.append(longestSubstringFinder(other, reference))
    return sorted(result)
first_answer = listCheck(main)
# Drop incorrect matches: a candidate survives only when it is found, in
# full, inside every one of the input words.
final_answer = [
    candidate
    for candidate in first_answer
    if all(longestSubstringFinder(candidate, word) == candidate for word in main)
]
print(set(final_answer))
# Sample inputs, each annotated with the set the script is expected to print.
# NOTE(review): these are unsplit strings, while the script above builds
# `main` by splitting the input line on spaces - presumably they show the
# raw line typed at the prompt; confirm before reusing them as-is.
main = 'ABACDAQ BACDAQA ACDAQAW XYZCDAQ' #>>> {'CDAQ'}
main = 'homehist histhome' #>>> {'hist', 'home'}
def common_substrings(str1,str2):
    """Print every common substring of at least a user-supplied length.

    The minimum length is read from standard input. Candidate slices of the
    shorter string are tried from longest to shortest; a slice is kept when
    it occurs in the longer string and is not already contained in a slice
    kept earlier. The kept list is printed, or a notice when it is empty.
    """
    if len(str1) > len(str2):
        shorter, longer = str2, str1
    else:
        shorter, longer = str1, str2

    min_com = int(input('Please enter the minumum common substring length:'))

    cs_array = []
    for size in range(len(shorter), min_com - 1, -1):
        for begin in range(len(shorter) - size + 1):
            piece = shorter[begin:begin + size]
            if piece not in longer:
                continue
            # Skip pieces already covered by a longer kept substring.
            if any(piece in kept for kept in cs_array):
                continue
            cs_array.append(piece)

    if cs_array:
        print(cs_array)
    else:
        print('There is no any common substring according to the parametres given')
# Demo runs; each call prompts for its own minimum length.
for first, second in (
    ('ciguliuana', 'ciguana'),
    ('apples', 'appleses'),
    ('apple pie available', 'apple pies'),
):
    common_substrings(first, second)
from collections import defaultdict
def LongestCommonSubstring(string1, string2):
    """Return the first longest substring shared by the two strings.

    Args:
        string1: First string.
        string2: Second string.

    Returns:
        The longest substring of the shorter input that also occurs in the
        longer one. On ties the one starting earliest in the shorter input
        wins. ``""`` when nothing is shared.
    """
    # Candidates come from the shorter string. sorted() is stable, so
    # string1 is used when the lengths are equal.
    str1, str2 = sorted([string1, string2], key=len)

    longest = ""
    for start in range(len(str1)):
        # Each start position begins from scratch. Carrying a partial match
        # over from the previous start glued non-adjacent characters
        # together and could report text that is not in str1 at all.
        for end in range(start + len(longest) + 1, len(str1) + 1):
            candidate = str1[start:end]
            if candidate not in str2:
                # Longer slices from this start cannot match either.
                break
            longest = candidate
    return longest
LongestCommonSubstring("whose car?", "this is my car")
> ' car'
LongestCommonSubstring("apple pies", "apple? forget apple pie!")
> 'apple pie'
def LongestSubString(s1,s2):
    """Return the longest substring that `s1` and `s2` have in common.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The longest common substring; on ties the one starting earliest in
        the shorter input. ``''`` when the strings share no character.
    """
    # Generate candidates from the shorter string, search in the longer one.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    maxsub = ''
    for i in range(len(s2)):
        # Try the longest slice from this start first and stop before any
        # slice that could not beat the current best. The best is recorded
        # and the scan continues, because a later start may do better.
        for j in range(len(s2), i + len(maxsub), -1):
            if s2[i:j] in s1:
                maxsub = s2[i:j]
                break
    return maxsub
from suffix_trees import STree
# Sample input: the two strings whose longest common substring is wanted.
a = ["xxxabcxxx", "adsaabc"]
# NOTE(review): STree comes from the third-party suffix_trees package;
# presumably this builds one suffix tree over all strings in `a` - confirm
# against the package documentation.
st = STree.STree(a)
print(st.lcs()) # "abc"