Python 显示字符串对齐方式
我有一个程序可以告诉我两个字符串之间的距离,这个程序运行得很好。例如,从一个单词变换到另一个单词的成本为 5(用 e 替换 i 的成本为 2,另外还有 3 次插入)。基本上,插入成本为 1,删除成本为 1,替换成本为 2。单词也可以在字符串中打乱顺序以降低成本。我需要一种方法来记住在什么位置发生了什么操作,以便显示对齐结果。有什么想法或建议吗?(标签:python)
import sys
from sys import stdout
def minEditDist(target, source):
    """Return the weighted minimum edit distance between two strings.

    A (len(target) + 1) x (len(source) + 1) table is filled row by row.
    Row index ``i`` counts consumed characters of ``target`` and column
    index ``j`` counts consumed characters of ``source``.  Step costs come
    from ``insertCost``, ``deleteCost`` and ``subCost``.

    Args:
        target: First string; its characters are charged via ``insertCost``.
        source: Second string; its characters are charged via ``deleteCost``.

    Returns:
        The value in the bottom-right table cell, i.e. the total cost of the
        cheapest edit sequence.  Empty inputs are supported.
    """
    n = len(target)
    m = len(source)
    distance = [[0] * (m + 1) for _ in range(n + 1)]

    # First column: only characters of ``target`` have been consumed.
    for i in range(1, n + 1):
        distance[i][0] = distance[i - 1][0] + insertCost(target[i - 1])
    # First row: only characters of ``source`` have been consumed.
    for j in range(1, m + 1):
        distance[0][j] = distance[0][j - 1] + deleteCost(source[j - 1])

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Use the same cost helpers as the border cells so that the
            # whole table follows one cost model.
            distance[i][j] = min(
                distance[i - 1][j] + insertCost(target[i - 1]),
                distance[i][j - 1] + deleteCost(source[j - 1]),
                distance[i - 1][j - 1] + subCost(source[j - 1], target[i - 1]),
            )

    # Index with n and m, not the loop variables: the loops never run (and
    # never bind i / j) when one of the strings is empty.
    return distance[n][m]
def subCost(x, y):
    """Return the substitution cost: 0 when ``x == y``, otherwise 2."""
    return 0 if x == y else 2
def insertCost(x):
    """Return the cost of inserting ``x``; always 1, whatever ``x`` is."""
    cost = 1
    return cost
def deleteCost(x):
    """Return the cost of deleting ``x``; always 1, whatever ``x`` is."""
    cost = 1
    return cost
# The two words to compare.  Interactive input is disabled because the
# hosting environment (cloud9) does not accept it:
# word1 = raw_input("Enter A Word: ")
# word2 = raw_input("Enter The Second Word: ")
word1 = "wax"
word2 = "and"

# Working copies with leading and trailing whitespace removed
# (str.strip() trims both ends, not just the right side).
word1x = word1.strip()
word2x = word2.strip()

# Length of the longer, unstripped word.  Nothing below reads it; it is
# kept so the module-level name stays available.
range_num = max(len(word1), len(word2))

# Display the minimum distance between the two specified strings.  A single
# pre-formatted argument prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("The minimum edit distance between S1 and S2 is:  %s !"
      % (minEditDist(word1x, word2x),))
print(word1x)
print(word2x)
你可以这样开始。我已经为“S”添加了适当的数据。
您正在计算的是 Levenshtein 距离(更准确地说,是加权的 Levenshtein 距离,因为各操作的成本不同:I/D => 1,M => 2)。
要获得操作序列,一种常见的做法是进行某种回溯。
考虑以下方法 backtrace*:
...
# Return the minimum distance using all the table cells
def backtrace(i, j):
    """Return the sequence of edit steps that leads to ``distance[i][j]``.

    Meant to be nested inside ``minEditDist`` so that ``distance``,
    ``target`` and ``source`` are read from the enclosing scope.  The table
    is walked from cell (i, j) back to (0, 0); the collected steps are then
    returned in forward order, one letter per step:

        "M"  diagonal step, ``target[i-1] == source[j-1]`` (cost 0)
        "S"  diagonal step, characters differ (cost 2)
        "D"  step from row ``i - 1`` (cost 1)
        "I"  step from column ``j - 1`` (cost 1)

    NOTE(review): the table's first column is filled with ``insertCost``
    and its first row with ``deleteCost``, so the "D"/"I" letters may be
    swapped relative to that naming -- confirm the intended direction.
    """
    steps = []
    while i > 0 or j > 0:
        here = distance[i][j]
        if i > 0 and j > 0:
            # Comparing table values alone is not enough: a neighbouring
            # insert/delete path can make the diagonal cell equal to this
            # one even though the two characters differ.
            same = target[i - 1] == source[j - 1]
            diagonal = distance[i - 1][j - 1]
            if (same and diagonal == here) or (not same and diagonal + 2 == here):
                steps.append("M" if same else "S")
                i -= 1
                j -= 1
                continue
        if i > 0 and distance[i - 1][j] + 1 == here:
            steps.append("D")
            i -= 1
        elif j > 0 and distance[i][j - 1] + 1 == here:
            steps.append("I")
            j -= 1
        else:
            # No neighbouring cell explains this value; stop here.
            break
    steps.reverse()
    return "".join(steps)
return distance[i][j], backtrace(i, j)
我将它作为嵌套方法添加到您的方法中,这样我就不必将距离矩阵 distance 作为参数传递给它。
现在您的脚本输出
S1和S2之间的最小编辑距离为:(4,'SMS')
还要注意,如果您想在 Python 中使用 Levenshtein 距离,PyPI 上有一个名为 python-Levenshtein 的现成实现。
* 可能不是 100% 准确 :-)
这不会像您在这里使用的方式那样工作。您在 insertCost 和 deleteCost 中向 path 添加元素,但这些方法也会在预填充距离矩阵时被调用,因此您在此处构建的路径总是以 I、I、D、D、D 开头(对于三个字母的单词)。
我在回答中特别提到,我只是为 substitute 做了这一处理。
import sys
from sys import stdout
def minEditDist(target, source):
    """Return the weighted minimum edit distance between two strings.

    A (len(target) + 1) x (len(source) + 1) table is filled row by row.
    Row index ``i`` counts consumed characters of ``target`` and column
    index ``j`` counts consumed characters of ``source``.  Step costs come
    from ``insertCost``, ``deleteCost`` and ``subCost``.

    Args:
        target: First string; its characters are charged via ``insertCost``.
        source: Second string; its characters are charged via ``deleteCost``.

    Returns:
        The value in the bottom-right table cell, i.e. the total cost of the
        cheapest edit sequence.  Empty inputs are supported.
    """
    n = len(target)
    m = len(source)
    distance = [[0] * (m + 1) for _ in range(n + 1)]

    # First column: only characters of ``target`` have been consumed.
    for i in range(1, n + 1):
        distance[i][0] = distance[i - 1][0] + insertCost(target[i - 1])
    # First row: only characters of ``source`` have been consumed.
    for j in range(1, m + 1):
        distance[0][j] = distance[0][j - 1] + deleteCost(source[j - 1])

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            # Use the same cost helpers as the border cells so that the
            # whole table follows one cost model.
            distance[i][j] = min(
                distance[i - 1][j] + insertCost(target[i - 1]),
                distance[i][j - 1] + deleteCost(source[j - 1]),
                distance[i - 1][j - 1] + subCost(source[j - 1], target[i - 1]),
            )

    # Index with n and m, not the loop variables: the loops never run (and
    # never bind i / j) when one of the strings is empty.
    return distance[n][m]
def subCost(x, y):
    """Return the cost of substituting ``x`` with ``y``.

    Differing values cost 2; equal values cost 0.
    """
    if x != y:
        return 2
    return 0
def insertCost(x):
    """Return the cost of inserting ``x``; always 1, whatever ``x`` is."""
    cost = 1
    return cost
def deleteCost(x):
    """Return the cost of deleting ``x``; always 1, whatever ``x`` is."""
    cost = 1
    return cost
# The two words to compare.  Interactive input is disabled because the
# hosting environment (cloud9) does not accept it:
# word1 = raw_input("Enter A Word: ")
# word2 = raw_input("Enter The Second Word: ")
word1 = "wax"
word2 = "and"

# Working copies with leading and trailing whitespace removed
# (str.strip() trims both ends, not just the right side).
word1x = word1.strip()
word2x = word2.strip()

# Length of the longer, unstripped word.  Nothing below reads it; it is
# kept so the module-level name stays available.
range_num = max(len(word1), len(word2))

# Display the minimum distance between the two specified strings.  A single
# pre-formatted argument prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("The minimum edit distance between S1 and S2 is:  %s !"
      % (minEditDist(word1x, word2x),))
print(word1x)
print(word2x)
# Module-level log of operations; minEditDist appends to it and, in this
# variant of the script, so do insertCost and deleteCost.
path = []


def minEditDist(target, source):
    """Return the weighted minimum edit distance and log steps in ``path``.

    The table is filled exactly as in the plain version: row index ``i``
    counts consumed characters of ``target``, column index ``j`` counts
    consumed characters of ``source``.

    Side effects:
        Appends "S" to the global ``path`` (and prints ``path``) for every
        cell where the diagonal step is strictly cheaper than both other
        steps.  This also fires when ``subCost`` is 0, i.e. on a match.
        Entries are logged for every such cell of the table, not only for
        cells on the final cheapest route, so ``path`` is a trace of the
        table fill rather than an alignment.

    Returns:
        The value in the bottom-right table cell.  Empty inputs are
        supported.
    """
    n = len(target)
    m = len(source)
    distance = [[0] * (m + 1) for _ in range(n + 1)]

    for i in range(1, n + 1):
        distance[i][0] = distance[i - 1][0] + insertCost(target[i - 1])
    for j in range(1, m + 1):
        distance[0][j] = distance[0][j - 1] + deleteCost(source[j - 1])

    for i in range(1, n + 1):
        for j in range(1, m + 1):
            sc = subCost(source[j - 1], target[i - 1])
            from_above = distance[i - 1][j] + 1
            from_left = distance[i][j - 1] + 1
            from_diagonal = distance[i - 1][j - 1] + sc
            distance[i][j] = min(from_above, from_left, from_diagonal)
            if from_above > from_diagonal and from_left > from_diagonal:
                path.append("S")
                # Single-argument call form: same output under the Python 2
                # print statement and the Python 3 print function.
                print(path)

    # Index with n and m, not the loop variables: the loops never run (and
    # never bind i / j) when one of the strings is empty.
    return distance[n][m]
def subCost(x, y):
    """Return the substitution cost: 0 when ``x == y``, otherwise 2."""
    return 0 if x == y else 2
def insertCost(x):
    """Record an insertion in the global ``path`` and return its cost, 1.

    The "I" marker is appended on every call, whatever the caller then does
    with the returned cost.
    """
    marker = "I"
    path.append(marker)
    return 1
def deleteCost(x):
    """Record a deletion in the global ``path`` and return its cost, 1.

    The "D" marker is appended on every call, whatever the caller then does
    with the returned cost.
    """
    marker = "D"
    path.append(marker)
    return 1
...
# Return the minimum distance using all the table cells
def backtrace(i, j):
    """Return the sequence of edit steps that leads to ``distance[i][j]``.

    Meant to be nested inside ``minEditDist`` so that ``distance``,
    ``target`` and ``source`` are read from the enclosing scope.  The table
    is walked from cell (i, j) back to (0, 0); the collected steps are then
    returned in forward order, one letter per step:

        "M"  diagonal step, ``target[i-1] == source[j-1]`` (cost 0)
        "S"  diagonal step, characters differ (cost 2)
        "D"  step from row ``i - 1`` (cost 1)
        "I"  step from column ``j - 1`` (cost 1)

    NOTE(review): the table's first column is filled with ``insertCost``
    and its first row with ``deleteCost``, so the "D"/"I" letters may be
    swapped relative to that naming -- confirm the intended direction.
    """
    steps = []
    while i > 0 or j > 0:
        here = distance[i][j]
        if i > 0 and j > 0:
            # Comparing table values alone is not enough: a neighbouring
            # insert/delete path can make the diagonal cell equal to this
            # one even though the two characters differ.
            same = target[i - 1] == source[j - 1]
            diagonal = distance[i - 1][j - 1]
            if (same and diagonal == here) or (not same and diagonal + 2 == here):
                steps.append("M" if same else "S")
                i -= 1
                j -= 1
                continue
        if i > 0 and distance[i - 1][j] + 1 == here:
            steps.append("D")
            i -= 1
        elif j > 0 and distance[i][j - 1] + 1 == here:
            steps.append("I")
            j -= 1
        else:
            # No neighbouring cell explains this value; stop here.
            break
    steps.reverse()
    return "".join(steps)
return distance[i][j], backtrace(i, j)