提高R中(带权重的Levenshtein距离)脚本的性能
我正在使用Levenshtein距离度量进行大量的字符串比较,但是因为我需要能够解释字符串潜在结构中的空间邻接,所以我不得不制作自己的脚本,其中包括一个权重函数 我现在的问题是我的脚本效率很低。我必须进行大约600000次比较,而完成脚本需要几个小时。因此,我正在寻找一种使我的脚本更有效率的方法,但作为一个自学成才的人,我不知道如何自己解决这个问题 以下是功能:提高R中(带权重的Levenshtein距离)脚本的性能,r,performance,levenshtein-distance,stringdist,R,Performance,Levenshtein Distance,Stringdist,我正在使用Levenshtein距离度量进行大量的字符串比较,但是因为我需要能够解释字符串潜在结构中的空间邻接,所以我不得不制作自己的脚本,其中包括一个权重函数 我现在的问题是我的脚本效率很低。我必须进行大约600000次比较,而完成脚本需要几个小时。因此,我正在寻找一种使我的脚本更有效率的方法,但作为一个自学成才的人,我不知道如何自己解决这个问题 以下是功能: zeros <- function(lengthA,lengthB){ m <- matrix(c(rep(0,len
zeros <- function(lengthA,lengthB){
m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights == TRUE){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
indexA <- which(cost_weight==A, arr.ind=TRUE)
indexB <- which(cost_weight==B, arr.ind=TRUE)
walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
w <- walk/max_walk
}
else {w <- 1}
return(w)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
for (i in seq(to=nchar(A))){
D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
}
for (j in seq(to=nchar(B))){
D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
}
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
D[i,j] + substitution * weight(As[i],Bs[j], weights))
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}
zeros以下代码已经有了一些改进(您的代码;计算结果与以前相同,与stringdist
不同),但我相信它可以更加简化和加速
zeros <- function(lengthA,lengthB){
m <- matrix(0, nrow=lengthA, ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
amats <- lapply(A, `==`, y=cost_weight)
bmats <- lapply(B, `==`, y=cost_weight)
walk <- mapply(function(a, b){
sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
}, amats, bmats)
return(walk/max_walk)
}
else return(1)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
#browser()
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
weight.mat <- outer(As, Bs, weight, weights=weights)
D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
D[i,j + 1] + deletion * weight.mat[i, j],
D[i,j] + substitution * weight.mat[i, j])
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}
zero这两个距离也不一致:levenshtein(A,B)
为11.25,stringdist(A,B,method=“lv”)
为24。这就是问题所在,因为levenshtein()
根据weight()
函数下定义的成本权重
矩阵对字母进行权重计算。
zeros <- function(lengthA,lengthB){
m <- matrix(0, nrow=lengthA, ncol=lengthB)
return(m)
}
weight <- function(A,B,weights){
if (weights){
# cost_weight defines the matrix structure of the AOI-placement
cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
"m","n","o","p","q","r","s","t","u","v","w","x"),
nrow=6)
max_walk <- 8.00 # defined as the maximum posible distance between letters in
# the cost_weight matrix
amats <- lapply(A, `==`, y=cost_weight)
bmats <- lapply(B, `==`, y=cost_weight)
walk <- mapply(function(a, b){
sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
}, amats, bmats)
return(walk/max_walk)
}
else return(1)
}
dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
#browser()
D <- zeros(nchar(A)+1,nchar(B)+1)
As <- strsplit(A,"")[[1]]
Bs <- strsplit(B,"")[[1]]
# filling out the matrix
weight.mat <- outer(As, Bs, weight, weights=weights)
D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))
for (i in seq(to=nchar(A))){
for (j in seq(to=nchar(B))){
if (As[i] == Bs[j]){
D[i + 1,j + 1] <- D[i,j]
}
else{
D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
D[i,j + 1] + deletion * weight.mat[i, j],
D[i,j] + substitution * weight.mat[i, j])
}
}
}
return(D)
}
levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
# Compute levenshtein distance between iterables A and B
if (nchar(A) == nchar(B) & A == B){
return(0)
}
if (nchar(B) > nchar(A)){
C <- A
A <- B
B <- A
#(A, B) <- (B, A)
}
if (nchar(A) == 0){
return (nchar(B))
}
else{
return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
}
}