提高R中（带权重的Levenshtein距离）脚本的性能_R_Performance_Levenshtein Distance_Stringdist

提高R中（带权重的Levenshtein距离）脚本的性能

r performance

提高R中（带权重的Levenshtein距离）脚本的性能,r,performance,levenshtein-distance,stringdist,R,Performance,Levenshtein Distance,Stringdist,我正在使用Levenshtein距离度量进行大量的字符串比较，但是因为我需要能够解释字符串潜在结构中的空间邻接，所以我不得不制作自己的脚本，其中包括一个权重函数我现在的问题是我的脚本效率很低。我必须进行大约600000次比较，而完成脚本需要几个小时。因此，我正在寻找一种使我的脚本更有效率的方法，但作为一个自学成才的人，我不知道如何自己解决这个问题以下是功能： zeros <- function(lengthA,lengthB){ m <- matrix(c(rep(0,len

我正在使用Levenshtein距离度量进行大量的字符串比较，但是因为我需要能够解释字符串潜在结构中的空间邻接，所以我不得不制作自己的脚本，其中包括一个权重函数

我现在的问题是我的脚本效率很低。我必须进行大约600000次比较，而完成脚本需要几个小时。因此，我正在寻找一种使我的脚本更有效率的方法，但作为一个自学成才的人，我不知道如何自己解决这个问题

以下是功能：

zeros <- function(lengthA,lengthB){
  m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights == TRUE){

    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                           "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
                      # the cost_weight matrix
    indexA <- which(cost_weight==A, arr.ind=TRUE)
    indexB <- which(cost_weight==B, arr.ind=TRUE)
    walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
    w <- walk/max_walk
  }

  else {w <- 1}

  return(w)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  for (i in seq(to=nchar(A))){ 
    D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
  }
  for (j in seq(to=nchar(B))){ 
    D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
  }
  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
        } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
                              D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
                              D[i,j]     + substitution * weight(As[i],Bs[j], weights))
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

zeros以下代码已经有了一些改进（您的代码；计算结果与以前相同，与stringdist
不同），但我相信它可以更加简化和加速
zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

zero这两个距离也不一致：levenshtein（A，B）
为11.25，stringdist（A，B，method=“lv”）
为24。这就是问题所在，因为levenshtein（）
根据weight（）
函数下定义的成本权重矩阵对字母进行权重计算。
zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}