Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/73.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/0/performance/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
提高R中(带权重的Levenshtein距离)脚本的性能_R_Performance_Levenshtein Distance_Stringdist - Fatal编程技术网

提高R中(带权重的Levenshtein距离)脚本的性能

提高R中(带权重的Levenshtein距离)脚本的性能,r,performance,levenshtein-distance,stringdist,R,Performance,Levenshtein Distance,Stringdist,我正在使用Levenshtein距离度量进行大量的字符串比较,但是因为我需要能够解释字符串潜在结构中的空间邻接,所以我不得不制作自己的脚本,其中包括一个权重函数 我现在的问题是我的脚本效率很低。我必须进行大约600000次比较,而完成脚本需要几个小时。因此,我正在寻找一种使我的脚本更有效率的方法,但作为一个自学成才的人,我不知道如何自己解决这个问题 以下是功能: zeros <- function(lengthA,lengthB){ m <- matrix(c(rep(0,len

我正在使用Levenshtein距离度量进行大量的字符串比较,但是因为我需要能够解释字符串潜在结构中的空间邻接,所以我不得不制作自己的脚本,其中包括一个权重函数

我现在的问题是我的脚本效率很低。我必须进行大约600000次比较,而完成脚本需要几个小时。因此,我正在寻找一种使我的脚本更有效率的方法,但作为一个自学成才的人,我不知道如何自己解决这个问题

以下是功能:

zeros <- function(lengthA,lengthB){
  m <- matrix(c(rep(0,lengthA*lengthB)),nrow=lengthA,ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights == TRUE){

    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                           "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
                      # the cost_weight matrix
    indexA <- which(cost_weight==A, arr.ind=TRUE)
    indexB <- which(cost_weight==B, arr.ind=TRUE)
    walk <- abs(indexA[1]-indexB[1])+abs(indexA[2]-indexB[2])
    w <- walk/max_walk
  }

  else {w <- 1}

  return(w)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  for (i in seq(to=nchar(A))){ 
    D[i + 1,1] <- D[i,1] + deletion * weight(As[i],Bs[1], weights)
  }
  for (j in seq(to=nchar(B))){ 
    D[1,j + 1] <- D[1,j] + insertion * weight(As[1],Bs[j], weights)
  }
  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
        } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight(As[i],Bs[j], weights),
                              D[i,j + 1] + deletion * weight(As[i],Bs[j], weights),
                              D[i,j]     + substitution * weight(As[i],Bs[j], weights))
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

zeros以下代码已经有了一些改进(您的代码;计算结果与以前相同,与
stringdist
不同),但我相信它可以更加简化和加速

zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}

zero这两个距离也不一致:
levenshtein(A,B)
为11.25,
stringdist(A,B,method=“lv”)
为24。这就是问题所在,因为
levenshtein()
根据
weight()
函数下定义的
成本权重
矩阵对字母进行权重计算。
zeros <- function(lengthA,lengthB){
  m <- matrix(0, nrow=lengthA, ncol=lengthB)
  return(m)
}


weight <- function(A,B,weights){
  if (weights){
    # cost_weight defines the matrix structure of the AOI-placement
    cost_weight <- matrix(c("a","b","c","d","e","f","g","h","i","j","k","l",
                            "m","n","o","p","q","r","s","t","u","v","w","x"),
                          nrow=6)

    max_walk <- 8.00  # defined as the maximum posible distance between letters in 
    # the cost_weight matrix
    amats <- lapply(A, `==`, y=cost_weight)
    bmats <- lapply(B, `==`, y=cost_weight)
    walk <- mapply(function(a, b){
      sum(abs(which(a, arr.ind=TRUE) - which(b, arr.ind=TRUE)))
    }, amats, bmats)
    return(walk/max_walk)
  }
  else return(1)
}


dist <- function(A, B, insertion, deletion, substitution, weights=TRUE){
  #browser()
  D <- zeros(nchar(A)+1,nchar(B)+1)
  As <- strsplit(A,"")[[1]]
  Bs <- strsplit(B,"")[[1]]
  # filling out the matrix
  weight.mat <- outer(As, Bs, weight, weights=weights)
  D[,1] <- c(0, deletion * cumsum(weight.mat[, 1]))
  D[1,] <- c(0, insertion * cumsum(weight.mat[1,]))

  for (i in seq(to=nchar(A))){ 
    for (j in seq(to=nchar(B))){
      if (As[i] == Bs[j]){
        D[i + 1,j + 1] <- D[i,j]
      } 
      else{
        D[i + 1,j + 1] <- min(D[i + 1,j] + insertion * weight.mat[i, j],
                              D[i,j + 1] + deletion * weight.mat[i, j],
                              D[i,j]     + substitution * weight.mat[i, j])
      }
    }
  }
  return(D)
}


levenshtein <- function(A, B, insertion=1, deletion=1, substitution=1){
  # Compute levenshtein distance between iterables A and B

  if (nchar(A) == nchar(B) & A == B){
    return(0)
  }

  if (nchar(B) > nchar(A)){
    C <- A
    A <- B
    B <- A
    #(A, B) <- (B, A)
  }

  if (nchar(A) == 0){
    return (nchar(B))
  }

  else{
    return (dist(A, B, insertion, deletion, substitution)[nchar(A),nchar(B)])
  }
}