Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/oop/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Optimization R中Verhoeff算法的优化_Optimization_R_Vectorization_Check Digit - Fatal编程技术网

Optimization R中Verhoeff算法的优化

Optimization R中Verhoeff算法的优化,optimization,r,vectorization,check-digit,Optimization,R,Vectorization,Check Digit,我编写了以下函数来计算R中的校验位 verhoeffCheck <- function(x) { ## calculates check digit based on Verhoeff algorithm ## note that due to the way strsplit works, to call for vector x, use sapply(x,verhoeffCheck) ## check for string since leading zeros with numb

我编写了以下函数来计算R中的校验位

verhoeffCheck <- function(x)
{
## calculates check digit based on Verhoeff algorithm
## note that due to the way strsplit works, to call for vector x, use sapply(x,verhoeffCheck)

## check for string since leading zeros with numbers will be lost
if (class(x)!="character"){stop("Must enter a string")}

#split and convert to numbers
digs <- strsplit(x,"")[[1]]
digs <- as.numeric(digs)

digs <- rev(digs)   ## right to left algorithm

## tables required for D_5 group

d5_mult <- matrix(c(
                 0:9,
                 c(1:4,0,6:9,5),
                 c(2:4,0:1,7:9,5:6),
                 c(3:4,0:2,8:9,5:7),
                 c(4,0:3,9,5:8),
                 c(5,9:6,0,4:1),
                 c(6:5,9:7,1:0,4:2),
                 c(7:5,9:8,2:0,4:3),
                 c(8:5,9,3:0,4),
                 9:0
                 ),10,10,byrow=T)

d5_perm <- matrix(c(
                 0:9,
                 c(1,5,7,6,2,8,3,0,9,4),
                 c(5,8,0,3,7,9,6,1,4,2),
                 c(8,9,1,6,0,4,3,5,2,7),
                 c(9,4,5,3,1,2,6,8,7,0),
                 c(4,2,8,6,5,7,3,9,0,1),
                 c(2,7,9,3,8,0,6,4,1,5),
                 c(7,0,4,6,9,1,3,2,5,8)
                 ),8,10,byrow=T)

d5_inv <- c(0,4:1,5:9)

## apply algoritm - note 1-based indexing in R
d <- 0

for (i in 1:length(digs)){
    d <- d5_mult[d+1,(d5_perm[(i%%8)+1,digs[i]+1])+1]
    }

d5_inv[d+1]
}
为了在字符串向量上运行,必须使用sapply。这部分是因为使用了strsplit,它返回一个向量列表。即使只有中等大小的输入,这也会影响性能

如何将此函数矢量化


我还意识到,在每次迭代中必须创建表,这会损失一些性能。将这些存储在新环境中是更好的解决方案吗?

如果您的输入字符串可以包含不同数量的字符,那么我看不出有任何方法可以绕过Lappy调用或plyr等价物。诀窍是将它们移动到函数内部,以便VerhoefCheck可以接受向量输入。这样,您只需要创建一次矩阵

verhoeffCheckNew <- function(x)
{
## calculates check digit based on Verhoeff algorithm

## check for string since leading zeros with numbers will be lost
  if (!is.character(x)) stop("Must enter a string")

  #split and convert to numbers
  digs <- strsplit(x, "")
  digs <- lapply(digs, function(x) rev(as.numeric(x)))

  ## tables required for D_5 group
  d5_mult <- matrix(c(
                   0:9,
                   c(1:4,0,6:9,5),
                   c(2:4,0:1,7:9,5:6),
                   c(3:4,0:2,8:9,5:7),
                   c(4,0:3,9,5:8),
                   c(5,9:6,0,4:1),
                   c(6:5,9:7,1:0,4:2),
                   c(7:5,9:8,2:0,4:3),
                   c(8:5,9,3:0,4),
                   9:0
                   ),10,10,byrow=T)

  d5_perm <- matrix(c(
                   0:9,
                   c(1,5,7,6,2,8,3,0,9,4),
                   c(5,8,0,3,7,9,6,1,4,2),
                   c(8,9,1,6,0,4,3,5,2,7),
                   c(9,4,5,3,1,2,6,8,7,0),
                   c(4,2,8,6,5,7,3,9,0,1),
                   c(2,7,9,3,8,0,6,4,1,5),
                   c(7,0,4,6,9,1,3,2,5,8)
                   ),8,10,byrow=T)

  d5_inv <- c(0,4:1,5:9)

  ## apply algorithm - note 1-based indexing in R      
  sapply(digs, function(x)
  {
    d <- 0  
    for (i in 1:length(x)){
        d <- d5_mult[d + 1, (d5_perm[(i %% 8) + 1, x[i] + 1]) + 1]
        }  
    d5_inv[d+1]
  })
}
有关tic和toc,请参阅

进一步思考:

您可能需要对转换为数值时返回NA的字符串和其他字符串进行额外的输入检查


由于您只处理整数,因此使用整数而不是双倍整数可能会带来一些性能上的好处。使用as.integer而不是as.numeric,并将L附加到矩阵中的值。

Richie C很好地回答了矢量化问题;至于只创建一次表而不混乱全局名称空间,一个不需要包的快速解决方案是

verhoeffCheck <- local(function(x)
{
## calculates check digit based on Verhoeff algorithm
## note that due to the way strsplit works, to call for vector x, use sapply(x,verhoeffCheck)
## check for string since leading zeros with numbers will be lost
if (class(x)!="character"){stop("Must enter a string")}
#split and convert to numbers
digs <- strsplit(x,"")[[1]]
digs <- as.numeric(digs)
digs <- rev(digs)   ## right to left algorithm
## apply algoritm - note 1-based indexing in R
d <- 0
for (i in 1:length(digs)){
    d <- d5_mult[d+1,(d5_perm[(i%%8)+1,digs[i]+1])+1]
    }
d5_inv[d+1]
})

assign("d5_mult", matrix(c(
    0:9, c(1:4,0,6:9,5), c(2:4,0:1,7:9,5:6), c(3:4,0:2,8:9,5:7),
    c(4,0:3,9,5:8), c(5,9:6,0,4:1), c(6:5,9:7,1:0,4:2), c(7:5,9:8,2:0,4:3),
    c(8:5,9,3:0,4), 9:0), 10, 10, byrow = TRUE), 
    envir = environment(verhoeffCheck))

assign("d5_perm", matrix(c(
    0:9, c(1,5,7,6,2,8,3,0,9,4), c(5,8,0,3,7,9,6,1,4,2),
    c(8,9,1,6,0,4,3,5,2,7), c(9,4,5,3,1,2,6,8,7,0), c(4,2,8,6,5,7,3,9,0,1),
    c(2,7,9,3,8,0,6,4,1,5), c(7,0,4,6,9,1,3,2,5,8)), 8, 10, byrow = TRUE),
    envir = environment(verhoeffCheck))

assign("d5_inv", c(0,4:1,5:9), envir = environment(verhoeffCheck))
## Now just use the function
它将数据保存在函数的环境中。你可以给它计时,看看它有多快

希望这有帮助


Allan

我们首先定义查找矩阵。我已经在某种程度上把它们布置好了 这将使他们更容易对照参考资料进行检查,例如。

digits2比digits快得多,但对用户的影响有限 整个运行时

verhoeff2 <- function(x) {
  digs <- digits2(x)

  c <- 0
  for (i in 1:length(digs)) {
    c <- d(c, p(i, digs[i]))
  }

  d5_inv[c + 1]
}
verhoeff2(142857)

## [1] 0

microbenchmark(
  verhoeff(142857),
  verhoeff2(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##   verhoeff(142857) 33.06 34.49  35.19 35.92 73.38   100
##  verhoeff2(142857) 20.98 22.58  24.05 25.28 48.69   100
< >为了使它更快,我们可以尝试C++。< /P>
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff3_c(IntegerVector digits, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int n = digits.size();
  int c = 0;

  for(int i = 0; i < n; ++i) {
    int p = perm(i % 8, digits[i]);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff3 <- function(x) {
  verhoeff3_c(digits(x), d5_mult, d5_perm, d5_inv)
}
verhoeff3(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##  verhoeff2(142857) 21.00 22.85  25.53 27.11 63.71   100
##  verhoeff3(142857) 16.75 17.99  18.87 19.64 79.54   100
这并没有带来多大的改善。也许我们可以做得更好,如果我们 将数字传递给C++,并处理循环中的位数:

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff4_c(int number, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int c = 0;
  int i = 0;

  for (int i = 0; number > 0; ++i, number /= 10) {
    int p = perm(i % 8, number % 10);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff4 <- function(x) {
  verhoeff4_c(x, d5_mult, d5_perm, d5_inv)
}
verhoeff4(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857),
  verhoeff4(142857)
)

## Unit: microseconds
##               expr    min     lq median     uq   max neval
##  verhoeff2(142857) 21.808 24.910 26.838 27.797 64.22   100
##  verhoeff3(142857) 17.699 18.742 19.599 20.764 81.67   100
##  verhoeff4(142857)  3.143  3.797  4.095  4.396 13.21   100
我们得到了回报:verhoeff4的速度大约是它的5倍
非常好!我发现了一个类似的加速。在as.numeric中包装最后一个lappy可以确保返回的是向量而不是列表。@詹姆斯:使用sapply而不是lappy可以为您做到这一点,而不需要as.numeric。谢谢Hadley,做得好!唯一的问题是,一般来说,前导零很重要,因此输入不能像digits2、verhoeff3和verhoeff4那样,通过对分割数字向量进行填充来转换为数字。此外,使用整数会限制输入的长度:对于32位整数,只有8位数字是安全的。条形码虽然使用不同的方案,但它有12位数字加上一个校验位。@杰姆斯,该策略仍将与数字向量一起工作,以增加范围,或者我希望C++可以从字符串中快速地从字符中循环。
library(microbenchmark)
digits <- function(x) {
  digs <- strsplit(as.character(x), "")[[1]]
  digs <- as.numeric(digs)
  rev(digs)
}

microbenchmark(
  digits(142857),
  verhoeff(142857)
)

## Unit: microseconds
##              expr   min    lq median    uq   max neval
##    digits(142857) 11.30 12.01  12.43 12.85 28.79   100
##  verhoeff(142857) 32.24 33.81  34.66 35.47 95.85   100
digits2 <- function(x) {
   n <- floor(log10(x))
   x %/% 10^(0:n) %% 10
}
digits2(12345)

## [1] 5 4 3 2 1

microbenchmark(
  digits(142857),
  digits2(142857)
)

## Unit: microseconds
##             expr    min     lq median     uq   max neval
##   digits(142857) 11.495 12.102 12.468 12.834 79.60   100
##  digits2(142857)  2.322  2.784  3.358  3.561 13.69   100
verhoeff2 <- function(x) {
  digs <- digits2(x)

  c <- 0
  for (i in 1:length(digs)) {
    c <- d(c, p(i, digs[i]))
  }

  d5_inv[c + 1]
}
verhoeff2(142857)

## [1] 0

microbenchmark(
  verhoeff(142857),
  verhoeff2(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##   verhoeff(142857) 33.06 34.49  35.19 35.92 73.38   100
##  verhoeff2(142857) 20.98 22.58  24.05 25.28 48.69   100
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff3_c(IntegerVector digits, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int n = digits.size();
  int c = 0;

  for(int i = 0; i < n; ++i) {
    int p = perm(i % 8, digits[i]);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff3 <- function(x) {
  verhoeff3_c(digits(x), d5_mult, d5_perm, d5_inv)
}
verhoeff3(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857)
)

## Unit: microseconds
##               expr   min    lq median    uq   max neval
##  verhoeff2(142857) 21.00 22.85  25.53 27.11 63.71   100
##  verhoeff3(142857) 16.75 17.99  18.87 19.64 79.54   100
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
int verhoeff4_c(int number, IntegerMatrix mult, IntegerMatrix perm,
                IntegerVector inv) {
  int c = 0;
  int i = 0;

  for (int i = 0; number > 0; ++i, number /= 10) {
    int p = perm(i % 8, number % 10);
    c = mult(c, p);
  }

  return inv[c];
}

verhoeff4 <- function(x) {
  verhoeff4_c(x, d5_mult, d5_perm, d5_inv)
}
verhoeff4(142857)

## [1] 3

microbenchmark(
  verhoeff2(142857),
  verhoeff3(142857),
  verhoeff4(142857)
)

## Unit: microseconds
##               expr    min     lq median     uq   max neval
##  verhoeff2(142857) 21.808 24.910 26.838 27.797 64.22   100
##  verhoeff3(142857) 17.699 18.742 19.599 20.764 81.67   100
##  verhoeff4(142857)  3.143  3.797  4.095  4.396 13.21   100