前一个值不用于R中的循环计算_R_Loops_Recursion_Reduce

前一个值不用于R中的循环计算

r loops recursion

前一个值不用于R中的循环计算,r,loops,recursion,reduce,R,Loops,Recursion,Reduce,我是一名初级程序员。我在使用以前的值（如递归）进行循环计算时遇到问题。我的数据示例： dt <- data.table(a = c(0:4), b = c( 0, 1, 2, 1, 3)) dt我不认为这会更快，但是这里有一种方法不用显式循环就可以做到 dt[, y := purrr::accumulate2(a, b, function(last, a, b) (last + b)*a , .init = 0)[-1]]

我是一名初级程序员。我在使用以前的值（如递归）进行循环计算时遇到问题。我的数据示例：

 # Sample data from the question: two numeric columns, a and b.
 dt <- data.table(a = 0:4, b = c(0, 1, 2, 1, 3))

dt

我不认为这会更快，但这里有一种不用显式循环就可以做到的方法：
# Add column y by folding over a and b in parallel:
#   y[i] = (y[i - 1] + b[i]) * a[i], seeded with 0 through .init.
# accumulate2() returns the seed as its first element, so [-1] drops it
# to leave one value per row.
dt[, y := purrr::accumulate2(
  a,
  b,
  function(acc, a_i, b_i) {
    (acc + b_i) * a_i
  },
  .init = 0
)[-1]]

dt      
#    a b  y
# 1: 0 0  0
# 2: 1 1  1
# 3: 2 2  6
# 4: 3 1 21
# 5: 4 3 96

这是一个基本的R解决方案
根据 @ThetaFC 提供的信息，提速的关键是使用矩阵或向量（而不是 data.frame 或 data.table）。因此，最好在计算 df$y 之前先进行以下预处理，即：

或者一个非递归函数（我想这将比递归方法快得多）

由于各次迭代之间存在依赖，这种计算无法利用 R 的向量化优势。不过，速度变慢似乎确实源于对 data.frame 或 data.table 进行索引的性能开销。
有趣的是，与把 a、b 和 y 作为 data.table 或 data.frame 中的列来访问相比，直接把它们作为数值向量（2*10^5 行时快 1000 倍以上）或矩阵的列（2*10^5 行时快 100 倍以上）来访问，可以大大加快循环速度。
这一古老的讨论可能仍会对这一相当令人惊讶的结果有所启发：
请注意，我还构造了一个不同的玩具 data.frame，这样就可以测试更大的示例而不会得到 Inf（因为在原示例中 y 会随着 i 的增大而迅速增长）：
选项data.frame
（根据您的示例，嵌入data.frame
或data.table
中的数字向量）：
选项matrix
（data.frame
在循环之前转换为matrix
）：
一个选项是使用Rcpp
，因为这个递归方程很容易用C++编写：
library(Rcpp)

# Compile the recurrence y[i] = (y[i-1] + b[i]) * a[i] as a C++ function.
# Note the argument order: func(b, a).
# y[0] is never assigned in the loop and keeps its initial value, so the
# first elements of a and b are not read.
# Returns a numeric vector with the same length as b.
cppFunction('
NumericVector func(NumericVector b, NumericVector a) {
    // operator[] is not bounds-checked: reject mismatched lengths instead
    // of reading past the end of a.
    if (a.size() != b.size()) {
        stop("a and b must have the same length");
    }

    int len = b.size();
    NumericVector y(len);

    for (int i = 1; i < len; i++) {
        y[i] = (y[i-1] + b[i]) * a[i];
    }

    return y;
}
')
func(c(0, 1, 2, 1, 3), c(0:4))
#[1]  0  1  6 21 96

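library(Rcpp)
cppFunction("
NumericVector func(NumericVector b, NumericVector a) {
    int len = b.size();
    NumericVector y(len);

    for (int i = 1; i < len; i++) {
        y[i] = (y[i-1] + b[i]) * a[i];
    }

    return(y);
}
")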

定时代码：
# Timing comparison on 1e7 elements: plain R loop vs. a call to func().
vec_length <- 1e7
dt <- data.frame(a=1:vec_length, b=1:vec_length, y=0)
# Pull the columns out as standalone numeric vectors before looping.
y <- as.numeric(dt$y)
a <- as.numeric(dt$a)
b <- as.numeric(dt$b)

# Pure-R loop over the recurrence y[i] = (y[i - 1] + b[i]) * a[i].
system.time(for (i in 2:length(y)) {
    y[i] <- (y[i - 1] + b[i]) * a[i]
})
#   user  system elapsed 
#  19.22    0.06   19.44 

# NOTE(review): func is not defined in this snippet; presumably the
# Rcpp-compiled function of the same name. Arguments are passed as (b, a).
system.time(func(b, a))
#   user  system elapsed 
#   0.09    0.02    0.09 

当 vec_length 只有 20000 行时，使用我在下面的答案中提供的玩具示例，您的第二个解决方案（作用在 df 上）在我的机器上花费了 27 秒。通过矩阵或单独的数值向量来访问要快得多（0.02 秒）。速度瓶颈似乎在于对 data.frame 或 data.table 的索引，而不是循环本身。

@ThetaFC 谢谢你的信息，我的答案现已更新。
# Recursively build the first k terms of the series
#   y[1] = 0,  y[k] = (y[k - 1] + b[k]) * a[k].
# Reads the vectors `a` and `b` from the enclosing environment.
#
# k: positive integer, the number of terms to compute.
# Returns a numeric vector of length k.
f <- function(k) {
  if (k == 1) return(0)
  # Evaluate the recursive call once and reuse it: calling f(k - 1) twice
  # per level doubles the work at every step (2^k calls overall).
  prev <- f(k - 1)
  c(prev, (tail(prev, 1) + b[k]) * a[k])
}

# Call f with the row count of df and store the result as column y.
df$y <- f(nrow(df))

# Non-recursive evaluation of the series
#   y[1] = 0,  y[k] = (y[k - 1] + b[k]) * a[k],
# using the expanded form y[k] = sum over j in 2..k of b[j] * prod(a[j..k]).
# Reads the vectors `a` and `b` from the enclosing environment.
#
# k: vector of positive term indices (the function is vectorised over k).
# Returns one numeric value per element of k.
g <- Vectorize(function(k) {
  # 2:k counts DOWN to c(2, 1) when k == 1, which would pull a[1] and b[1]
  # into the sum; the first term of the series is 0 by definition.
  if (k < 2) {
    return(0)
  }
  idx <- 2:k
  # Element j of rev(cumprod(rev(a[idx]))) is prod(a[idx[j]..k]).
  sum(rev(cumprod(rev(a[idx]))) * b[idx])
})

# Evaluate g at every row index of df and store the result as column y.
# seq_len() gives an empty sequence for a zero-row df, whereas seq(0)
# would give c(1, 0).
df$y <- g(seq_len(nrow(df)))

> df
  a b  y
1 0 0  0
2 1 1  1
3 2 2  6
4 3 1 21
5 4 3 96

# Benchmark: run the recurrence y[i] = (y[i - 1] + b[i]) * a[i] directly on
# data.frame columns accessed with `$`.
vec_length <- 200000
dt <- data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0)
# Every iteration reads and writes through dt$<column>[i]; the recorded
# timing follows the loop.
system.time(for (i in 2:nrow(dt)) {
  dt$y[i] <- (dt$y[i - 1] + dt$b[i]) * dt$a[i]
})
#user  system elapsed 
#79.39  146.30  225.78
#NOTE: Sorry, I didn't have the patience to let the data.table version finish for vec_length=2*10^5.  
# Last six values of y, for comparison with the other variants.
tail(dt$y)
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674

# Benchmark: same recurrence, but a, b and y are extracted from the
# data.frame once and then handled as standalone numeric vectors.
vec_length <- 200000
dt <- data.frame(
  a = seq(from = 0, to = 1, length.out = vec_length),
  b = seq(from = 0, to = -1, length.out = vec_length),
  y = 0
)
y <- as.numeric(dt[["y"]])
a <- as.numeric(dt[["a"]])
b <- as.numeric(dt[["b"]])
# y[i] = (y[i - 1] + b[i]) * a[i], starting from the second element.
system.time(
  for (i in seq_along(y)[-1]) {
    y[i] <- (y[i - 1] + b[i]) * a[i]
  }
)
#user  system elapsed 
#0.03    0.00    0.03 
tail(y)
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674

# Benchmark: same recurrence, with the data.frame converted to a numeric
# matrix before the loop. Column positions: 1 = a, 2 = b, 3 = y.
vec_length <- 200000
dt <- as.matrix(data.frame(
  a = seq(from = 0, to = 1, length.out = vec_length),
  b = seq(from = 0, to = -1, length.out = vec_length),
  y = 0
))
system.time(for (i in 2:nrow(dt)) {
  # Write to column 3 (y), not column 1: assigning to column 1 would
  # overwrite a and leave y at 0, contradicting the tail() output below.
  dt[i, 3] <- (dt[i - 1, 3] + dt[i, 2]) * dt[i, 1]
})
#user  system elapsed 
#0.67    0.01    0.69
tail(dt[, 3])
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674
#NOTE: a matrix is actually a vector but with an additional attribute (it's "dim") that says how the "matrix" should be organized into rows and columns

# Benchmark: run the recurrence y[i] = (y[i - 1] + b[i]) * a[i] on a
# data.frame using [row, column] indexing (column 1 = a, 2 = b, 3 = y).
vec_length <- 200000
dt <- data.frame(a=seq(from=0, to=1, length.out = vec_length), b=seq(from=0, to=-1, length.out = vec_length), y=0)
# Every iteration indexes the data.frame itself; the recorded timing
# follows the loop.
system.time(for (i in 2:nrow(dt)) {
    dt[i, 3] <- (dt[(i - 1), 3] + dt[i, 2]) * dt[i, 1]
})
#user  system elapsed 
#110.69    0.03  112.01 
# Last six values of y, for comparison with the other variants.
tail(dt[,3])
#[1] -554.1953 -555.1842 -556.1758 -557.1702 -558.1674 -559.1674

library(Rcpp)

# Compile the recurrence y[i] = (y[i-1] + b[i]) * a[i] as a C++ function.
# Note the argument order: func(b, a).
# y[0] is never assigned in the loop and keeps its initial value, so the
# first elements of a and b are not read.
# Returns a numeric vector with the same length as b.
cppFunction('
NumericVector func(NumericVector b, NumericVector a) {
    // operator[] is not bounds-checked: reject mismatched lengths instead
    // of reading past the end of a.
    if (a.size() != b.size()) {
        stop("a and b must have the same length");
    }

    int len = b.size();
    NumericVector y(len);

    for (int i = 1; i < len; i++) {
        y[i] = (y[i-1] + b[i]) * a[i];
    }

    return y;
}
')
func(c(0, 1, 2, 1, 3), c(0:4))
#[1]  0  1  6 21 96

# Timing comparison on 1e7 elements: plain R loop vs. a call to func().
vec_length <- 1e7
dt <- data.frame(a=1:vec_length, b=1:vec_length, y=0)
# Pull the columns out as standalone numeric vectors before looping.
y <- as.numeric(dt$y)
a <- as.numeric(dt$a)
b <- as.numeric(dt$b)

# Pure-R loop over the recurrence y[i] = (y[i - 1] + b[i]) * a[i].
system.time(for (i in 2:length(y)) {
    y[i] <- (y[i - 1] + b[i]) * a[i]
})
#   user  system elapsed 
#  19.22    0.06   19.44 

# NOTE(review): func is not defined in this snippet; presumably the
# Rcpp-compiled function of the same name. Arguments are passed as (b, a).
system.time(func(b, a))
#   user  system elapsed 
#   0.09    0.02    0.09