R 如何通过索引将不同大小的循环输出存储在初始化向量中

R 如何通过索引将不同大小的循环输出存储在初始化向量中,r,for-loop,vector,R,For Loop,Vector,问题陈述 假设您拥有以下数据: df <- data.frame(x = rep(0, 10), batch = rep(1:3,c(4,2,4))) x batch 1 0 1 2 0 1 3 0 1 4 0 1 5 0 2 6 0 2 7 0 3 8 0 3 9 0 3 10 0 3 输出 结果应该如下所示: x batch 1 0

问题陈述

假设您拥有以下数据:

df <- data.frame(x = rep(0, 10),
                 batch = rep(1:3,c(4,2,4)))

   x batch
1  0     1
2  0     1
3  0     1
4  0     1
5  0     2
6  0     2
7  0     3
8  0     3
9  0     3
10 0     3
输出

结果应该如下所示:

   x batch
1  0     1
2  1     1
3  1     1
4  0     1
5  1     2
6  1     2
7  1     3
8  0     3
9  1     3
10 1     3
[1] 0 1 1 0
[1] 1 1
[1] 1 0 1 1
其中,
out\x
的每次迭代如下所示:

   x batch
1  0     1
2  1     1
3  1     1
4  0     1
5  1     2
6  1     2
7  1     3
8  0     3
9  1     3
10 1     3
[1] 0 1 1 0
[1] 1 1
[1] 1 0 1 1
问题


在仍然使用base R的情况下,实现这一点的更快方法是什么?

一个解决方案是提醒自己,我可以使用向量索引向量

set.seed(2021)

for(i in seq_len(length(unique(df$batch)))){
  batch_val <- d[which(df$batch == i),]$batch
  #some algorithm to generate 1's and 0's, but using sample() here
  out_x <- sample(c(0,1), length(batch_val), replace = T)
  print(out_x)

  #save output
  idx <- which(df$batch == i)
  df$x[idx] <- out_x
}
set.seed(2021年)
对于(序列长度中的i(唯一(df$batch))){

批处理值使用
tapply
怎么样

out_x <- tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T))

#------
$`1`
[1] 0 1 1 1

$`2`
[1] 0 1

$`3`
[1] 1 1 1 1


定时测试:

microbenchmark::microbenchmark(f_loop(), f_apply())

#---------
Unit: microseconds
      expr     min       lq     mean  median      uq      max neval
  f_loop() 399.895 425.1975 442.7077 437.754 450.690  612.969   100
 f_apply() 100.449 106.9185 160.5557 110.913 114.909 4867.603   100
其中函数定义为

f_loop <- function(){
  
  idxb <- 1
  idxe <- length(df[which(df$batch == 1),]$batch)

  for(i in seq_len(length(unique(df$batch)))){
    
    batch_val <- df[which(df$batch == i),]$batch
    #some algorithm to generate 1's and 0's, but using sample() here
    out_x <- sample(c(0,1), length(batch_val), replace = T)
    #print(out_x)
    
    #save output
    df$x[idxb:idxe] <- out_x
    
    #update indices
    idxb <- idxb + length(out_x)
    
    if(i < length(unique(df$batch))) {
      idxe <- idxe + length(df[which(df$batch == i+1),]$batch) 
    }
  }
  
  return(df$x)
}


f_apply <- function() {
  unlist(tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T)))
}
f_循环
f_loop <- function(){
  
  idxb <- 1
  idxe <- length(df[which(df$batch == 1),]$batch)

  for(i in seq_len(length(unique(df$batch)))){
    
    batch_val <- df[which(df$batch == i),]$batch
    #some algorithm to generate 1's and 0's, but using sample() here
    out_x <- sample(c(0,1), length(batch_val), replace = T)
    #print(out_x)
    
    #save output
    df$x[idxb:idxe] <- out_x
    
    #update indices
    idxb <- idxb + length(out_x)
    
    if(i < length(unique(df$batch))) {
      idxe <- idxe + length(df[which(df$batch == i+1),]$batch) 
    }
  }
  
  return(df$x)
}


f_apply <- function() {
  unlist(tapply(df$batch, df$batch, function(x) sample(c(0,1), length(x), replace = T)))
}