在rstudio中有效地组合多个data.table排列
标签:r, data.table
我有一个data.table,其中包含任意的用户ID和测量得到的因变量:
library(data.table)
# Toy input: 10 rows pairing a whole-number user ID drawn from 0-100 with a
# measured dependent variable drawn from 0-10 and rounded to one decimal.
data <- data.table(
  user_id = round(runif(10, 0, 100)),
  dv = round(runif(10, 0, 10), 1)
)
我认为你不太可能获得巨大的速度提升,除非你的数据量非常大。但是,如果避免调用cbind,就可以写出一个更简短、速度提升不到两倍的版本(下面的method2)。我还给出了一个稍快一点的版本(method3)。注意,我把样本大小增加到了200,以便得到更有意义的基准测试结果。
library(data.table)
# Number of rows in the benchmark input.
N <- 200
# Benchmark input: N rows pairing a whole-number user ID drawn from 0-100 with
# a dependent variable drawn from 0-10 and rounded to one decimal.
data <- data.table(
  user_id = round(runif(N, 0, 100)),
  dv = round(runif(N, 0, 10), 1)
)
# Reference implementation: build the stacked table incrementally.
#
# For each n, takes the first n rows of the global `data`, tags them with a
# column N = n via cbind(), and appends the result to the accumulated table
# with rbindlist(). Growing the table on every iteration is kept on purpose so
# this remains the baseline the other methods are timed against.
#
# Takes no arguments; reads the global `data`.
# Returns a data.table holding, for every n in 1..nrow(data), the first n rows
# of `data` plus the column N. If `data` has no rows the loop never runs and
# an empty data.table() is returned.
method1 <- function() {
  # Defined up front so the result exists even when `data` has zero rows.
  data_combined <- data.table()
  # seq_len() instead of 1:nrow(data): the latter iterates over c(1, 0) when
  # `data` is empty.
  for (n in seq_len(nrow(data))) {
    chunk <- cbind(data[1:n, ], N = n)
    # Plain if/else because the condition is a scalar; ifelse() is meant for
    # vectors and should not carry assignments in its arguments.
    if (n > 1) {
      data_combined <- rbindlist(list(data_combined, chunk))
    } else {
      data_combined <- chunk
    }
  }
  data_combined
}
# Build every prefix of the global `data` with lapply() and stack them once.
#
# For each i, creates a data.table made of the first i rows of `data` plus a
# column N = i; all pieces are then combined by a single rbindlist() call.
#
# Takes no arguments; reads the global `data`.
# Returns the stacked data.table. When `data` has no rows, lapply() produces
# an empty list and rbindlist() is called on that empty list.
method2 <- function() {
  # seq_len() instead of 1:nrow(data): the latter yields c(1, 0) when `data`
  # is empty.
  pieces <- lapply(
    seq_len(nrow(data)),
    function(i) data.table(data[seq_len(i)], N = i)
  )
  rbindlist(pieces)
}
# Build every prefix of the global `data` in a loop, add the N column by
# reference, and stack the pieces once at the end.
#
# Takes no arguments; reads the global `data`.
# Returns the stacked data.table. When `data` has no rows the loop never runs
# and rbindlist() is called on an empty list.
method3 <- function() {
  n_rows <- nrow(data)
  # Preallocate the list instead of growing it one element at a time.
  pieces <- vector("list", n_rows)
  # seq_len() instead of 1:nrow(data): the latter yields c(1, 0) when `data`
  # is empty.
  for (i in seq_len(n_rows)) {
    piece <- data[seq_len(i)]
    # `:=` adds the column to `piece` in place; `piece` is the subset created
    # on the line above, not the global `data`.
    piece[, N := i]
    pieces[[i]] <- piece
  }
  rbindlist(pieces)
}
基准:
library(microbenchmark)
# Time the three implementations against each other; the recorded output below
# reports neval = 100 evaluations per expression.
microbenchmark(method1(), method2(), method3())
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> method1() 131.94106 150.40669 166.28043 157.02466 165.40358 408.1812 100 c
#> method2() 82.57354 100.94865 109.82735 106.36961 109.65404 202.5385 100 b
#> method3() 73.79046 90.20843 96.89102 96.14839 99.36719 193.2423 100 a
非常感谢。您的第三种方法正是我想要的,只是我还在学习data.table包,自己还没能完全想明白。我也希望尽可能去掉“for”循环来加快迭代,不过去掉cbind和ifelse调用这类做法,也正是我感兴趣的替代方案。
library(data.table)
# Number of rows in the benchmark input.
N <- 200
# Benchmark input: N rows pairing a whole-number user ID drawn from 0-100 with
# a dependent variable drawn from 0-10 and rounded to one decimal.
data <- data.table(
  user_id = round(runif(N, 0, 100)),
  dv = round(runif(N, 0, 10), 1)
)
# Reference implementation: build the stacked table incrementally.
#
# For each n, takes the first n rows of the global `data`, tags them with a
# column N = n via cbind(), and appends the result to the accumulated table
# with rbindlist(). Growing the table on every iteration is kept on purpose so
# this remains the baseline the other methods are timed against.
#
# Takes no arguments; reads the global `data`.
# Returns a data.table holding, for every n in 1..nrow(data), the first n rows
# of `data` plus the column N. If `data` has no rows the loop never runs and
# an empty data.table() is returned.
method1 <- function() {
  # Defined up front so the result exists even when `data` has zero rows.
  data_combined <- data.table()
  # seq_len() instead of 1:nrow(data): the latter iterates over c(1, 0) when
  # `data` is empty.
  for (n in seq_len(nrow(data))) {
    chunk <- cbind(data[1:n, ], N = n)
    # Plain if/else because the condition is a scalar; ifelse() is meant for
    # vectors and should not carry assignments in its arguments.
    if (n > 1) {
      data_combined <- rbindlist(list(data_combined, chunk))
    } else {
      data_combined <- chunk
    }
  }
  data_combined
}
# Build every prefix of the global `data` with lapply() and stack them once.
#
# For each i, creates a data.table made of the first i rows of `data` plus a
# column N = i; all pieces are then combined by a single rbindlist() call.
#
# Takes no arguments; reads the global `data`.
# Returns the stacked data.table. When `data` has no rows, lapply() produces
# an empty list and rbindlist() is called on that empty list.
method2 <- function() {
  # seq_len() instead of 1:nrow(data): the latter yields c(1, 0) when `data`
  # is empty.
  pieces <- lapply(
    seq_len(nrow(data)),
    function(i) data.table(data[seq_len(i)], N = i)
  )
  rbindlist(pieces)
}
# Build every prefix of the global `data` in a loop, add the N column by
# reference, and stack the pieces once at the end.
#
# Takes no arguments; reads the global `data`.
# Returns the stacked data.table. When `data` has no rows the loop never runs
# and rbindlist() is called on an empty list.
method3 <- function() {
  n_rows <- nrow(data)
  # Preallocate the list instead of growing it one element at a time.
  pieces <- vector("list", n_rows)
  # seq_len() instead of 1:nrow(data): the latter yields c(1, 0) when `data`
  # is empty.
  for (i in seq_len(n_rows)) {
    piece <- data[seq_len(i)]
    # `:=` adds the column to `piece` in place; `piece` is the subset created
    # on the line above, not the global `data`.
    piece[, N := i]
    pieces[[i]] <- piece
  }
  rbindlist(pieces)
}
# Sanity check before benchmarking: compare the results cell by cell. Both
# recorded outputs are TRUE, i.e. the three methods produced the same values.
all(method1() == method2())
#> [1] TRUE
all(method2() == method3())
#> [1] TRUE
library(microbenchmark)
# Time the three implementations against each other; the recorded output below
# reports neval = 100 evaluations per expression.
microbenchmark(method1(), method2(), method3())
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> method1() 131.94106 150.40669 166.28043 157.02466 165.40358 408.1812 100 c
#> method2() 82.57354 100.94865 109.82735 106.36961 109.65404 202.5385 100 b
#> method3() 73.79046 90.20843 96.89102 96.14839 99.36719 193.2423 100 a