R: speeding up data.table grouping using multiple cores and parallel programming

Tags: r, data.table, mclapply

I have a large code base, and its aggregation step is the current bottleneck in terms of speed.

In my code I would like to speed up the data-grouping step. A SNOTE (simple non-trivial example) of my data looks like this:

library(data.table)
a = sample(1:10000000, 50000000, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e)
system.time(c.dt <- dt[, list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1]), by = a])
   user  system elapsed 
 60.107   3.143  63.534

If you have multiple cores available to you, why not leverage the fact that you can quickly filter and group rows in a data.table using its key:

library(doMC)
registerDoMC(cores=4)  # register a fork-based parallel backend with 4 workers

setkey(dt, "a")        # key on "a" so dt[.(x)] is a fast binary-search subset

finalRowOrderMatters = FALSE # FALSE can be faster
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=finalRowOrderMatters) %dopar% 
     dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]

Note that if you have a relatively small number of unique groups (i.e. `length(unique(a))` is small), it is faster to drop the `.combine` argument, get the results back in a plain list, and then call `rbindlist` on the result. In my testing on two cores with 8GB of RAM, the threshold was at about 9,000 unique values. Here is what I used to benchmark:

# (option a)
round(rowMeans(replicate(3, system.time({
# ------- #
  foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar% 
     dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
# ------- #
}))), 3) 
# [1]  1.243 elapsed for N ==  1,000
# [1] 11.540 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 57.404 elapsed for N == 50,000



# (option b)
round(rowMeans(replicate(3, system.time({
# ------- #
    results <- 
      foreach(x=unique(dt[["a"]])) %dopar% 
         dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
    rbindlist(results)
# ------- #
}))), 3)
# [1]  1.117 elapsed for N ==  1,000
# [1] 10.567 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 76.613 elapsed for N == 50,000


## And used the following to create the dt
N <- 5e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")

Can you parallelize aggregation with data.table? Yes.

Is it worth it? NO. This is a key point the previous answer failed to highlight.

As noted in the comments, when you run an operation in parallel, copies ("chunks") of the data need to be made before being dispatched, and that slows things down. In some cases, when you cannot use data.table for the job itself (for example, running many linear regressions), it is worth splitting tasks across the cores, but not for aggregation, at least not when data.table is involved.

In short (and until proven otherwise), aggregate using data.table and stop worrying about potential speed gains from doMC. When it comes to aggregation, data.table is already blazing fast compared to anything else available, even though it is not multicore!


Below are some benchmarks you can run for yourself, comparing data.table's internal aggregation using `by` against `foreach` and `mclapply`. The results are listed first:

#-----------------------------------------------

# TL;DR FINAL RESULTS (Best to Worst)
# 3 replications, N = 10000:
# (1)  0.007 -- data.table using `by`
# (2)  3.548 -- mclapply with rbindlist
# (3)  5.557 -- foreach with rbindlist
# (4)  5.959 -- foreach with .combine = "rbind"
# (5) 14.029 -- lapply

# ----------------------------------------------

library(data.table)

## And used the following to create the dt
N <- 1e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")
setkey(dt, "a")

# TEST AGGREGATION WITHOUT PARALLELIZATION ---------------------------
## using data.tables `by` to aggregate
round(rowMeans(replicate(3, system.time({
    dt[, list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1]), by = a]
}))), 3)
# [1] 0.007 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617

## using `lapply`
round(rowMeans(replicate(3, system.time({
    results <- lapply(unique(dt[["a"]]), function(x) {
        dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1])]
    })
    rbindlist(results)
}))), 3)
# [1] 14.029 elapsed for N == 10,000

# USING `mclapply` FORKING ---------------------------------
library(parallel)  # provides mclapply (fork-based; runs serially on Windows)

## use mclapply
round(rowMeans(replicate(3, system.time({
    results <- mclapply(unique(dt[["a"]]),
    function(x) {
        dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
    }, mc.cores=4)
    rbindlist(results)
}))), 3)
# [1] 3.548 elapsed for N == 10,000


# PARALLELIZATION USING `doMC` PACKAGE ---------------------------------
library(doMC)
mc = 4
registerDoMC(cores=mc)
getDoParWorkers()
# [1] 4

## (option a) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
    foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
    dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}))), 3)
# [1] 5.959 elapsed for N == 10,000

## (option b) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
    results <-
      foreach(x=unique(dt[["a"]])) %dopar%
        dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
    rbindlist(results)
}))), 3)
# [1] 5.557 elapsed for N == 10,000

registerDoSEQ()
getDoParWorkers()
# [1] 1

Please be more specific about what the words "join" and "aggregate" are supposed to mean. Three functions come to mind: `list`, `c`, and `paste`. What is this code supposed to do? Are we extracting columns from a data.frame, or working with a data.table? What is the structure of "block.read.parent.cifig" and the other input variables? ... Explain the problem better! (Apparently someone agrees; that was not my downvote.)

@Dwin, thanks! I am not sure I explained it clearly enough in the question, but the basic question is how to speed up an aggregation operation on a large data.table, like the one in the example above. Also keep in mind that I have several cores available, so there may be smart parallelization ideas that could speed this kind of operation up considerably. Hope that helps; I have added an example.

I did not downvote, but here is why I would: you have provided no information about the data. If `read.index` is a row index, then of course grouping each row individually into one row will be slow; you would be calling `paste` millions of times. Have you used `Rprof`? Have you used `verbose=TRUE`? And you use words like "too slow" without giving any numbers. In fact, I have now convinced myself to downvote. That can be reversed if you improve the question.

@Dnaiel This is now a great question. +1. I will try to take a look. My guess is that some answerers only watch the feed of new questions, so offering a bounty might be an idea to get more attention.

@MattDowle thanks a lot, I am glad I improved such a confusing question :-) Not sure how good it is, but it is the problem I am dealing with, and I am learning how to ask better questions, so it is all good for me.

Does each child process need a copy of the complete data.table, or do they all access the "main" data.table object?