R 使用多核和并行编程加速数据表组
我有一个很大的代码,聚合步骤是当前速度方面的瓶颈 在我的代码中,我希望加快数据分组的速度。我的数据的SNOTE(简单的非平凡示例)如下所示:R 使用多核和并行编程加速数据表组,r,data.table,mclapply,R,Data.table,Mclapply,我有一个很大的代码,聚合步骤是当前速度方面的瓶颈 在我的代码中,我希望加快数据分组的速度。我的数据的SNOTE(简单的非平凡示例)如下所示: library(data.table) a = sample(1:10000000, 50000000, replace = TRUE) b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE) d = sample(c("3m","2m2
library(data.table)
a = sample(1:10000000, 50000000, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e)
system.time(c.dt <- dt[,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1], by=a)])
user system elapsed
60.107 3.143 63.534
库(data.table)
a=样本(1:10000000,50000000,替换=真)
b=样本(c(“3m”、“2m2d2m”、“3m2d1i3s2d”、“5m”、“4m”、“9m”、“1m”),50000000,替换为真)
d=样本(c(“3m”、“2m2d2m”、“3m2d1i3s2d”、“5m”、“4m”、“9m”、“1m”),50000000,替换为真)
e=a
dt=数据表(a=a,b=b,d=d,e=e)
system.time(c.dt如果您有多个可用的内核,为什么不利用这样一个事实:您可以使用data.table的键快速筛选和分组data.table中的行:
library(doMC)
registerDoMC(cores=4)
setkey(dt, "a")
finalRowOrderMatters = FALSE # FALSE can be faster
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=finalRowOrderMatters) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
请注意,如果唯一组的数量(即长度(唯一(a))
)相对较小,删除参数会更快。合并
参数,将结果返回到列表中,然后对结果调用rbindlist
。在我对两个内核和8GB RAM进行的测试中,阈值大约为9000个唯一值。以下是我用来进行基准测试的内容:
# (otion a)
round(rowMeans(replicate(3, system.time({
# ------- #
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
# ------- #
}))), 3)
# [1] 1.243 elapsed for N == 1,000
# [1] 11.540 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 57.404 elapsed for N == 50,000
# (otion b)
round(rowMeans(replicate(3, system.time({
# ------- #
results <-
foreach(x=unique(dt[["a"]])) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
rbindlist(results)
# ------- #
}))), 3)
# [1] 1.117 elapsed for N == 1,000
# [1] 10.567 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 76.613 elapsed for N == 50,000
## And used the following to create the dt
N <- 5e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")
#(otion a)
圆形(行)表示(复制(3,系统时间({
# ------- #
foreach(x=unique(dt[[“a”]]),.combine=“rbind”,.inorder=FALSE)%dopar%
dt[(x),list(b=粘贴(b,collapse=“”),d=粘贴(d,collapse=“”),e=e[[1]])]
# ------- #
}))), 3)
#[1]1.243在N==1000时经过
#[1]11.540在N==10000时经过,长度(唯一(dt[[“a”]])==8617
#[1]N==50000经过57.404
#(奥的松b)
圆形(行)表示(复制(3,系统时间({
# ------- #
结果你能用数据并行聚合吗。表?能
值得吗?不。这是上一个答案没有强调的关键点
如中所述,在并行运行操作时,需要在分发之前制作副本(“块”)。这会降低速度。在某些情况下,当您无法使用数据时。表
(例如,运行许多线性回归),它值得在核心之间拆分任务,但不值得聚合-至少在涉及数据表时是这样
简而言之(除非另有证明),使用data.table
进行聚合,不要再担心使用doMC
数据可能会提高速度。table
在聚合方面比任何其他可用的方法都要快,即使它不是多核
下面是一些您可以自己运行的基准测试,用于比较数据。使用by
与foreach
和mclappy
进行内部聚合。
首先列出结果
#-----------------------------------------------
# TL;DR FINAL RESULTS (Best to Worst)
# 3 replications, N = 10000:
# (1) 0.007 -- data.table using `by`
# (2) 3.548 -- mclapply with rbindlist
# (3) 5.557 -- foreach with rbindlist
# (4) 5.959 -- foreach with .combine = "rbind"
# (5) 14.029 -- lapply
# ----------------------------------------------
library(data.table)
## And used the following to create the dt
N <- 1e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")
setkey(dt, "a")
# TEST AGGREGATION WITHOUT PARALLELIZATION ---------------------------
## using data.tables `by` to aggregate
round(rowMeans(replicate(3, system.time({
dt[,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1], by=a)]
}))), 3)
# [1] 0.007 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
## using `lapply`
round(rowMeans(replicate(3, system.time({
results <- lapply(unique(dt[["a"]]), function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1])]
})
rbindlist(results)
}))), 3)
# [1] 14.029 elapsed for N == 10,000
# USING `mclapply` FORKING ---------------------------------
## use mclapply
round(rowMeans(replicate(3, system.time({
results <- mclapply(unique(dt[["a"]]),
function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}, mc.cores=4)
rbindlist(results)
}))), 3)
# [1] 3.548 elapsed for N == 10,000
# PARALLELIZATION USING `doMC` PACKAGE ---------------------------------
library(doMC)
mc = 4
registerDoMC(cores=mc)
getDoParWorkers()
# [1] 4
## (option a) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}))), 3)
# [1] 5.959 elapsed for N == 10,000
## (option b) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
results <-
foreach(x=unique(dt[["a"]])) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
rbindlist(results)
}))), 3)
# [1] 5.557 elapsed for N == 10,000
registerDoSEQ()
getDoParWorkers()
# [1] 1
#-----------------------------------------------
结果太长了,读不下去了
#3次复制,N=10000:
#(1)0.007——使用'by'的data.table`
#(2)3.548——带rbindlist的mclapply
#(3)5.557——带有rbindlist的foreach
#(4)5.959--foreach with.combine=“rbind”
#(5)14.029——搭接
# ----------------------------------------------
库(数据表)
##并使用以下内容创建dt
N请更具体地说明“连接”和“聚合”这两个词的含义意思是说。这些让人想到的功能有3个:list
、c
、和paste
。这些代码的功能是什么。我们是从数据帧中提取列还是处理数据表?“block.read.parent.cifig”和其他输入变量的结构是什么?…更好地解释这个问题!(显然有人同意。这不是我的反对票。)@Dwin,谢谢!我不确定我在Q中是否解释得足够清楚,但基本的Q是如何加快大型数据表的聚合操作,如上面示例中的一个。还要记住,我可以使用多个核,因此可能有一些智能并行化的想法可以大大加快此类操作。希望这对我有所帮助,我添加了一个示例我没有否决投票。但我这样做的原因是你没有提供任何有关数据的信息。如果read.index
是一个行索引,那么当然将每一行单独分组成一行会很慢。你将调用paste
数百万次。你使用了Rprof
?你使用了verbose吗=是的
?而且你用的是“太慢”之类的词没有给出数字。事实上,我已经说服自己现在对它投了否决票。如果你改进这个问题,它可以被逆转。@Dnaiel这现在是一个很好的问题。+1.我会试着看一看。我猜一些回答者只是有新的问题源,所以为了得到更多的关注,可能是一个提供奖励的主意。@MattDowle非常感谢,我很高兴我改进了ved如此令人困惑的问题:-)不确定它有多好,但这是我正在处理的问题。我正在学习更多关于如何提出更好的问题,以便对我有好处。每个子进程是否需要复制完整的data.table,还是它们都访问“main”data.table对象?