加速在R中计算大型数据集上的mann-kendall检验的并行过程
让我们假设有一个世界上大量点的每月时间步长的大型气候数据集。然后数据集被塑造成一个加速在R中计算大型数据集上的mann-kendall检验的并行过程,r,parallel-processing,R,Parallel Processing,让我们假设有一个世界上大量点的每月时间步长的大型气候数据集。然后数据集被塑造成一个数据。框架的类型: lon,lat,数据月1年1,…,数据月12年100 例如: set.seed(123) data<- data.frame(cbind(runif(10000,-180,180), runif(10000,-90,90)) , replicate(1200, runif(10000,0,150))) 这运行得很好,给了我确切的目标:一个矩阵,报告每个坐标和月份组合的M-K统计数据。尽管
数据。框架
的类型:
lon,lat,数据月1年1,…,数据月12年100
例如:
set.seed(123)
data<- data.frame(cbind(runif(10000,-180,180), runif(10000,-90,90))
, replicate(1200, runif(10000,0,150)))
这运行得很好,给了我确切的目标:一个矩阵,报告每个坐标和月份组合的M-K统计数据。尽管过程是并行的,但是计算仍然需要相当长的时间
有没有办法加快这一进程?是否有使用
apply
系列函数的空间?最后,通过使用lappy
函数(受此启发)替换第二个循环,问题很容易得到解决。执行时间现在控制在几秒钟之内。矢量化仍然是R中执行时间的最佳解决方案(请参阅和)
我在下面分享最终代码以供参考:
set.seed(123)
data<- data.frame(cbind(runif(10000,-180,180), runif(10000,-90,90)), replicate(1200, runif(10000,0,150)))
coords<-data[,1:2]
names(coords)<-c("lon","lat")
data_t<- as.data.frame(t(data[,3:1202]))
data_t$month<-rep(seq(1,12,1),100)
library(foreach)
library(doParallel)
cores=detectCores()
cl <- makeCluster(cores[1]-1) #take all the cores minus 1
registerDoParallel(cl)
mk_out<- foreach(m=1:12, .combine = rbind) %dopar% {
data_m<-data_t[which(data_t$month==m),]
library(trend)
mk_out_temp <- do.call(rbind,lapply(data_m[1:100],function(x)unlist(mk.test(x))))
mk_out_temp <-cbind(coords,mk_out_temp,rep(m,dim(coords)[1]))
mk_out_temp
}
stopCluster(cl)
head(mk_out)
head(mk_out)
lon lat data.name p.value statistic.z null.value.S parameter.n estimates.S estimates.varS
1 -76.47209 -34.09350 x 0.577590398263635 -0.556907839290681 0 100 -188 112750
2 103.78985 -31.58639 x 0.644362383361713 0.461608102085858 0 100 156 112750
3 -32.76831 66.64575 x 0.117932376736468 1.56351131351662 0 100 526 112750
4 137.88627 -30.83872 x 0.79096910003836 0.265052394100912 0 100 90 112750
5 158.56822 -67.37378 x 0.0959591933285242 -1.66476728429674 0 100 -560 112750
6 -163.59966 -25.88014 x 0.823256299016955 0.223358759073802 0 100 76 112750
estimates.tau alternative method pvalg rep(m, dim(coords)[1])
1 -0.037979797979798 two.sided Mann-Kendall trend test 0.577590398263635 1
2 0.0315151515151515 two.sided Mann-Kendall trend test 0.644362383361713 1
3 0.106262626262626 two.sided Mann-Kendall trend test 0.117932376736468 1
4 0.0181818181818182 two.sided Mann-Kendall trend test 0.79096910003836 1
5 -0.113131313131313 two.sided Mann-Kendall trend test 0.0959591933285242 1
6 0.0153535353535354 two.sided Mann-Kendall trend test 0.823256299016955 1
set.seed(123)
数据您注意到您已经解决了问题。可通过以下步骤之一获得:
1:使用.packages
和.export
将必要的对象复制到foreach循环。这可以确保每个实例在尝试访问同一内存时不会发生冲突
2:利用高性能库,如tidyverse of data.table来执行子集设置和计算
后者有点复杂,但在我的微型笔记本电脑上大大提高了性能。(对整个数据集执行所有计算大约需要1.5分钟。)
下面是我添加的代码。注意,我用并行包中的单个parlappy函数替换了foreach
set.seed(123)
data<- data.frame(cbind(runif(10000,-180,180), runif(10000,-90,90))
, replicate(1200, runif(10000,0,150)))
coords<-data[,1:2] #get the coordinates out of the initial dataset
names(coords)<-c("lon","lat")
data_t<- as.data.frame(t(data[,3:1202])) #each column is now the time series associated to a point
data_t$month<-rep(seq(1,12,1),100) # month index as last column of the data frame
# start the parallel processing
library(data.table)
library(parallel)
library(trend)
setDT(data_t)
setDT(coords)
cores=detectCores() #count cores
cl <- makeCluster(cores[1]-1) #take all the cores minus 1 not to overload the pc
#user system elapsed
#17.80 35.12 98.72
system.time({
test <- data_t[,parLapply(cl,
.SD, function(x){
(
unlist(
trend::mk.test(x)[c("p.value","statistic","estimates")]
)
)
}
), by = month] #Perform the calculations across each month
#create a column that indicates what each row is measuring
rows <- rep(c("p.value","statistic.z","estimates.S","estimates.var","estimates.tau"),12)
final_tests <- dcast( #Cast the melted structure to a nice form
melt(cbind(test,rowname = rows), #Melt the data for a better structure
id.vars = c("rowname","month"), #Grouping variables
measure.vars = paste0("V",seq.int(1,10000))), #variable names
month + variable ~ rowname, #LHS groups the data along rows, RHS decides the value columns
value.var = "value", #Which column contain values?
drop = TRUE) #should we drop unused columns? (doesnt matter here)
#rename the columns as desired
names(final_tests) <- c("month","variable","S","tau","var","p.value","z_stat")
#finally add the coordinates
final_tests <- cbind(final_form,coords)
})
set.seed(123)
甚至比使用简单的lappy
解决方案更好。非常感谢@OliverNo problem.:-)对于data.table
的引用,我建议即使您只是将它们当作data.frame来使用,也可以在常规实现中获得相当大的速度提升。:-)R
再次要求调用foreach
循环中的库的原因与此问题有关
set.seed(123)
data<- data.frame(cbind(runif(10000,-180,180), runif(10000,-90,90))
, replicate(1200, runif(10000,0,150)))
coords<-data[,1:2] #get the coordinates out of the initial dataset
names(coords)<-c("lon","lat")
data_t<- as.data.frame(t(data[,3:1202])) #each column is now the time series associated to a point
data_t$month<-rep(seq(1,12,1),100) # month index as last column of the data frame
# start the parallel processing
library(data.table)
library(parallel)
library(trend)
setDT(data_t)
setDT(coords)
cores=detectCores() #count cores
cl <- makeCluster(cores[1]-1) #take all the cores minus 1 not to overload the pc
#user system elapsed
#17.80 35.12 98.72
system.time({
test <- data_t[,parLapply(cl,
.SD, function(x){
(
unlist(
trend::mk.test(x)[c("p.value","statistic","estimates")]
)
)
}
), by = month] #Perform the calculations across each month
#create a column that indicates what each row is measuring
rows <- rep(c("p.value","statistic.z","estimates.S","estimates.var","estimates.tau"),12)
final_tests <- dcast( #Cast the melted structure to a nice form
melt(cbind(test,rowname = rows), #Melt the data for a better structure
id.vars = c("rowname","month"), #Grouping variables
measure.vars = paste0("V",seq.int(1,10000))), #variable names
month + variable ~ rowname, #LHS groups the data along rows, RHS decides the value columns
value.var = "value", #Which column contain values?
drop = TRUE) #should we drop unused columns? (doesnt matter here)
#rename the columns as desired
names(final_tests) <- c("month","variable","S","tau","var","p.value","z_stat")
#finally add the coordinates
final_tests <- cbind(final_form,coords)
})