R:加速嵌套循环(向量化?),并按长度不同的序列进行条件筛选
对于一个包含3列(X位置、Y位置和某个值VAL)的数据集,我希望对落在各个X×Y区间(箱)内的所有VAL执行某种运算(例如求平均值),也就是说,我希望把空间网格化。我最初编写了一个简单的函数(calcPerBin1)来实现这一点(myT是传入的数据集,xbounds和ybounds是连续区间(箱)边界的向量)。
那个函数非常难看,因此我尝试了以下变体:
#' Average VAL over a rectangular grid of X-by-Y bins.
#'
#' Each row of `myT` is assigned to the cell `(lower, upper]` delimited by
#' consecutive entries of `xbounds` and `ybounds` (the convention used by
#' `cut()`), and `VAL` is averaged within every cell. Rows falling outside
#' the grid are ignored.
#'
#' @param myT Data frame with numeric columns `X`, `Y` and `VAL`.
#' @param xbounds,ybounds Numeric vectors (at least two entries each) holding
#'   the bin boundaries along X and Y.
#' @param sizeofbin Bin width; half of it is added to every lower boundary
#'   to obtain the reported bin centre.
#' @return A data frame with one row per cell (X varying fastest) and the
#'   columns `X`, `Y` (bin centres) and `MEAN`. Cells that contain no point
#'   get `NaN`.
calcPerBin2.2 <- function(myT, xbounds, ybounds, sizeofbin) {
  stopifnot(length(xbounds) >= 2L, length(ybounds) >= 2L)

  # cut() sorts its breaks internally; sort here as well so that bin i
  # always corresponds to lower boundary i.
  xbounds <- sort(as.numeric(xbounds))
  ybounds <- sort(as.numeric(ybounds))
  nx <- length(xbounds) - 1L
  ny <- length(ybounds) - 1L

  # Integer bin index of every point along each axis (NA when outside).
  xbin <- as.integer(cut(myT$X, breaks = xbounds))
  ybin <- as.integer(cut(myT$Y, breaks = ybounds))

  # Linear cell id with X varying fastest. Keeping all levels guarantees one
  # entry per cell, including empty ones.
  cell <- factor(xbin + (ybin - 1L) * nx, levels = seq_len(nx * ny))

  # split() drops points whose cell is NA, so a stray out-of-grid point can
  # no longer turn the mean of a populated cell into NA. Empty cells give
  # mean(numeric(0)), i.e. NaN.
  means <- vapply(split(myT$VAL, cell), mean, numeric(1), USE.NAMES = FALSE)

  # Lower boundaries are taken from the breaks themselves rather than parsed
  # out of the cut() labels, which are rounded to a few significant digits.
  data.frame(
    X = rep(xbounds[-(nx + 1L)], times = ny) + sizeofbin / 2,
    Y = rep(ybounds[-(ny + 1L)], each = nx) + sizeofbin / 2,
    MEAN = means
  )
}
您可以用三行代码完成大部分操作(使用zoo包中的rollmean函数):
将其与原始函数的结果进行比较:
# Reference result from the original implementation; its first six rows are
# listed below for comparison.
test1 <- calcPerBin1(data, xbins, ybins)
head(test1, n = 6L)
#    X   Y       MEAN
# 1 25  25   900.8305
# 2 25  75  5957.4972
# 3 25 125 15680.8103
# 4 25 175 30877.6696
# 5 25 225 50688.4860
# 6 25 275 75961.8558
非常感谢,@初学者!这确实是一个很好的方法,尤其是使用dplyr包的版本,我很乐意深入研究它!在我看来,在R中几乎任何功能都能找到高效的函数(当然,前提是巧妙地组合使用它们)……一旦了解这些函数,就可以节省大量代码和时间!
你说得对,@ztl。R的学习曲线很陡峭,但一旦你习惯了它,R就会提供很多专业而高效的工具。
#' Average VAL over a rectangular grid of X-by-Y bins.
#'
#' Each row of `myT` is assigned to the cell `(lower, upper]` delimited by
#' consecutive entries of `xbounds` and `ybounds` (the convention used by
#' `cut()`), and `VAL` is averaged within every cell. Rows falling outside
#' the grid are ignored.
#'
#' @param myT Data frame with numeric columns `X`, `Y` and `VAL`.
#' @param xbounds,ybounds Numeric vectors (at least two entries each) holding
#'   the bin boundaries along X and Y.
#' @param sizeofbin Bin width; half of it is added to every lower boundary
#'   to obtain the reported bin centre.
#' @return A data frame with one row per cell (X varying fastest) and the
#'   columns `X`, `Y` (bin centres) and `MEAN`. Cells that contain no point
#'   get `NaN`.
calcPerBin2.2 <- function(myT, xbounds, ybounds, sizeofbin) {
  stopifnot(length(xbounds) >= 2L, length(ybounds) >= 2L)

  # cut() sorts its breaks internally; sort here as well so that bin i
  # always corresponds to lower boundary i.
  xbounds <- sort(as.numeric(xbounds))
  ybounds <- sort(as.numeric(ybounds))
  nx <- length(xbounds) - 1L
  ny <- length(ybounds) - 1L

  # Integer bin index of every point along each axis (NA when outside).
  xbin <- as.integer(cut(myT$X, breaks = xbounds))
  ybin <- as.integer(cut(myT$Y, breaks = ybounds))

  # Linear cell id with X varying fastest. Keeping all levels guarantees one
  # entry per cell, including empty ones.
  cell <- factor(xbin + (ybin - 1L) * nx, levels = seq_len(nx * ny))

  # split() drops points whose cell is NA, so a stray out-of-grid point can
  # no longer turn the mean of a populated cell into NA. Empty cells give
  # mean(numeric(0)), i.e. NaN.
  means <- vapply(split(myT$VAL, cell), mean, numeric(1), USE.NAMES = FALSE)

  # Lower boundaries are taken from the breaks themselves rather than parsed
  # out of the cut() labels, which are rounded to a few significant digits.
  data.frame(
    X = rep(xbounds[-(nx + 1L)], times = ny) + sizeofbin / 2,
    Y = rep(ybounds[-(ny + 1L)], each = nx) + sizeofbin / 2,
    MEAN = means
  )
}
# Control parameters ----
xmax <- 500
ymax <- 1000
N <- 100000
binsize <- 50
# Bin boundaries along each axis; the X and Y grids do NOT have the same
# number of bins.
xbins <- seq(from = 0, to = xmax, by = binsize)
ybins <- seq(from = 0, to = ymax, by = binsize)

# Generate dummy data ----
xcoords <- runif(N, min = 1, max = xmax)
ycoords <- runif(N, min = 1, max = ymax)
vals <- xcoords + ycoords^2
data <- data.frame(X = xcoords, Y = ycoords, VAL = vals)

# Run: time each implementation on the same data ----
system.time(test1 <- calcPerBin1(data, xbins, ybins))
system.time(test2 <- calcPerBin2(data, xbins, ybins))
system.time(test2.2 <- calcPerBin2.2(data, xbins, ybins, binsize))
library(zoo) # load the package
# Bin on a copy: overwriting data$X / data$Y with factors would break every
# later call that passes `data` to cut(), which requires numeric input.
binned <- data
# Label each bin with its midpoint (rolling mean of adjacent boundaries).
binned$X <- cut(binned$X, xbins, labels = rollmean(xbins, 2))
binned$Y <- cut(binned$Y, ybins, labels = rollmean(ybins, 2))
res <- aggregate(VAL ~ X + Y, binned, mean)
# order it the same way as in test1, then show the first lines
head(res[order(res$X, res$Y), ])
#     X   Y        VAL
# 1  25  25   900.8305
# 11 25  75  5957.4972
# 21 25 125 15680.8103
# 31 25 175 30877.6696
# 41 25 225 50688.4860
# 51 25 275 75961.8558
# Result of the original implementation on the same data; its first six rows
# are listed below.
test1 <- calcPerBin1(data, xbins, ybins)
head(test1, n = 6L)
#    X   Y       MEAN
# 1 25  25   900.8305
# 2 25  75  5957.4972
# 3 25 125 15680.8103
# 4 25 175 30877.6696
# 5 25 225 50688.4860
# 6 25 275 75961.8558
#' Mean of VAL per X-by-Y bin using cut() and aggregate().
#'
#' @param data Data frame with columns `X`, `Y` and `VAL`.
#' @param xbins,ybins Bin boundaries along X and Y.
#' @return The data frame produced by `aggregate()`: factor columns `X` and
#'   `Y` labelled with the bin midpoints, and `VAL` holding the per-bin mean.
fastbin <- function(data, xbins, ybins) {
  # Midpoints of adjacent boundaries serve as bin labels; rollmean() is
  # expected to be available from the attached zoo package.
  x_mid <- rollmean(xbins, 2)
  y_mid <- rollmean(ybins, 2)

  data$X <- cut(data$X, breaks = xbins, labels = x_mid)
  data$Y <- cut(data$Y, breaks = ybins, labels = y_mid)

  aggregate(VAL ~ X + Y, data = data, FUN = mean)
}
library(dplyr) # for faster aggregation
#' Mean of VAL per X-by-Y bin using a dplyr pipeline.
#'
#' @param data Data frame with columns `X`, `Y` and `VAL`.
#' @param xbins,ybins Bin boundaries along X and Y.
#' @return An ungrouped tibble with factor columns `X` and `Y` (labelled with
#'   the bin midpoints) and `Val`, the mean of `VAL` in each occupied bin.
fastbin.dplyr <- function(data, xbins, ybins) {
  data %>%
    mutate(
      X = cut(X, xbins, labels = rollmean(xbins, 2)),
      Y = cut(Y, ybins, labels = rollmean(ybins, 2))
    ) %>%
    group_by(X, Y) %>%
    summarise(Val = mean(VAL)) %>%
    # summarise() only peels off the last grouping level, which would leave
    # the result grouped by X; drop it so later verbs act on the whole table.
    ungroup()
}
# Timings of the three implementations on the dummy data. The console output
# of one recorded run is kept as comments so that the script still parses.
system.time(test1 <- calcPerBin1(data, xbins, ybins))
#    user  system elapsed
#    3.47    0.12    3.59
system.time(res.fastbin <- fastbin(data, xbins, ybins))
#    user  system elapsed
#    1.01    0.02    1.05
system.time(res.fastbin.dplyr <- fastbin.dplyr(data, xbins, ybins))
#    user  system elapsed
#    0.06    0.00    0.06