Warning: file_get_contents(/data/phpspider/zhask/data//catemap/5/date/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
使用for循环和过滤器优化代码_R_For Loop_Optimization_Profiling - Fatal编程技术网

使用for循环和过滤器优化代码

使用for循环和过滤器优化代码,r,for-loop,optimization,profiling,R,For Loop,Optimization,Profiling,我得到了一个为这个问题简化的巨大数据集,我尝试在一个特定列的函数中对每一行应用一个函数 我尝试了for-loop方法,然后使用Rprof和profvis进行了一些评测。我知道我可以尝试一些apply或其他方法,但分析似乎表明,最慢的部分是由于其他步骤 这就是我想做的: library(dplyr) # Example data frame id <- rep(c(1:100), each = 5) ab <- runif(length(id), 0, 1) char1 <-

我得到了一个为这个问题简化的巨大数据集,我尝试在一个特定列的函数中对每一行应用一个函数

我尝试了for-loop方法,然后使用
Rprof
profvis
进行了一些评测。我知道我可以尝试一些apply或其他方法,但分析似乎表明,最慢的部分是由于其他步骤

这就是我想做的:

library(dplyr)

# Example data frame
id <- rep(c(1:100), each = 5)
ab <- runif(length(id), 0, 1)
char1 <- runif(length(id), 0, 1)
char2 <- runif(length(id), 0, 1)
dat <- data.frame(cbind(id, ab, char1, char2))

dat$result <- NA

# Loop
com <- unique(id)
for (k in com){
  dat_k <- filter(dat, id==k) # slowest line
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[which(dat$id==k), "result"] <- dat_k$result # 2nd slowest line                                                    
} 
库(dplyr)
#示例数据帧

id下面的for循环稍微快一点。不需要dplyr或which语句

for (k in com){
  dat_k <- dat[id == k, ] # no need for filter
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id==k, "result"] <- dat_k$result # 2nd no need for which 
} 
for(com中的k){

dat_k下面的for循环速度稍微快一点。不需要dplyr或which语句

for (k in com){
  dat_k <- dat[id == k, ] # no need for filter
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id==k, "result"] <- dat_k$result # 2nd no need for which 
} 
for(com中的k){

dat_k我通过
lappy
获得了一个较小的加速:

library(microbenchmark)
microbenchmark(
  OP=
for (k in com){
  dat_k <- filter(dat, id==k) # slowest line
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[which(dat$id==k), "result"] <- dat_k$result # 2nd slowest line                                                    
}, 
  phiver=
for (k in com){
  dat_k <- dat[id == k, ] # no need for filter
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id==k, "result"] <- dat_k$result # 2nd no need for which 
},

  alex= {
dat2 <- split(dat, factor(dat$id))
dat2 <- lapply(dat2, function(l) {
  dat_k_dist <- cluster::daisy(l[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  denom <- sum(l[, "ab"]) - l[, "ab"]
  l[, "result"] <- as.numeric(num / denom)
  return(l)
})
  dat$result <- Reduce("c",lapply(dat2, function(l) l$result))
})

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval cld
    OP 126.72184 129.94344 133.47666 132.11949 134.14558 196.44860   100   c
    phiver  73.78996  77.13434  79.61202  78.21638  79.81958 139.15854   100  b 
    alex  67.86450  71.61277  73.26273  72.34813  73.50353  90.31229   100 a  
库(微基准)
微基准(
OP=
对于(com中的k){

dat_k我通过
lappy
获得了一个较小的加速:

library(microbenchmark)
microbenchmark(
  OP=
for (k in com){
  dat_k <- filter(dat, id==k) # slowest line
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[which(dat$id==k), "result"] <- dat_k$result # 2nd slowest line                                                    
}, 
  phiver=
for (k in com){
  dat_k <- dat[id == k, ] # no need for filter
  dat_k_dist <- cluster::daisy(dat_k[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * dat_k[, "ab"]))
  denom <- sum(dat_k[, "ab"]) - dat_k[, "ab"]
  dat_k[, "result"] <- as.numeric(num / denom)
  dat[id==k, "result"] <- dat_k$result # 2nd no need for which 
},

  alex= {
dat2 <- split(dat, factor(dat$id))
dat2 <- lapply(dat2, function(l) {
  dat_k_dist <- cluster::daisy(l[, c("char1", "char2")], metric = "gower") %>% as.matrix()
  num <- apply(dat_k_dist, 2, function(x) sum(x * l[, "ab"]))
  denom <- sum(l[, "ab"]) - l[, "ab"]
  l[, "result"] <- as.numeric(num / denom)
  return(l)
})
  dat$result <- Reduce("c",lapply(dat2, function(l) l$result))
})

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval cld
    OP 126.72184 129.94344 133.47666 132.11949 134.14558 196.44860   100   c
    phiver  73.78996  77.13434  79.61202  78.21638  79.81958 139.15854   100  b 
    alex  67.86450  71.61277  73.26273  72.34813  73.50353  90.31229   100 a  
库(微基准)
微基准(
OP=
对于(com中的k){

dat_k dplyr不是“最快”的库,data.table是一个快得多的库(也比基本切片/切割快),你可以很好地使用这个dplyr不是“最快”的库,data.table是一个快得多的库(也比基本切片/切割快),你可以很好地使用这个库