R-按组计算所有数据切割的差异_R_Loops

R-按组计算所有数据切割的差异

r loops

R-按组计算所有数据切割的差异,r,loops,R,Loops,我有一个具有多个属性和一个值的数据集输入（示例）我想：确定CAT和TYP 对于每个组合，计算删除组合时的平均值返回最终的差异表最终表格（样本）速度结果 > system.time(fcnDiffCalc()) user system elapsed 0.30 0.02 0.31 考虑使用sapply分配DIFF列，而不是在循环中增加数据帧，以避免重复的内存内复制： fcnDiffCalc2 <- function() { # table

我有一个具有多个属性和一个值的数据集

输入（示例）

我想：

确定

CAT

和

TYP

对于每个组合，计算删除组合时的平均值

返回最终的差异表

最终表格（样本）

速度结果

> system.time(fcnDiffCalc())
   user  system elapsed 
   0.30    0.02    0.31

考虑使用

sapply

分配DIFF列，而不是在循环中增加数据帧，以避免重复的内存内复制：

fcnDiffCalc2 <- function() {
  # table of all combinations of CAT and TYP
  splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), 
                       stringsAsFactors = F))

  # loop through each combination and calculate the difference between group X and Y
  splits$DIFF <- sapply(1:nrow(splits), function(i) {

    split.i <- splits[i,]

    # determine non-na columns
    by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]

    # anti-join to remove records that match `split.i`
    df.i <- tryCatch(df %>%
        anti_join(split.i, by = by.cols), error = function(e) df)

    # calculate average by group
    df.i <- df.i %>%
      group_by(GRP) %>%
      summarize(VAL_MEAN = mean(VAL))

    # calculate difference of averages
    DIFF <- df.i[,2] %>%
      as.matrix() %>%
      diff() %>%
      as.numeric()
  })

  return(splits)
}

输出

df_op <- fcnDiffCalc() 
df_new <- fcnDiffCalc2()
df_new2 <- fcnDiffCalc3()

identical(df_op, df_new)
# [1] TRUE
identical(df_op, df_new2)
# [1] TRUE

library(microbenchmark)

microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())

# Unit: milliseconds
#            expr      min       lq     mean   median       uq      max neval
#   fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960   100
#  fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297   100
#  fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758   100

df_op
> system.time(fcnDiffCalc())
   user  system elapsed 
   0.30    0.02    0.31 

fcnDiffCalc2 <- function() {
  # table of all combinations of CAT and TYP
  splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), 
                       stringsAsFactors = F))

  # loop through each combination and calculate the difference between group X and Y
  splits$DIFF <- sapply(1:nrow(splits), function(i) {

    split.i <- splits[i,]

    # determine non-na columns
    by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]

    # anti-join to remove records that match `split.i`
    df.i <- tryCatch(df %>%
        anti_join(split.i, by = by.cols), error = function(e) df)

    # calculate average by group
    df.i <- df.i %>%
      group_by(GRP) %>%
      summarize(VAL_MEAN = mean(VAL))

    # calculate difference of averages
    DIFF <- df.i[,2] %>%
      as.matrix() %>%
      diff() %>%
      as.numeric()
  })

  return(splits)
}

fcnDiffCalc3 <- function() {
  # table of all combinations of CAT and TYP
  splits <- data.frame(expand.grid(CAT = c(NA, unique(df$CAT)), TYP = c(NA, unique(df$TYP)),
                                   stringsAsFactors = FALSE))

  # loop through each combination and calculate the difference between group X and Y
  splits$DIFF <- vapply(1:nrow(splits), function(i) {

    split.i <- splits[i,]

    # determine non-na columns
    by.cols <- colnames(split.i)[vapply(split.i, function(x) !all(is.na(x)), logical(1))]

    # anti-join to remove records that match `split.i`
    df.i <- tryCatch(anti_join(df, split.i, by = by.cols), error = function(e) df)

    # calculate average by group
    df.i <- aggregate(VAL ~ GRP, df.i, mean)

    # calculate difference of averages
    diff(df.i$VAL)

  }, numeric(1))

  return(splits)
}

df_op <- fcnDiffCalc() 
df_new <- fcnDiffCalc2()
df_new2 <- fcnDiffCalc3()

identical(df_op, df_new)
# [1] TRUE
identical(df_op, df_new2)
# [1] TRUE

library(microbenchmark)

microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())

# Unit: milliseconds
#            expr      min       lq     mean   median       uq      max neval
#   fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960   100
#  fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297   100
#  fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758   100