Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/69.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/loops/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R-按组计算所有数据切割的差异_R_Loops - Fatal编程技术网

R-按组计算所有数据切割的差异

R-按组计算所有数据切割的差异,r,loops,R,Loops,我有一个具有多个属性和一个值的数据集 输入(示例) 我想: 确定CAT和TYP 对于每个组合,计算删除组合时的平均值 返回最终的差异表 最终表格(样本) 速度结果 > system.time(fcnDiffCalc()) user system elapsed 0.30 0.02 0.31 考虑使用sapply分配DIFF列,而不是在循环中增加数据帧,以避免重复的内存内复制: fcnDiffCalc2 <- function() { # table

我有一个具有多个属性和一个值的数据集

输入(示例)

我想:

  • 确定
    CAT
    TYP
  • 对于每个组合,计算删除组合时的平均值
  • 返回最终的差异表
  • 最终表格(样本)

    速度结果

    > system.time(fcnDiffCalc())
       user  system elapsed 
       0.30    0.02    0.31 
    
    考虑使用
    sapply
    分配DIFF列,而不是在循环中增加数据帧,以避免重复的内存内复制:

    fcnDiffCalc2 <- function() {
      # table of all combinations of CAT and TYP
      splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), 
                           stringsAsFactors = F))
    
      # loop through each combination and calculate the difference between group X and Y
      splits$DIFF <- sapply(1:nrow(splits), function(i) {
    
        split.i <- splits[i,]
    
        # determine non-na columns
        by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
    
        # anti-join to remove records that match `split.i`
        df.i <- tryCatch(df %>%
            anti_join(split.i, by = by.cols), error = function(e) df)
    
        # calculate average by group
        df.i <- df.i %>%
          group_by(GRP) %>%
          summarize(VAL_MEAN = mean(VAL))
    
        # calculate difference of averages
        DIFF <- df.i[,2] %>%
          as.matrix() %>%
          diff() %>%
          as.numeric()
      })
    
      return(splits)
    }
    
    输出

    df_op <- fcnDiffCalc() 
    df_new <- fcnDiffCalc2()
    df_new2 <- fcnDiffCalc3()
    
    identical(df_op, df_new)
    # [1] TRUE
    identical(df_op, df_new2)
    # [1] TRUE
    
    library(microbenchmark)
    
    microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())
    
    # Unit: milliseconds
    #            expr      min       lq     mean   median       uq      max neval
    #   fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960   100
    #  fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297   100
    #  fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758   100
    
    df_op
    
    > system.time(fcnDiffCalc())
       user  system elapsed 
       0.30    0.02    0.31 
    
    fcnDiffCalc2 <- function() {
      # table of all combinations of CAT and TYP
      splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), 
                           stringsAsFactors = F))
    
      # loop through each combination and calculate the difference between group X and Y
      splits$DIFF <- sapply(1:nrow(splits), function(i) {
    
        split.i <- splits[i,]
    
        # determine non-na columns
        by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
    
        # anti-join to remove records that match `split.i`
        df.i <- tryCatch(df %>%
            anti_join(split.i, by = by.cols), error = function(e) df)
    
        # calculate average by group
        df.i <- df.i %>%
          group_by(GRP) %>%
          summarize(VAL_MEAN = mean(VAL))
    
        # calculate difference of averages
        DIFF <- df.i[,2] %>%
          as.matrix() %>%
          diff() %>%
          as.numeric()
      })
    
      return(splits)
    }
    
    fcnDiffCalc3 <- function() {
      # table of all combinations of CAT and TYP
      splits <- data.frame(expand.grid(CAT = c(NA, unique(df$CAT)), TYP = c(NA, unique(df$TYP)),
                                       stringsAsFactors = FALSE))
    
      # loop through each combination and calculate the difference between group X and Y
      splits$DIFF <- vapply(1:nrow(splits), function(i) {
    
        split.i <- splits[i,]
    
        # determine non-na columns
        by.cols <- colnames(split.i)[vapply(split.i, function(x) !all(is.na(x)), logical(1))]
    
        # anti-join to remove records that match `split.i`
        df.i <- tryCatch(anti_join(df, split.i, by = by.cols), error = function(e) df)
    
        # calculate average by group
        df.i <- aggregate(VAL ~ GRP, df.i, mean)
    
        # calculate difference of averages
        diff(df.i$VAL)
    
      }, numeric(1))
    
      return(splits)
    }
    
    df_op <- fcnDiffCalc() 
    df_new <- fcnDiffCalc2()
    df_new2 <- fcnDiffCalc3()
    
    identical(df_op, df_new)
    # [1] TRUE
    identical(df_op, df_new2)
    # [1] TRUE
    
    library(microbenchmark)
    
    microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())
    
    # Unit: milliseconds
    #            expr      min       lq     mean   median       uq      max neval
    #   fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960   100
    #  fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297   100
    #  fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758   100