R 使用Lappy使引导功能更有效

R 使用Lappy使引导功能更有效,r,function,dplyr,tidyverse,lapply,R,Function,Dplyr,Tidyverse,Lapply,我有一个带有数字列的数据框和一个带有标签的字符列。见示例: library(tidyverse) a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.00446368

我有一个带有数字列的数据框和一个带有标签的字符列。见示例:

library(tidyverse)

a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10",  "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" ,  "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
  transform(a = as.numeric(a)) %>%
  transform(b = as.numeric(b)) %>%
  transform(c = as.numeric(c))
我如何使用lapply使其工作?我如何告诉Lappy重新运行函数1000次?我希望你能帮忙。

你可以这样做

replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)
然而,这真的很慢

> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
   user  system elapsed 
  33.83    0.03   33.87
为了加快速度,我们需要重写
samp
函数。这里有一个
tidyverse
方法

group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)

prep <- function(df, grp_var, sample_size) {
  df %>% 
    mutate(size = sample_size[.data[[grp_var]]]) %>% 
    group_by(across(!!grp_var))
}

rep_sample <- function(df, n) {
  replicate(
    n,
    df %>% 
      slice(sample.int(n(), size[[1L]], replace = TRUE)) %>% 
      summarise(across(a:c, mean), .groups = "drop"), 
    simplify = FALSE
  )
}

dataframe %>% 
  prep("label", group_sample_size) %>% 
  rep_sample(1000)
为了提高效率,我认为下面的
data.table
方法会更好

library(data.table)

fsamp <- function(df, grp_var, size, nsim) {
  df <- as.data.table(df)
  group_info <- table(df[[grp_var]], dnn = list(grp_var))
  simu_pool <- df[, -grp_var, with = FALSE]
  simu_vars <- names(simu_pool)
  simu_pool <- split(simu_pool, df[[grp_var]])
  
  out <- data.table(
    simu = rep(seq_len(nsim), each = length(group_info)), 
    group_info
  )
  
  out[
    , size := size[out[[grp_var]]]
  ][
    , (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
    by = c("simu", grp_var)
  ][]
}
这三种方法都产生相同的结果集

> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
      simu  label N size           a           b           c
   1:    1   fa05 3    3 0.015383909 0.013350778 0.011561460
   2:    1   fa10 3    3 0.763161377 0.371405971 0.160972865
   3:    1   fa20 2    2 0.006717308 0.006340109 0.005970452
   4:    1 faflat 2    2 0.009076331 0.008589699 0.008110854
   5:    1   sa05 2    2 0.055184818 0.039548290 0.028102024
  ---                                                       
7996: 1000 faflat 2    2 0.007826754 0.007378527 0.006940039
7997: 1000   sa05 2    2 0.099406840 0.068956516 0.046847336
7998: 1000   sa10 6    6 0.006648513 0.006118159 0.005626362
7999: 1000   sa20 4    4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3    3 0.008411304 0.007960328 0.007516569

> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
   user  system elapsed 
   5.80    0.01    5.81 
library(data.table)

fsamp <- function(df, grp_var, size, nsim) {
  df <- as.data.table(df)
  group_info <- table(df[[grp_var]], dnn = list(grp_var))
  simu_pool <- df[, -grp_var, with = FALSE]
  simu_vars <- names(simu_pool)
  simu_pool <- split(simu_pool, df[[grp_var]])
  
  out <- data.table(
    simu = rep(seq_len(nsim), each = length(group_info)), 
    group_info
  )
  
  out[
    , size := size[out[[grp_var]]]
  ][
    , (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
    by = c("simu", grp_var)
  ][]
}
> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
   user  system elapsed 
   1.47    0.04    1.50
> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569

> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
      simu  label N size           a           b           c
   1:    1   fa05 3    3 0.015383909 0.013350778 0.011561460
   2:    1   fa10 3    3 0.763161377 0.371405971 0.160972865
   3:    1   fa20 2    2 0.006717308 0.006340109 0.005970452
   4:    1 faflat 2    2 0.009076331 0.008589699 0.008110854
   5:    1   sa05 2    2 0.055184818 0.039548290 0.028102024
  ---                                                       
7996: 1000 faflat 2    2 0.007826754 0.007378527 0.006940039
7997: 1000   sa05 2    2 0.099406840 0.068956516 0.046847336
7998: 1000   sa10 6    6 0.006648513 0.006118159 0.005626362
7999: 1000   sa20 4    4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3    3 0.008411304 0.007960328 0.007516569

> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
       label           a           b           c
   1:   fa05 0.015383909 0.013350778 0.011561460
   2:   fa10 0.763161377 0.371405971 0.160972865
   3:   fa20 0.006717308 0.006340109 0.005970452
   4: faflat 0.009076331 0.008589699 0.008110854
   5:   sa05 0.055184818 0.039548290 0.028102024
  ---                                           
7996: faflat 0.007826754 0.007378527 0.006940039
7997:   sa05 0.099406840 0.068956516 0.046847336
7998:   sa10 0.006648513 0.006118159 0.005626362
7999:   sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569