R 使用Lappy使引导功能更有效
我有一个带有数字列的数据框和一个带有标签的字符列。见示例:R 使用Lappy使引导功能更有效,r,function,dplyr,tidyverse,lapply,R,Function,Dplyr,Tidyverse,Lapply,我有一个带有数字列的数据框和一个带有标签的字符列。见示例: library(tidyverse) a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.00446368
library(tidyverse)
a <- c(0.036210845, 0.005546561, 0.004394322 ,0.006635205, 2.269306824 ,0.013542101, 0.006580308 ,0.006854309,0.009076331 ,0.006577178 ,0.099406840 ,0.010962796, 0.011491922,0.007454443 ,0.004463684,0.005836916,0.011119906 ,0.009543205, 0.003990476, 0.007793532 ,0.020776231, 0.011713687, 0.010045341, 0.008411304, 0.032514994)
b <- c(0.030677829, 0.005210211, 0.004164294, 0.006279456 ,1.095908581 ,0.012029876, 0.006193405 ,0.006486812, 0.008589699, 0.006167356, 0.068956516 ,0.010140064 ,0.010602171 ,0.006898081 ,0.004193735, 0.005447855 ,0.009936211, 0.008743681, 0.003774822, 0.007375678, 0.019695336, 0.010827791, 0.009258572, 0.007960328,0.026956408)
c <- c(0.025855453, 0.004882746 ,0.003946182, 0.005929399 ,0.466284591 ,0.010704604 ,0.005815709, 0.006125196, 0.008110854, 0.005769223, 0.046847336, 0.009356712, 0.009803620 ,0.006366758, 0.003936953 ,0.005072295, 0.008885989 ,0.007989028, 0.003565631, 0.006964512, 0.018636187, 0.010009413, 0.008540876, 0.007516569,0.022227924)
label <- c("fa05","fa05" ,"fa05", "fa10", "fa10", "fa10", "fa20","fa20", "faflat", "faflat", "sa05", "sa05", "sa10" , "sa10" , "sa10" , "sa10", "sa10", "sa10", "sa20", "sa20", "sa20" ,"sa20", "saflat", "saflat", "saflat")
dataframe <- as.data.frame(cbind(a,b,c,label))
dataframe <- dataframe %>%
transform(a = as.numeric(a)) %>%
transform(b = as.numeric(b)) %>%
transform(c = as.numeric(c))
我如何使用lapply使其工作?我如何告诉Lappy重新运行函数1000次?我希望你能帮忙。你可以这样做
replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE)
然而,这真的很慢
> system.time(replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE))
user system elapsed
33.83 0.03 33.87
为了加快速度,我们需要重写samp
函数。这里有一个tidyverse
方法
group_sample_size <- c("fa05" = 3, "fa10" = 3, "fa20" = 2, "faflat" = 2, "sa05" = 2, "sa10" = 6, "sa20" = 4, "saflat" = 3)
prep <- function(df, grp_var, sample_size) {
df %>%
mutate(size = sample_size[.data[[grp_var]]]) %>%
group_by(across(!!grp_var))
}
rep_sample <- function(df, n) {
replicate(
n,
df %>%
slice(sample.int(n(), size[[1L]], replace = TRUE)) %>%
summarise(across(a:c, mean), .groups = "drop"),
simplify = FALSE
)
}
dataframe %>%
prep("label", group_sample_size) %>%
rep_sample(1000)
为了提高效率,我认为下面的data.table
方法会更好
library(data.table)
fsamp <- function(df, grp_var, size, nsim) {
df <- as.data.table(df)
group_info <- table(df[[grp_var]], dnn = list(grp_var))
simu_pool <- df[, -grp_var, with = FALSE]
simu_vars <- names(simu_pool)
simu_pool <- split(simu_pool, df[[grp_var]])
out <- data.table(
simu = rep(seq_len(nsim), each = length(group_info)),
group_info
)
out[
, size := size[out[[grp_var]]]
][
, (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
by = c("simu", grp_var)
][]
}
这三种方法都产生相同的结果集
> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
simu label N size a b c
1: 1 fa05 3 3 0.015383909 0.013350778 0.011561460
2: 1 fa10 3 3 0.763161377 0.371405971 0.160972865
3: 1 fa20 2 2 0.006717308 0.006340109 0.005970452
4: 1 faflat 2 2 0.009076331 0.008589699 0.008110854
5: 1 sa05 2 2 0.055184818 0.039548290 0.028102024
---
7996: 1000 faflat 2 2 0.007826754 0.007378527 0.006940039
7997: 1000 sa05 2 2 0.099406840 0.068956516 0.046847336
7998: 1000 sa10 6 6 0.006648513 0.006118159 0.005626362
7999: 1000 sa20 4 4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3 3 0.008411304 0.007960328 0.007516569
> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> system.time(dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000))
user system elapsed
5.80 0.01 5.81
library(data.table)
fsamp <- function(df, grp_var, size, nsim) {
df <- as.data.table(df)
group_info <- table(df[[grp_var]], dnn = list(grp_var))
simu_pool <- df[, -grp_var, with = FALSE]
simu_vars <- names(simu_pool)
simu_pool <- split(simu_pool, df[[grp_var]])
out <- data.table(
simu = rep(seq_len(nsim), each = length(group_info)),
group_info
)
out[
, size := size[out[[grp_var]]]
][
, (simu_vars) := lapply(simu_pool[[.BY[[grp_var]]]][sample.int(N, size, replace = TRUE)], mean),
by = c("simu", grp_var)
][]
}
> system.time(fsamp(dataframe, "label", group_sample_size, 1000))
user system elapsed
1.47 0.04 1.50
> set.seed(124)
> # rbindlist converts a list of tibbles into a single data.table
> dataframe %>% prep("label", group_sample_size) %>% rep_sample(1000) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569
> set.seed(124)
> fsamp(df, "label", group_sample_size, 1000)
simu label N size a b c
1: 1 fa05 3 3 0.015383909 0.013350778 0.011561460
2: 1 fa10 3 3 0.763161377 0.371405971 0.160972865
3: 1 fa20 2 2 0.006717308 0.006340109 0.005970452
4: 1 faflat 2 2 0.009076331 0.008589699 0.008110854
5: 1 sa05 2 2 0.055184818 0.039548290 0.028102024
---
7996: 1000 faflat 2 2 0.007826754 0.007378527 0.006940039
7997: 1000 sa05 2 2 0.099406840 0.068956516 0.046847336
7998: 1000 sa10 6 6 0.006648513 0.006118159 0.005626362
7999: 1000 sa20 4 4 0.020776231 0.019695336 0.018636187
8000: 1000 saflat 3 3 0.008411304 0.007960328 0.007516569
> set.seed(124)
> replicate(1000, samp(dataframe, quo(label), quo(list)), simplify = FALSE) %>% rbindlist()
label a b c
1: fa05 0.015383909 0.013350778 0.011561460
2: fa10 0.763161377 0.371405971 0.160972865
3: fa20 0.006717308 0.006340109 0.005970452
4: faflat 0.009076331 0.008589699 0.008110854
5: sa05 0.055184818 0.039548290 0.028102024
---
7996: faflat 0.007826754 0.007378527 0.006940039
7997: sa05 0.099406840 0.068956516 0.046847336
7998: sa10 0.006648513 0.006118159 0.005626362
7999: sa20 0.020776231 0.019695336 0.018636187
8000: saflat 0.008411304 0.007960328 0.007516569