
Creating all sampling combinations from two groups of columns in R


I have the data frame below, with two "groups" of columns: columns A and B, and columns D and E. I want to find all combinations and then group by all combinations of applying different filters on columns A and B and on columns D and E, but taking only one column from each group at a time. I don't know the right way to formulate this, and the real problem is actually bigger.

df=

So the combinations to filter by should be:

Filter 1: A=1 and D=1
Filter 2: A=1 and D=0
Filter 3: A=1 and E=1
Filter 4: A=1 and E=0
Filter 5: A=0 and D=1
Filter 6: A=0 and D=0
Filter 7: A=0 and E=1
Filter 8: A=0 and E=0
Filter 9: B=1 and D=1
Filter 10: B=1 and D=0
Filter 11: B=1 and E=1
Filter 12: B=1 and E=0
Filter 13: B=0 and D=1
Filter 14: B=0 and D=0
Filter 15: B=0 and E=1
Filter 16: B=0 and E=0

I would like to find a way to create these filter groups efficiently (always drawing one filter from columns A & B and one from columns D & E), and then compute the mean and the count of the Size column for each filter setting. So far I have only managed to do this without the distinct groups to sample the filters from.
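For concreteness, the 16 filters above can be enumerated as a grid: one column chosen from {A, B}, one from {D, E}, and a 0/1 value for each choice (a minimal sketch with illustrative names, not code from the question):

filter_grid <- expand.grid(var1 = c("A", "B"), var2 = c("D", "E"),
                           val1 = 0:1, val2 = 0:1,
                           stringsAsFactors = FALSE)
nrow(filter_grid)  # 16, one row per Filter 1-16 above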

The form of what I tried is:

groupNames %>% summarise(n = length(Size), avgVar1 = mean(Size))

which treats the four columns equally and does not take the sampling from the two groups into account. What can I change in the code to achieve this?

Many thanks.

library(tidyverse)
df <- tribble(~Size, ~A, ~B, ~D, ~E,
              1, "1", "1", "0", "0",
              5, "0", "0", "1", "0",
              10, "1", "1", "1", "0",
              3, "1", "0", "0", "0",
              2, "1", "1", "1", "1",
              55, "0", "0", "0", "1",
              5, "1", "0", "1", "1",
              2, "0", "0", "1", "1",
              1, "1", "1", "1", "1",
              4, "1", "1", "1", "0")
p <- function(...) paste0(...) # for legibility, should rather use glue

all_filtering_groups <- list(c("A", "B"), c("D", "E")) # assuming these are known
all_combns <- map(1:length(all_filtering_groups), ~ combn(all_filtering_groups, .))
res <- list(length(all_combns))

#microbenchmark::microbenchmark({
for(comb_length in seq_along(all_combns)){
  res[[comb_length]] <- list(ncol(all_combns[[comb_length]]))
  for(col_i in seq_len(ncol(all_combns[[comb_length]]))){
    
    filtering_groups <- all_combns[[comb_length]][,col_i]
    group_names <- as.character(seq_along(filtering_groups))
    
    
    # prepare grid of all combinations
    filtering_combs <- c(filtering_groups, rep(list(0:1), length(filtering_groups)))
    names(filtering_combs) <- c(p("vars_", group_names), p("vals_", group_names))
    full_grid <- expand.grid(filtering_combs)
    
    for(ll in 1:nrow(full_grid)){ # for each line in the full_grid
      # find df lines that correspond
      cond <- as.logical(rep(TRUE, nrow(df)))
      for(grp in group_names){
        cond <- cond & df[[full_grid[p("vars_", grp)][ll,]]] == full_grid[p("vals_", grp)][ll,]
      }
      # and compute whatever
      full_grid$lines[ll] <- paste(which(cond), collapse = ", ") #for visual verification
      full_grid$n[ll] <- length(df$Size[cond])
      full_grid$sum[ll] <- sum(df$Size[cond])
      full_grid$mean[ll] <- mean(df$Size[cond])
    }
    res[[comb_length]][[col_i]] <- full_grid
    
  }
}
#}, times = 10) #microbenchmark

bind_rows(res) %>% relocate(starts_with("vars") | starts_with("vals"))
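Reading the result: res[[2]] holds the combinations that use both groups at once, i.e. exactly the 16 filters from the question. This note is an interpretation of the code above rather than part of the original answer:

# keep only the filters that draw one column from each of the two groups (16 rows)
bind_rows(res[[2]]) %>% relocate(starts_with("vars") | starts_with("vals"))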

Following the discussion in the comments, I think we can treat the groups as variables. So we need to reshape the data frame so that there is one column per factor, and then we can use standard tidyverse methods. I'm assuming the groups are defined by the column names (A1…Ak, B1…Bk, …).

library(tidyverse)
df <- tribble(~Size, ~A1, ~A2, ~B1, ~B2,
              1, "1", "1", "0", "0",
              5, "0", "0", "1", "0",
              10, "1", "1", "1", "0",
              3, "1", "0", "0", "0",
              2, "1", "1", "1", "1",
              55, "0", "0", "0", "1",
              5, "1", "0", "1", "1",
              2, "0", "0", "1", "1",
              1, "1", "1", "1", "1",
              4, "1", "1", "1", "0")

get_levels <- function(col){
  paste(names(col)[col == "1"], collapse = ",")
}
# Rewrite with groups as factors
df_factors <- df %>%
  mutate(id = row_number()) %>%  # to avoid aggregating rows with the same Size
  nest(A = starts_with("A"), B = starts_with("B")) %>%
  mutate(A = factor(map_chr(A, get_levels)),
         B = factor(map_chr(B, get_levels)))

# Now look at factor combinations
df_factors %>%
  group_by(A, B) %>%
  summarize(n = n(),
            mean = mean(Size))

# A tibble: 8 x 4
# Groups:   A [3]
#   A       B           n  mean
#   <fct>   <fct>   <int> <dbl>
# 1 ""      "B1"        1   5  
# 2 ""      "B1,B2"     1   2  
# 3 ""      "B2"        1  55  
# 4 "A1"    ""          1   3  
# 5 "A1"    "B1,B2"     1   5  
# 6 "A1,A2" ""          1   1  
# 7 "A1,A2" "B1"        2   7  
# 8 "A1,A2" "B1,B2"     2   1.5

I've named the groups explicitly ("A" and "B") in nest(). With 6 groups that still seems doable; if you have more, it would be necessary to automate it, but I'm not sure how to do that easily.
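One possible way to automate the recoding for an arbitrary number of groups (a sketch, not from the original answer; group_prefixes and recode_group are illustrative names, and it assumes every group shares a column-name prefix as above):

group_prefixes <- c("A", "B")   # extend to c("A", "B", "C", ...) as needed

recode_group <- function(data, prefix) {
  cols <- grep(paste0("^", prefix), names(data), value = TRUE)
  # for each row, paste together the names of this group's columns that equal "1"
  factor(apply(data[cols], 1, function(r) paste(cols[r == "1"], collapse = ",")))
}

df_factors <- df %>% mutate(id = row_number())
for (g in group_prefixes) df_factors[[g]] <- recode_group(df, g)

df_factors %>%
  group_by(across(all_of(group_prefixes))) %>%
  summarize(n = n(), mean = mean(Size), .groups = "drop")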

Comments are not for extended discussion; this conversation has ended.

Thank you, Alexlo. I tried running it with 6 groups and 3 filters per group, but it takes a very long time to compute. I also realized it tests the cases where filters are set to 0 (e.g. A1=1, B1=0, C3=0, D1=0, E2=0, F=0). How can I remove that, so that only the combinations where the filters are 1 are tested (plus of course the NAs where the corresponding filter is not used at all)?

I don't know why it is still slow; can you work out which line is the slow one? To remove the filters that are set to 0, you can use filter(str_length(A) > 1) after the recoding. To remove the rows where all filters are set to 0, you can use df_factors %>% rowwise() %>% filter(sum(across(A:B, str_length)) > 1).
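Putting the comment's suggestion together with the grouping from the answer (a sketch; the > 1 threshold is taken from the comment, and ungroup() is added so the row-wise grouping does not carry over into group_by()):

df_factors %>%
  rowwise() %>%
  filter(sum(across(A:B, str_length)) > 1) %>%  # drop rows where every filter is 0
  ungroup() %>%
  group_by(A, B) %>%
  summarize(n = n(), mean = mean(Size), .groups = "drop")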