R中列表中相交向量的并集_R_List_Data.table_Lapply_Set Operations

R中列表中相交向量的并集

r list

R中列表中相交向量的并集,r,list,data.table,lapply,set-operations,R,List,Data.table,Lapply,Set Operations,我有一个向量列表，如下所示 data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"), v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i")) Reduce(union, list(data[[1]], data[[3]], data[[4]])) Reduce(union, list(data[[2]], data[[5]]) 如何首先识别相交向量？有没有办法将列

我有一个向量列表，如下所示

data <- list(v1=c("a", "b", "c"), v2=c("g", "h", "k"), 
             v3=c("c", "d"), v4=c("n", "a"), v5=c("h", "i"))

 Reduce(union, list(data[[1]], data[[3]], data[[4]]))
 Reduce(union, list(data[[2]], data[[5]])

如何首先识别相交向量？有没有办法将列表划分为相交向量组的列表

更新下面是使用data.table的尝试。获取所需的结果。但对于这个数据集中的大型列表，速度仍然很慢

数据集。
数据一种选择是使用combn
，然后找到交点。会有更简单的选择
indx <- combn(names(data),2)
lst <- lapply(split(indx, col(indx)), 
        function(i) Reduce(`intersect`,data[i]))
indx1 <- names(lst[sapply(lst, length)>0])
indx2 <- indx[,as.numeric(indx1)]
indx3 <- apply(indx2,2, sort)
lapply(split(1:ncol(indx3), indx3[1,]),
   function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE)))
#$v1
#[1] "a" "b" "c" "d" "n"

#$v2
#[1] "g" "h" "k" "i"

与最快的
相比。这些是基于OP对@docendo discimus的评论而修改的函数
akrun2M <- function(){
     ind <- sapply(seq_along(data), function(i){#copied from @docendo discimus
            !any(data[[i]] %in% unlist(data[-i]))
              })
     data1 <- data[!ind] 
     indx <- combnPrim(names(data1),2)
     lst <- lapply(split(indx, col(indx)), 
              function(i) Reduce(`intersect`,data1[i]))
     indx1 <- names(lst[sapply(lst, length)>0])
     indx2 <- indx[,as.numeric(indx1)]
     indx3 <- apply(indx2,2, sort)
     c(data[ind],lapply(split(1:ncol(indx3), indx3[1,]),
        function(i) unique(unlist(data[c(indx3[,i])], use.names=FALSE))))
   } 

doc2 <- function(){
      x <- lapply(seq_along(data), function(i) {
          if(!any(data[[i]] %in% unlist(data[-i]))) {
               data[[i]]
           } 
          else {
            z <- unlist(data[names(unlist(lapply(data[-c(1:i)],
                                     intersect, data[[i]])))]) 
          if (is.null(z)){ 
               z
               }
          else union(data[[i]], z)
        }
   })
x[!sapply(x, is.null)]
}

这有点像一个图形问题，所以我喜欢使用igraph
库，使用您的示例数据，您可以
library(igraph)
#build edgelist
el <- do.call("rbind",lapply(data, embed, 2))
#make a graph
gg <- graph.edgelist(el, directed=F)
#partition the graph into disjoint sets
split(V(gg)$name, clusters(gg)$membership)

# $`1`
# [1] "b" "a" "c" "d" "n"
# 
# $`2`
# [1] "h" "g" "k" "i"

这里是另一种只使用基本R的方法
更新
akrun评论后的下一次更新及其示例数据：
data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

数据效率真该死，你们这些人睡觉吗？仅以R为基数，比最快答案慢得多。既然是我写的，不妨把它贴出来
f.union = function(x) {
  repeat{
    n = length(x)
    m = matrix(F, nrow = n, ncol = n)
    for (i in 1:n){
      for (j in 1:n) {
        m[i,j] = any(x[[i]] %in% x[[j]])
      }
    }
    o = apply(m, 2, function(v) Reduce(union, x[v]))
    if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
  }
}

f.union(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

因为我喜欢慢。（加载的库在基准之外）
单位：微秒
expr最小lq平均uq最大neval
vlo（）896.435 1070.6540 1315.8194 1129.4710 1328.6630 7859.999 1000
akrun（）596.263 658.6590 789.9889 694.1360 804.9035 3470.158 1000
flick（）805.854 928.8160 1160.9509 1001.8345 1172.0965 5780.824 1000
josh（）2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909 1000一般来说，你不能比Floyd Warshall算法做得更好/更快，如下所示：
library(Rcpp)

cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
    int n = w.nrow();
    for( int k = 0; k < n; k++ )
     for( int i = 0; i < (n-1); i++ )
      for( int j = i+1; j < n; j++ ) 
       if( w(i,k) && w(k,j) ) {
        w(i,j) = true;
        w(j,i) = true;
       }
   return w;
}")

fw.union<-function(x) {
  n<-length(x)
  w<-matrix(F,nrow=n,ncol=n)
  for( i in 1:n ) {
   w[i,i]<-T
  }
  for( i in 1:(n-1) ) {
   for( j in (i+1):n ) {
     w[i,j]<-w[j,i]<- any(x[[i]] %in% x[[j]])
   }
  }
 apply( unique( floyd(w) ), 1, function(y) { Reduce(union,x[y]) } )
}

库（Rcpp）
cpp函数(
“LogicalMatrix floyd（LogicalMatrix w）{
int n=w.nrow（）；
对于（int k=0；kfw.union我遇到了一个类似的问题，促使我到处寻找解决方案。我终于找到了一个非常好的函数，这要感谢这里的许多伟大的贡献者，但是当我看到这篇文章时，我想我会为此编写自己的自定义函数。它实际上并不优雅，速度也太慢，但我认为它相当有效，目前可以做到这一点，直到我做出一些改进：
anoush <- function(x) {
# First we check whether x is a list

  stopifnot(is.list(x)) 

# Then we take every element of the input and calculate the intersect between
# that element & others. In case there were some we would store the indices 
# in `vec` vector. So in the end we have a list called `ind` whose elements 
# are all the indices connected with the corresponding elements of the original 
# list for example first element of `ind` is `1`, `2`, `3` which means in 
# the original list these elements have common values.
  
  ind <- lapply(1:length(x), function(a) {
    vec <- c()
    for(i in 1:length(x)) {
      if(length(unique(base::intersect(x[[a]], x[[i]]))) > 0) {
        vec <- c(vec, i)
      }
    }
    vec 
    })

# Then we go on to again compare each element of `ind` with other elements
# in case there were any intersect, we will calculate the `union` of them.
# for each element we will end up with a list of accumulated values but
# but in the end we use `Reduce` to capture only the last one. So for each
# element of `ind` we end up having a collection of indices that also 
# result in duplicated values. For example elements `1` through `5` of 
# `dup_ind` contains the same value cause in the original list these 
# elements have common values.

  dup_ind <- lapply(1:length(ind), function(a) {
    out <- c()
    for(i in 1:length(ind)) {
      if(length(unique(base::intersect(ind[[a]], ind[[i]]))) > 0) {
        out[[i]] <- union(ind[[a]], ind[[i]])
      }
      vec2 <- Reduce("union", out)
    }
    vec2
  }) 

# Here we get rid of the duplicated elements of the list by means of 
# `relist` funciton and since in this process all the duplicated elements
# will turn to `integer(0)` I have filtered those out.
  
  un <- unlist(dup_ind)
  res <- Map(`[`, dup_ind, relist(!duplicated(un), skeleton = dup_ind))
  res2 <- Filter(length, res)
  
  sapply(res2, function(a) unique(unlist(lapply(a, function(b) `[[`(x, b)))))
  
}

亲爱的@akrun的数据样本
> anoush(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

> anoush(data)
[[1]]
[1] "g" "k"

[[2]]
[1] "a" "b" "c" "d"

data-anoush（数据）
[[1]]
[1] “g”“k”
[[2]]
[1] “a”“b”“c”“d”
我太累了。去睡觉。我认为有第五个答案需要RGBL，但我可能只是在想象。你的解决方案是最快的。好吧，RBGL是一种很酷的生物导体封装。刚刚运行了作者删除的第五个答案。“咖啡！因为你死后可以睡觉！”（不是我的；例如，在CafePress很容易找到。）@Crops，说得对！我用一个修改过的函数更新了我的答案这似乎与示例数据表现不好：data Good catch，@MrFlick！我再次更新了我的答案。@Docendiscimus刚刚看到了你的更新。但是，它对data@akrun仍然不起作用，我提供了另一个更新的答案。你可能也想检查一下你的电脑，看看有没有这些好的解决方案，但内存可能是个瓶颈。
x <- lapply(seq_along(data), function(i) {
  if(!any(data[[i]] %in% unlist(data[-i]))) {
    data[[i]]
  } else if (any(data[[i]] %in% unlist(data[seq_len(i-1)]))) {
    NULL 
  } else {
    z <- lapply(data[-seq_len(i)], intersect,  data[[i]]) 
    z <- names(z[sapply(z, length) >= 1L])
    if (is.null(z)) NULL else union(data[[i]], unlist(data[z]))
  }
})
x[!sapply(x, is.null)]
#[[1]]
#[1] "g" "k"
#
#[[2]]
#[1] "a" "b" "c" "d"

f.union = function(x) {
  repeat{
    n = length(x)
    m = matrix(F, nrow = n, ncol = n)
    for (i in 1:n){
      for (j in 1:n) {
        m[i,j] = any(x[[i]] %in% x[[j]])
      }
    }
    o = apply(m, 2, function(v) Reduce(union, x[v]))
    if (all(apply(m, 1, sum)==1)) {return(o)} else {x=unique(o)}
  }
}

f.union(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

Unit: microseconds
    expr      min        lq      mean    median        uq       max neval
   vlo()  896.435 1070.6540 1315.8194 1129.4710 1328.6630  7859.999  1000
 akrun()  596.263  658.6590  789.9889  694.1360  804.9035  3470.158  1000
 flick()  805.854  928.8160 1160.9509 1001.8345 1172.0965  5780.824  1000
  josh() 2427.752 2693.0065 3344.8671 2943.7860 3524.1550 16505.909  1000 <- deleted :-(
   doc()  254.462  288.9875  354.6084  302.6415  338.9565  2734.795  1000

library(Rcpp)

cppFunction(
  "LogicalMatrix floyd(LogicalMatrix w){
    int n = w.nrow();
    for( int k = 0; k < n; k++ )
     for( int i = 0; i < (n-1); i++ )
      for( int j = i+1; j < n; j++ ) 
       if( w(i,k) && w(k,j) ) {
        w(i,j) = true;
        w(j,i) = true;
       }
   return w;
}")

fw.union<-function(x) {
  n<-length(x)
  w<-matrix(F,nrow=n,ncol=n)
  for( i in 1:n ) {
   w[i,i]<-T
  }
  for( i in 1:(n-1) ) {
   for( j in (i+1):n ) {
     w[i,j]<-w[j,i]<- any(x[[i]] %in% x[[j]])
   }
  }
 apply( unique( floyd(w) ), 1, function(y) { Reduce(union,x[y]) } )
}

anoush <- function(x) {
# First we check whether x is a list

  stopifnot(is.list(x)) 

# Then we take every element of the input and calculate the intersect between
# that element & others. In case there were some we would store the indices 
# in `vec` vector. So in the end we have a list called `ind` whose elements 
# are all the indices connected with the corresponding elements of the original 
# list for example first element of `ind` is `1`, `2`, `3` which means in 
# the original list these elements have common values.
  
  ind <- lapply(1:length(x), function(a) {
    vec <- c()
    for(i in 1:length(x)) {
      if(length(unique(base::intersect(x[[a]], x[[i]]))) > 0) {
        vec <- c(vec, i)
      }
    }
    vec 
    })

# Then we go on to again compare each element of `ind` with other elements
# in case there were any intersect, we will calculate the `union` of them.
# for each element we will end up with a list of accumulated values but
# but in the end we use `Reduce` to capture only the last one. So for each
# element of `ind` we end up having a collection of indices that also 
# result in duplicated values. For example elements `1` through `5` of 
# `dup_ind` contains the same value cause in the original list these 
# elements have common values.

  dup_ind <- lapply(1:length(ind), function(a) {
    out <- c()
    for(i in 1:length(ind)) {
      if(length(unique(base::intersect(ind[[a]], ind[[i]]))) > 0) {
        out[[i]] <- union(ind[[a]], ind[[i]])
      }
      vec2 <- Reduce("union", out)
    }
    vec2
  }) 

# Here we get rid of the duplicated elements of the list by means of 
# `relist` funciton and since in this process all the duplicated elements
# will turn to `integer(0)` I have filtered those out.
  
  un <- unlist(dup_ind)
  res <- Map(`[`, dup_ind, relist(!duplicated(un), skeleton = dup_ind))
  res2 <- Filter(length, res)
  
  sapply(res2, function(a) unique(unlist(lapply(a, function(b) `[[`(x, b)))))
  
}

> anoush(data)

[[1]]
[1] "a" "b" "c" "d" "n"

[[2]]
[1] "g" "h" "k" "i"

data <- list(v1=c('g', 'k'), v2= letters[1:4], v3= c('b', 'c', 'd', 'a'))

> anoush(data)
[[1]]
[1] "g" "k"

[[2]]
[1] "a" "b" "c" "d"