R:在大数据集上查找拥有共同第三人的人员对
我有一个如下所示的数据框(标签:r、large-data):
# Example input: one row per (group, person) membership.
df <- data.frame(
  group = rep(c("a", "b", "c"), times = c(2, 3, 1)),
  person = c("Tom", "Jerry", "Tom", "Anna", "Sam", "Nic"),
  stringsAsFactors = FALSE
)
df
group person
1 a Tom
2 a Jerry
3 b Tom
4 b Anna
5 b Sam
6 c Nic
结果数据框基本上是一张表,列出所有拥有共同第三人的人员对(即两个人都与同一个第三人同组)。我在SQL中找到了一种方法,但耗时很长,所以想知道R中是否有高效的实现方式。其基本思路是:先构建一个图,然后对每个节点提取其相邻节点两两组成的配对
library(igraph)

# Distinct members of each group, keyed by group id. De-duplicating here keeps
# a person who is listed twice in one group from being paired with themselves.
X1 <- lapply(split(df$person, df$group), unique)
# A group needs at least two members to contribute a pair.
X2 <- X1[lengths(X1) >= 2]

# Edge list: one row per pair of people who share a group. Each pair is sorted
# so (A, B) and (B, A) become the same row; unique() then removes pairs that
# co-occur in several groups. Without it the graph gets parallel edges, the
# adjacency entries exceed 1, and real neighbours are silently dropped.
dat <- unique(data.frame(
  do.call(rbind, unlist(lapply(X2, function(x) combn(x, 2, sort, FALSE)),
                        recursive = FALSE)),
  stringsAsFactors = FALSE
))
g <- graph_from_data_frame(dat, directed = FALSE)

# Dense adjacency table with one row and one column per person.
# check.names = FALSE keeps names such as "Mary Ann" intact instead of
# rewriting them to "Mary.Ann".
# NOTE: this is a dense n x n table, so memory grows quadratically with the
# number of people; it is only practical for moderately sized inputs.
mydf <- data.frame(as_adjacency_matrix(g, sparse = FALSE), check.names = FALSE)
# Only people with at least two neighbours can be a "common third".
mydf <- mydf[colSums(mydf) > 1]

# For each remaining person, all pairs of their neighbours (one pair per row).
# lapply() always returns a list; sapply() would collapse the result into a
# matrix whenever every person has the same number of neighbours.
ANS <- lapply(mydf, function(x) t(combn(row.names(mydf)[x > 0], 2)))

# Stack the per-person pair tables, tagging each row with the shared person.
do.call(rbind, lapply(names(ANS), function(nm) {
  data.frame(ANS[[nm]], nm, stringsAsFactors = FALSE)
}))
# X1 X2 nm
#1 Sam Tom Anna
#2 Anna Tom Sam
#3 Jerry Anna Tom
#4 Jerry Sam Tom
#5 Anna Sam Tom
或
# Every person who belongs to at least one group with two or more members.
mynames <- unique(unlist(X2, use.names = FALSE))
# Read the vertex names once instead of on every iteration.
vertex_names <- V(g)$name

# For each person x, list every pair (P1, P2) of x's neighbours, with x as the
# shared third person P3. People with fewer than two neighbours get a single
# NA row so that they still appear in the result.
do.call(rbind, lapply(mynames, function(x) {
  # setdiff() removes repeated neighbours (parallel edges arise when two
  # people share several groups) and drops x itself in case of a self-loop.
  L <- setdiff(vertex_names[unlist(adjacent_vertices(graph = g, v = x))], x)
  if (length(L) >= 2) {
    setNames(data.frame(t(combn(L, 2)), x, stringsAsFactors = FALSE),
             c("P1", "P2", "P3"))
  } else {
    setNames(data.frame(NA, NA, x, stringsAsFactors = FALSE),
             c("P1", "P2", "P3"))
  }
}))
# P1 P2 P3
#1 Jerry Anna Tom
#2 Jerry Sam Tom
#3 Anna Sam Tom
#4 <NA> <NA> Jerry
#5 Sam Tom Anna
#6 Anna Tom Sam
它适用于正常大小的数据集,但在处理实际数据时,当我尝试通过邻接矩阵构建 mydf(g 的大小几乎为 1 GB)时遇到了内存问题。我试过了,但我想是矩阵太大了。我也尝试过稀疏矩阵,但没能弄明白。我会再多试试,然后接受你的答案,因为它解决了一般性的问题。
@d.b 第二个版本不会崩溃,但仍在运行。我会告诉你结果如何。
@d.b 它运行了三天,所以我不得不把它停掉。可能得想办法把数据拆分开来处理
library(igraph)

# Distinct members of each group, keyed by group id. De-duplicating here keeps
# a person who is listed twice in one group from being paired with themselves.
X1 <- lapply(split(df$person, df$group), unique)
# A group needs at least two members to contribute a pair.
X2 <- X1[lengths(X1) >= 2]

# Edge list: one row per pair of people who share a group. Each pair is sorted
# so (A, B) and (B, A) become the same row; unique() then removes pairs that
# co-occur in several groups. Without it the graph gets parallel edges, the
# adjacency entries exceed 1, and real neighbours are silently dropped.
dat <- unique(data.frame(
  do.call(rbind, unlist(lapply(X2, function(x) combn(x, 2, sort, FALSE)),
                        recursive = FALSE)),
  stringsAsFactors = FALSE
))
g <- graph_from_data_frame(dat, directed = FALSE)

# Dense adjacency table with one row and one column per person.
# check.names = FALSE keeps names such as "Mary Ann" intact instead of
# rewriting them to "Mary.Ann".
# NOTE: this is a dense n x n table, so memory grows quadratically with the
# number of people; it is only practical for moderately sized inputs.
mydf <- data.frame(as_adjacency_matrix(g, sparse = FALSE), check.names = FALSE)
# Only people with at least two neighbours can be a "common third".
mydf <- mydf[colSums(mydf) > 1]

# For each remaining person, all pairs of their neighbours (one pair per row).
# lapply() always returns a list; sapply() would collapse the result into a
# matrix whenever every person has the same number of neighbours.
ANS <- lapply(mydf, function(x) t(combn(row.names(mydf)[x > 0], 2)))

# Stack the per-person pair tables, tagging each row with the shared person.
do.call(rbind, lapply(names(ANS), function(nm) {
  data.frame(ANS[[nm]], nm, stringsAsFactors = FALSE)
}))
# X1 X2 nm
#1 Sam Tom Anna
#2 Anna Tom Sam
#3 Jerry Anna Tom
#4 Jerry Sam Tom
#5 Anna Sam Tom
# Every person who belongs to at least one group with two or more members.
mynames <- unique(unlist(X2, use.names = FALSE))
# Read the vertex names once instead of on every iteration.
vertex_names <- V(g)$name

# For each person x, list every pair (P1, P2) of x's neighbours, with x as the
# shared third person P3. People with fewer than two neighbours get a single
# NA row so that they still appear in the result.
do.call(rbind, lapply(mynames, function(x) {
  # setdiff() removes repeated neighbours (parallel edges arise when two
  # people share several groups) and drops x itself in case of a self-loop.
  L <- setdiff(vertex_names[unlist(adjacent_vertices(graph = g, v = x))], x)
  if (length(L) >= 2) {
    setNames(data.frame(t(combn(L, 2)), x, stringsAsFactors = FALSE),
             c("P1", "P2", "P3"))
  } else {
    setNames(data.frame(NA, NA, x, stringsAsFactors = FALSE),
             c("P1", "P2", "P3"))
  }
}))
# P1 P2 P3
#1 Jerry Anna Tom
#2 Jerry Sam Tom
#3 Anna Sam Tom
#4 <NA> <NA> Jerry
#5 Sam Tom Anna
#6 Anna Tom Sam