在R中-从data.frame中的所有行生成成对data.frame
标签:r, dataframe, data.table, dplyr
我有一个名为 df 的 data.frame,在4列上有800万个观察值:
# Build the 10-row example data frame used in the question.
name <- c("Pablo", "Christina", "Steve", "Diego", "Ali", "Brit", "Ruth", "Mia", "David", "Dylan")
year <- seq(2000, 2009, 1)
# v1 and v2 are random draws from 1..10 (with replacement), so the values
# change from run to run.
v1 <- sample(1:10, 10, replace = TRUE)
v2 <- sample(1:10, 10, replace = TRUE)
# Include all four vectors so that df has the name / year / v1 / v2 columns
# shown in the printed table.
df <- data.frame(name, year, v1, v2)
> df
name year v1 v2
1 Pablo 2000 2 9
2 Christina 2001 5 3
3 Steve 2002 8 9
4 Diego 2003 7 6
5 Ali 2004 2 4
6 Brit 2005 1 1
7 Ruth 2006 10 9
8 Mia 2007 6 7
9 David 2008 10 9
10 Dylan 2009 3 2
最快的方法是什么?
tidyr::crossing
将返回所有观察值的组合,但您需要使用 setNames
等函数设置列名。如果不需要自匹配,可以通过在任何唯一ID列上调用dplyr::filter
来删除它们
library(tidyverse)
# Pair every row of a "_2"-suffixed copy of df with every row of df itself,
# then drop the pairs in which a row is matched with itself (same name).
df_crossed <- filter(
  crossing(setNames(df, paste0(names(df), '_2')), df),
  name != name_2
)
head(df_crossed)
## name_2 year_2 v1_2 v2_2 name year v1 v2
## 1 Pablo 2000 5 5 Christina 2001 7 3
## 2 Pablo 2000 5 5 Steve 2002 1 9
## 3 Pablo 2000 5 5 Diego 2003 2 8
## 4 Pablo 2000 5 5 Ali 2004 9 5
## 5 Pablo 2000 5 5 Brit 2005 8 5
## 6 Pablo 2000 5 5 Ruth 2006 8 1
library(tidyverse)
df_crossed <- df %>%
setNames(paste0(names(.), '_2')) %>%
crossing(df) %>%
filter(name != name_2)
head(df_crossed)
## name_2 year_2 v1_2 v2_2 name year v1 v2
## 1 Pablo 2000 5 5 Christina 2001 7 3
## 2 Pablo 2000 5 5 Steve 2002 1 9
## 3 Pablo 2000 5 5 Diego 2003 2 8
## 4 Pablo 2000 5 5 Ali 2004 9 5
## 5 Pablo 2000 5 5 Brit 2005 8 5
## 6 Pablo 2000 5 5 Ruth 2006 8 1
另一种修复名称的方法是在
交叉后使用janitor::clean_names
,尽管它是一个额外的包。您可以使用data.table
将名称列交叉连接到自身,并删除重复的情况。这样得到的结构更小,可以在其上合并数据,而不必先做完全合并再进行过滤。您可以通过两次合并来添加其余数据:一次合并与第一个名称列关联的数据,另一次合并与第二个名称列关联的数据
# Example data: ten people, one row each.
name <- c("Pablo", "Christina", "Steve", "Diego", "Ali", "Brit", "Ruth", "Mia", "David", "Dylan")
year <- seq(2000, 2009, 1)
# v1 and v2 are random draws from 1..10 (with replacement).
v1 <- sample(1:10, 10, replace = TRUE)
v2 <- sample(1:10, 10, replace = TRUE)
# stringsAsFactors = FALSE keeps `name` as character, so names can be
# compared with `<` as plain strings rather than as factor levels.
df <- data.frame(name, year, v1, v2, stringsAsFactors = FALSE)
library(data.table)
# Convert df to a data.table by reference and key it (no columns given to
# setkey(), so all columns are used).
setDT(df)
setkey(df)
# Cross-join the name column with itself; keeping only V1 < V2 drops both the
# self-pairs and the mirrored duplicates (idea borrowed from Parfait's answer).
name_cj <- CJ(df[, name], df[, name])[V1 < V2]
setnames(name_cj, c("name1", "name2"))
# Attach the data belonging to the first name of each pair ...
name_cj <- merge(name_cj, df, by.x = "name1", by.y = "name")
# ... and then the data belonging to the second name; the clashing column
# names are told apart by the "_1" / "_2" suffixes.
name_cj <- merge(name_cj, df,
                 by.x = "name2", by.y = "name", suffixes = c("_1", "_2"))
# Columns can be rearranged afterwards with setcolorder() if desired.
head(name_cj)
# name2 name1 year_1 v1_1 v2_1 year_2 v1_2 v2_2
#1: Brit Ali 2004 3 8 2005 4 5
#2: Christina Ali 2004 3 8 2001 9 8
#3: Christina Brit 2005 4 5 2001 9 8
#4: David Ali 2004 3 8 2008 5 2
#5: David Brit 2005 4 5 2008 5 2
#6: David Christina 2001 9 8 2008 5 2
这是对 @alistaire 解决方案的扩展,展示了如何把交叉后的矩阵用作索引。按照问题所述,完整的交叉输出
将非常大(800万个项目约产生64万亿行),因此
内存需求确实无法满足。然而,如果
实际用途是处理子集,那么
这里展示的索引技术可能是一种减少内存使用的方法:在交叉操作期间,只交叉整数占用的内存可能更少
library(dplyr)
library(tidyr)
# Cross the row *indices* of df rather than the rows themselves. The two
# index columns are named explicitly so the result does not depend on
# auto-generated (and here identical) column names.
crossed <- as.matrix(crossing(row_1 = seq_len(nrow(df)),
                              row_2 = seq_len(nrow(df))))
# Look the paired rows up by index and name the second copy's columns in the
# same step, so that the self-pair filter can be applied right away.
# data.frame() is called directly; wrapping cbind() in as.data.frame() added
# nothing here.
output <- data.frame(df[crossed[, 1], ],
                     name_2 = df[crossed[, 2], 1],
                     year_2 = df[crossed[, 2], 2],
                     v1_2 = df[crossed[, 2], 3],
                     v2_2 = df[crossed[, 2], 4],
                     check.names = FALSE) %>%
  filter(!(name == name_2 & year == year_2))
# estimated size for 8 million rows given this 10 row sample
# NOTE(review): this scales the measured size linearly (by 8e6 / 10), while
# the number of pairs grows with the square of the input rows - confirm which
# estimate is intended.
format(object.size(output) / (10 / 8e6), units = "MB")
#[1] "5304 Mb"
# Example data: ten people, one row each; v1 / v2 are random draws from 1..10.
df <- data.frame(name = c("Pablo", "Christina", "Steve", "Diego", "Ali",
                          "Brit", "Ruth", "Mia", "David", "Dylan"),
                 year = seq(2000, 2009, 1),
                 v1 = sample(1:10, 10, replace = TRUE),
                 v2 = sample(1:10, 10, replace = TRUE),
                 stringsAsFactors = FALSE)
# MERGE ON KEY, THEN REMOVE KEY COL
# Every row carries the same key value, so merging df with itself on that key
# pairs each row with every row (a cross join). The key is the first column
# of the merged result and is dropped with [, -1].
df$key <- 1
dfm <- merge(df, df, by = "key")[, -1]
# FILTER OUT SAME NAME AND REVERSE DUPS, THEN RENAME COLUMNS
# Keeping only name.x < name.y removes self-pairs and keeps one of the two
# mirrored versions of each pair.
dfm <- setNames(dfm[dfm$name.x < dfm$name.y, ],
                c("name_p1", "year_p1", "V1_p1", "V2_p1",
                  "name_p2", "year_p2", "V1_p2", "V2_p2"))
# ALL PABLO PAIRINGS
dfm[dfm$name_p1 == "Pablo" | dfm$name_p2 == "Pablo", ]
# name_p1 year_p1 V1_p1 V2_p1 name_p2 year_p2 V1_p2 V2_p2
# 3 Pablo 2000 7 8 Steve 2002 3 1
# 7 Pablo 2000 7 8 Ruth 2006 8 4
# 11 Christina 2001 10 10 Pablo 2000 7 8
# 31 Diego 2003 4 9 Pablo 2000 7 8
# 41 Ali 2004 5 3 Pablo 2000 7 8
# 51 Brit 2005 2 4 Pablo 2000 7 8
# 71 Mia 2007 7 7 Pablo 2000 7 8
# 81 David 2008 1 7 Pablo 2000 7 8
# 91 Dylan 2009 9 2 Pablo 2000 7 8
library(dplyr)
library(tidyr)
希望这将给出帖子所有者想要的结果
# Example data: ten people, one row each; v1 / v2 are random draws from 1..10.
name <- c("Pablo", "Christina", "Steve", "Diego", "Ali", "Brit", "Ruth", "Mia", "David", "Dylan")
year <- seq(2000, 2009, 1)
v1 <- sample(1:10, 10, replace = TRUE)
v2 <- sample(1:10, 10, replace = TRUE)
df <- data.frame(name, year, v1, v2, stringsAsFactors = FALSE)
print(df)
rows <- nrow(df)
# Number of unordered pairs of distinct rows.
n <- rows * (rows - 1) / 2
# Row-index pairs (i, j) with i < j, one pair per column, in the order
# (1,2), (1,3), ..., (1,rows), (2,3), ... With fewer than two rows there are
# no pairs, so an empty 2-row index matrix is used instead of calling combn().
pair_idx <- if (rows >= 2) combn(rows, 2) else matrix(integer(0), nrow = 2)
# Look up both members of every pair in one vectorised step instead of
# filling the result row by row.
first <- df[pair_idx[1, ], ]
second <- df[pair_idx[2, ], ]
# The numeric columns are stored as double, matching a result frame that is
# pre-allocated with numeric() columns.
ndf <- data.frame(
  name1 = first$name, year1 = as.numeric(first$year),
  v1_1 = as.numeric(first$v1), v2_1 = as.numeric(first$v2),
  name2 = second$name, year2 = as.numeric(second$year),
  v1_2 = as.numeric(second$v1), v2_2 = as.numeric(second$v2),
  stringsAsFactors = FALSE
)
print(ndf)
# name year v1 v2
#1 Pablo 2000 4 9
#2 Christina 2001 2 1
#3 Steve 2002 2 9
#4 Diego 2003 5 5
#5 Ali 2004 10 4
#6 Brit 2005 5 2
#7 Ruth 2006 7 10
#8 Mia 2007 6 7
#9 David 2008 4 10
#10 Dylan 2009 7 3
# name1 year1 v1_1 v2_1 name2 year2 v1_2 v2_2
#1 Pablo 2000 4 9 Christina 2001 2 1
#2 Pablo 2000 4 9 Steve 2002 2 9
#3 Pablo 2000 4 9 Diego 2003 5 5
#4 Pablo 2000 4 9 Ali 2004 10 4
#5 Pablo 2000 4 9 Brit 2005 5 2
#6 Pablo 2000 4 9 Ruth 2006 7 10
#7 Pablo 2000 4 9 Mia 2007 6 7
#8 Pablo 2000 4 9 David 2008 4 10
#9 Pablo 2000 4 9 Dylan 2009 7 3
#10 Christina 2001 2 1 Steve 2002 2 9
#...
无意添乱,但可以考虑用 base R 的 merge 对同一个数据框做交叉连接,同时过滤掉反向重复项。请注意,过滤之前的交叉连接将返回一个 800万 × 800万 条记录的数据集,所以希望您的内存足以进行这样的操作
library(dplyr)
library(tidyr)
# Cross the row *indices* of df rather than the rows themselves. The two
# index columns are named explicitly so the result does not depend on
# auto-generated (and here identical) column names.
crossed <- as.matrix(crossing(row_1 = seq_len(nrow(df)),
                              row_2 = seq_len(nrow(df))))
# Look the paired rows up by index and name the second copy's columns in the
# same step, so that the self-pair filter can be applied right away.
# data.frame() is called directly; wrapping cbind() in as.data.frame() added
# nothing here.
output <- data.frame(df[crossed[, 1], ],
                     name_2 = df[crossed[, 2], 1],
                     year_2 = df[crossed[, 2], 2],
                     v1_2 = df[crossed[, 2], 3],
                     v2_2 = df[crossed[, 2], 4],
                     check.names = FALSE) %>%
  filter(!(name == name_2 & year == year_2))
# estimated size for 8 million rows given this 10 row sample
# NOTE(review): this scales the measured size linearly (by 8e6 / 10), while
# the number of pairs grows with the square of the input rows - confirm which
# estimate is intended.
format(object.size(output) / (10 / 8e6), units = "MB")
#[1] "5304 Mb"
# Example data: ten people, one row each; v1 / v2 are random draws from 1..10.
df <- data.frame(name = c("Pablo", "Christina", "Steve", "Diego", "Ali",
                          "Brit", "Ruth", "Mia", "David", "Dylan"),
                 year = seq(2000, 2009, 1),
                 v1 = sample(1:10, 10, replace = TRUE),
                 v2 = sample(1:10, 10, replace = TRUE),
                 stringsAsFactors = FALSE)
# MERGE ON KEY, THEN REMOVE KEY COL
# Every row carries the same key value, so merging df with itself on that key
# pairs each row with every row (a cross join). The key is the first column
# of the merged result and is dropped with [, -1].
df$key <- 1
dfm <- merge(df, df, by = "key")[, -1]
# FILTER OUT SAME NAME AND REVERSE DUPS, THEN RENAME COLUMNS
# Keeping only name.x < name.y removes self-pairs and keeps one of the two
# mirrored versions of each pair.
dfm <- setNames(dfm[dfm$name.x < dfm$name.y, ],
                c("name_p1", "year_p1", "V1_p1", "V2_p1",
                  "name_p2", "year_p2", "V1_p2", "V2_p2"))
# ALL PABLO PAIRINGS
dfm[dfm$name_p1 == "Pablo" | dfm$name_p2 == "Pablo", ]
# name_p1 year_p1 V1_p1 V2_p1 name_p2 year_p2 V1_p2 V2_p2
# 3 Pablo 2000 7 8 Steve 2002 3 1
# 7 Pablo 2000 7 8 Ruth 2006 8 4
# 11 Christina 2001 10 10 Pablo 2000 7 8
# 31 Diego 2003 4 9 Pablo 2000 7 8
# 41 Ali 2004 5 3 Pablo 2000 7 8
# 51 Brit 2005 2 4 Pablo 2000 7 8
# 71 Mia 2007 7 7 Pablo 2000 7 8
# 81 David 2008 1 7 Pablo 2000 7 8
# 91 Dylan 2009 9 2 Pablo 2000 7 8
df-Likeidxtidyr::crossing(df,df)
@lukeA 的确如此!但是第二组变量的列名不同。并且输出为data.frame。@alistaire 没错!但第一行包含一个重复项(即Pablo Pablo)。是否可以生成不重复的输出?使用cbind.data.frame
或将其包装在as.data.frame中
。之后,您可以使用names(df)重命名列,谢谢Alistaire!这在我的数据的较小子集上非常有效。但一旦子集变大,R就会尝试分配大小为1338 GB的向量。。。不确定如何处理:-)是的,在所需的行数中有阶乘。它也给了你巴勃罗·克里斯蒂娜和克里斯蒂娜·巴勃罗,也就是说,排列,而不是组合。使用combn
进行索引将允许一个更有限的集合(如果您需要的话),但它仍然会很快变大。不过,您可能不需要复制数据;您可以将一行中的函数应用于其他行,只需存储结果即可。也许这值得另开一个新问题。这种方法不包括互为反向的配对,即 Christina 2001 2 1 Pablo 2000 4 9 不包括在内。要包含反向配对,for循环和内部循环都需要遍历1:rows,并跳过i==j的情况。当然,ndf的大小也需要重新计算。as.data.frame(cbind(…)
是一个坏习惯,经常导致类型问题。只需使用data.frame
。