计算并识别R中的唯一关系
我有一个数据框,有两列,如下所示:
# Example data: two equally long vectors of numeric ids.
# Leading zeros are not kept by numeric literals, so 001 and 002 are just 1 and 2.
id1 <- c(123, 456, 789, 122, 345, 678, 901, 126, 567, 890, 1, 2, 130, 122)
id2 <- c(121, 122, 123, 456, 125, 126, 127, 678, 129, 130, 131, 132, 890, 987)
# Column-bind the vectors into a 14 x 2 numeric matrix with columns id1 and id2
df <- cbind(id1 = id1, id2 = id2)
df
id1 id2
[1,] 123 121
[2,] 456 122
[3,] 789 123
[4,] 122 456
[5,] 345 125
[6,] 678 126
[7,] 901 127
[8,] 126 678
[9,] 567 129
[10,] 890 130
[11,] 1 131
[12,] 2 132
[13,] 130 890
[14,] 122 987
# Build one text key per row in both directions ("id1 id2" and "id2 id1").
# The columns of df are named id1/id2; V1/V2 are not defined anywhere in this
# example, so they are replaced by explicit column lookups.
forwards <- paste(df[, "id1"], df[, "id2"])
backwards <- paste(df[, "id2"], df[, "id1"])
#identifying combinations
intersect(forwards, backwards)
[1] "456 122" "122 456" "678 126" "126 678" "890 130" "130 890"
#count combinations
length(intersect(forwards, backwards))
[1] 6
id1 id2
678 126
126 678
890 130
130 890
#count should be equals to 4
因为 122 同时出现在两对关系中(与 456 以及与 987),所以 456–122 和 122–456 不算唯一关系。因此,新计数应排除这两种情况,并按如下方式计数:
# Example data: two equally long vectors of numeric ids.
# Leading zeros are not kept by numeric literals, so 001 and 002 are just 1 and 2.
id1 <- c(123, 456, 789, 122, 345, 678, 901, 126, 567, 890, 1, 2, 130, 122)
id2 <- c(121, 122, 123, 456, 125, 126, 127, 678, 129, 130, 131, 132, 890, 987)
# Column-bind the vectors into a 14 x 2 numeric matrix with columns id1 and id2
df <- cbind(id1 = id1, id2 = id2)
df
id1 id2
[1,] 123 121
[2,] 456 122
[3,] 789 123
[4,] 122 456
[5,] 345 125
[6,] 678 126
[7,] 901 127
[8,] 126 678
[9,] 567 129
[10,] 890 130
[11,] 1 131
[12,] 2 132
[13,] 130 890
[14,] 122 987
# Build one text key per row in both directions ("id1 id2" and "id2 id1").
# The columns of df are named id1/id2; V1/V2 are not defined anywhere in this
# example, so they are replaced by explicit column lookups.
forwards <- paste(df[, "id1"], df[, "id2"])
backwards <- paste(df[, "id2"], df[, "id1"])
#identifying combinations
intersect(forwards, backwards)
[1] "456 122" "122 456" "678 126" "126 678" "890 130" "130 890"
#count combinations
length(intersect(forwards, backwards))
[1] 6
id1 id2
678 126
126 678
890 130
130 890
#count should be equals to 4
我如何才能做到这一点?
以下是我使用 data.table 回答您问题的方案。也许有人能帮我们找到一个更直接的解决方案。
library(data.table)
# Wrap the two id vectors in a data.table (columns id1 and id2)
df <- data.table(id1 = id1, id2 = id2)
# Add both text keys by reference in a single := call:
# forwards is "id1 id2", backwards is "id2 id1"
df[, `:=`(forwards = paste(id1, id2), backwards = paste(id2, id1))]
# Count the rows whose forward key also occurs among the backward keys
df[forwards %in% backwards, .(count = .N)]
> count
> 1: 6
library(data.table)
df[forwards %in% backwards, .(count = .N)]
> count
> 1: 6
这就是你要问的,棘手的部分
# Tag every row with the number of rows that share its id1 value
df[, pairs := .N, by = id1]
# Collect, as one matrix, every id (from both columns) found on a row whose
# id1 occurs more than once
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
# solution: drop rows whose id1 is among those values, and keep only rows
# where each id also appears somewhere in the opposite column
# NOTE(review): the %in% tests are value-level, so on their own they do not
# confirm that the exact reversed row (id2, id1) exists -- confirm this is
# sufficient for the real data.
df[!(id1 %in% too_many_pairs) & id1 %in% id2 & id2 %in% id1]
> id1 id2
> 1: 678 126
> 2: 126 678
> 3: 890 130
> 4: 130 890
#添加新列,记录每个id1对应的配对数量
df[, pairs := .N, by = id1]
#获取具有多个配对的所有值
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
#解决方案
df[id1 %in% id2 & id2 %in% id1 & !(id1 %in% too_many_pairs)]
>id1 id2
> 1: 678 126
> 2: 126 678
> 3: 890 130
> 4: 130 890
解释解决方案:
解决方案的第一部分 id1 %in% id2 & id2 %in% id1 表示只保留 id1 中也能在 id2 中找到的值,反之亦然。
解决方案的第二部分 !(id1 %in% too_many_pairs) 表示删除具有多个配对的 id1 的所有值。
以下是我使用 data.table 解决该问题的答案。也许有人能帮我们找到一个更直接的解决方案。
library(data.table)
# Wrap the two id vectors in a data.table (columns id1 and id2)
df <- data.table(id1 = id1, id2 = id2)
# Add both text keys by reference in a single := call:
# forwards is "id1 id2", backwards is "id2 id1"
df[, `:=`(forwards = paste(id1, id2), backwards = paste(id2, id1))]
# Count the rows whose forward key also occurs among the backward keys
df[forwards %in% backwards, .(count = .N)]
> count
> 1: 6
library(data.table)
df[forwards %in% backwards, .(count = .N)]
> count
> 1: 6
这就是你要问的,棘手的部分
# Tag every row with the number of rows that share its id1 value
df[, pairs := .N, by = id1]
# Collect, as one matrix, every id (from both columns) found on a row whose
# id1 occurs more than once
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
# solution: drop rows whose id1 is among those values, and keep only rows
# where each id also appears somewhere in the opposite column
# NOTE(review): the %in% tests are value-level, so on their own they do not
# confirm that the exact reversed row (id2, id1) exists -- confirm this is
# sufficient for the real data.
df[!(id1 %in% too_many_pairs) & id1 %in% id2 & id2 %in% id1]
> id1 id2
> 1: 678 126
> 2: 126 678
> 3: 890 130
> 4: 130 890
#添加新列,记录每个id1对应的配对数量
df[, pairs := .N, by = id1]
#获取具有多个配对的所有值
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
#解决方案
df[id1 %in% id2 & id2 %in% id1 & !(id1 %in% too_many_pairs)]
>id1 id2
> 1: 678 126
> 2: 126 678
> 3: 890 130
> 4: 130 890
解释解决方案:
解决方案的第一部分 id1 %in% id2 & id2 %in% id1 表示只保留 id1 中也能在 id2 中找到的值,反之亦然。
解决方案的第二部分 !(id1 %in% too_many_pairs) 表示删除具有多个配对的 id1 的所有值。
library(dplyr)
library(tidyr)
# NOTE(review): add_rownames() is deprecated and gather()/spread() are
# superseded (tibble::rownames_to_column(), pivot_longer()/pivot_wider());
# they are kept here so the result matches the printed output.
# make df a data.frame instead of a matrix, keeping the row index as `rowname`
indexed <- add_rownames(data.frame(df))
# melt to long form: one record per (rowname, id column, value)
long <- gather(indexed, id, val, -rowname)
# keep only values that occur an even number of times across both columns
even_vals <- filter(group_by(long, val), n() %% 2 == 0)
# keep only rows for which both of the row's values survived the previous step
full_rows <- filter(group_by(even_vals, rowname), n() == 2)
# spread back to wide form (result stays grouped by rowname)
spread(full_rows, id, val)
# Source: local data frame [4 x 3]
# Groups: rowname [4]
#
# rowname id1 id2
# (chr) (dbl) (dbl)
# 1 10 890 130
# 2 13 130 890
# 3 6 678 126
# 4 8 126 678
这里有一个Hadleyverse方法:
library(dplyr)
library(tidyr)
# NOTE(review): add_rownames() is deprecated and gather()/spread() are
# superseded (tibble::rownames_to_column(), pivot_longer()/pivot_wider());
# they are kept here so the result matches the printed output.
# make df a data.frame instead of a matrix, keeping the row index as `rowname`
indexed <- add_rownames(data.frame(df))
# melt to long form: one record per (rowname, id column, value)
long <- gather(indexed, id, val, -rowname)
# keep only values that occur an even number of times across both columns
even_vals <- filter(group_by(long, val), n() %% 2 == 0)
# keep only rows for which both of the row's values survived the previous step
full_rows <- filter(group_by(even_vals, rowname), n() == 2)
# spread back to wide form (result stays grouped by rowname)
spread(full_rows, id, val)
# Source: local data frame [4 x 3]
# Groups: rowname [4]
#
# rowname id1 id2
# (chr) (dbl) (dbl)
# 1 10 890 130
# 2 13 130 890
# 3 6 678 126
# 4 8 126 678
回答得好。加一。