计算并识别R中的唯一关系_R_Count

计算并识别R中的唯一关系

计算并识别R中的唯一关系,r,count,R,Count,我有一个数据框，有两列，如下所示： id1 <- c(123,456,789,122,345,678,901,126,567,890,001,002,130,122) id2 <- c(121,122,123,456,125,126,127,678,129,130,131,132,890,987) df <- cbind(id1,id2) df id1 id2 [1,] 123 121 [2,] 456 122 [3,] 789 123 [4,] 122 456

我有一个数据框，有两列，如下所示：

# Two parallel id vectors: position i of id1 and id2 together form one relation.
id1 <- c(
  123, 456, 789, 122, 345, 678, 901,
  126, 567, 890, 1, 2, 130, 122
)
id2 <- c(
  121, 122, 123, 456, 125, 126, 127,
  678, 129, 130, 131, 132, 890, 987
)

# cbind() on two vectors yields a two-column numeric matrix (not a data.frame).
df <- cbind(id1, id2)
df
  id1 id2
 [1,] 123 121
 [2,] 456 122
 [3,] 789 123
 [4,] 122 456
 [5,] 345 125
 [6,] 678 126
 [7,] 901 127
 [8,] 126 678
 [9,] 567 129
[10,] 890 130
[11,]   1 131
[12,]   2 132
[13,] 130 890
[14,] 122 987

# Encode every row as an "id1 id2" key and as its mirrored "id2 id1" key.
# The example defines the vectors id1 and id2; V1/V2 do not exist here.
forwards <- paste(id1, id2)
backwards <- paste(id2, id1)

# identifying combinations
# A key found in both vectors belongs to a pair that also occurs reversed.
intersect(forwards, backwards)
[1] "456 122" "122 456" "678 126" "126 678" "890 130" "130 890"

#count combinations
# Number of distinct keys shared by forwards and backwards; every reciprocal
# pair contributes two keys, one per direction.
length(intersect(forwards, backwards))
[1] 6

  id1 id2
  678 126
  126 678
  890 130
  130 890
  #count should be equals to 4

因此，新计数应排除 "456 122" 和 "122 456" 这两种情况（122 同时还与 987 配对，所以这一关系不唯一），只统计上面列出的 4 行。完整示例如下：

# Two parallel id vectors: position i of id1 and id2 together form one relation.
id1 <- c(
  123, 456, 789, 122, 345, 678, 901,
  126, 567, 890, 1, 2, 130, 122
)
id2 <- c(
  121, 122, 123, 456, 125, 126, 127,
  678, 129, 130, 131, 132, 890, 987
)

# cbind() on two vectors yields a two-column numeric matrix (not a data.frame).
df <- cbind(id1, id2)
df
  id1 id2
 [1,] 123 121
 [2,] 456 122
 [3,] 789 123
 [4,] 122 456
 [5,] 345 125
 [6,] 678 126
 [7,] 901 127
 [8,] 126 678
 [9,] 567 129
[10,] 890 130
[11,]   1 131
[12,]   2 132
[13,] 130 890
[14,] 122 987

# Encode every row as an "id1 id2" key and as its mirrored "id2 id1" key.
# The example defines the vectors id1 and id2; V1/V2 do not exist here.
forwards <- paste(id1, id2)
backwards <- paste(id2, id1)

# identifying combinations
# A key found in both vectors belongs to a pair that also occurs reversed.
intersect(forwards, backwards)
[1] "456 122" "122 456" "678 126" "126 678" "890 130" "130 890"

#count combinations
# Number of distinct keys shared by forwards and backwards; every reciprocal
# pair contributes two keys, one per direction.
length(intersect(forwards, backwards))
[1] 6

  id1 id2
  678 126
  126 678
  890 130
  130 890
  #count should be equals to 4

我如何才能做到这一点？

以下是我使用 `data.table` 回答您问题的方案。也许有人能帮我们找到一个更直接的解决方案：

library(data.table)

# get vectors as a data.table
df <- data.table(id1, id2)

# Add both key columns by reference in one step: "id1 id2" and its mirror
# image "id2 id1".
df[, `:=`(
  forwards = paste(id1, id2),
  backwards = paste(id2, id1)
)]

# Count the rows whose key also occurs in reversed form somewhere in the table.
df[forwards %in% backwards, .(count = .N)]

>    count
> 1:     6

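library(data.table)
df <- data.table(id1, id2)
df[, forwards := paste(id1, id2)]
df[, backwards := paste(id2, id1)]
df[forwards %in% backwards, .(count = .N)]
>    count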
> 1:     6

这就是你要问的，棘手的部分

# Tag every row with the number of rows that share its id1 value.
df[, pairs := .N, by = id1]

# Collect the ids (from both columns) of rows whose id1 occurs more than once.
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])

# solution: drop rows whose id1 is among the ambiguous ids, and keep rows whose
# id1 appears somewhere in id2 and whose id2 appears somewhere in id1.
# NOTE(review): the %in% tests check each id on its own rather than the
# reversed pair itself, and `pairs` only counts repeats in id1 -- confirm this
# is sufficient for data beyond this example.
df[!(id1 %in% too_many_pairs) & id1 %in% id2 & id2 %in% id1]

>    id1 id2 
> 1: 678 126 
> 2: 126 678 
> 3: 890 130 
> 4: 130 890

# 添加具有 id1 对数的新列
df[, pairs := .N, by = id1]
# 获取具有多个对的所有值
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
# 解决方案
df[id1 %in% id2 & id2 %in% id1 & !(id1 %in% too_many_pairs)]
>id1 id2
> 1: 678 126 
> 2: 126 678 
> 3: 890 130 
> 4: 130 890

解释解决方案：

解决方案的第一部分 `id1 %in% id2 & id2 %in% id1` 表示只保留这样的行：其 id1 的值也出现在 id2 列中，并且其 id2 的值也出现在 id1 列中。

解决方案的第二部分 `!(id1 %in% too_many_pairs)` 表示删除 id1 出现在 too_many_pairs 中的所有行，即涉及多个配对的 id。
以下是我使用 `data.table` 回答您问题的方案。也许有人能帮我们找到一个更直接的解决方案：

library(data.table)
df <- data.table(id1, id2) # get vectors as a data.table

# create forwards and backwards columns
df[, forwards := paste(id1, id2)]
df[, backwards := paste(id2, id1)]

# count number of intersections between forwards and backwards
df[forwards %in% backwards, .(count = .N)]
#>    count
#> 1:     6

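library(data.table)
df <- data.table(id1, id2)
df[, forwards := paste(id1, id2)]
df[, backwards := paste(id2, id1)]
df[forwards %in% backwards, .(count = .N)]
>    count
> 1:     6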
这就是你要问的，棘手的部分

# add new column with number of pairs of id1
df[, pairs := .N, by = id1]

# get all values that have more than one pair
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])

# solution: keep rows whose id1 appears in id2 and whose id2 appears in id1,
# excluding rows whose id1 is among the ids collected in too_many_pairs
df[id1 %in% id2 & id2 %in% id1 & !(id1 %in% too_many_pairs)]
#>    id1 id2
#> 1: 678 126
#> 2: 126 678
#> 3: 890 130
#> 4: 130 890

# 添加具有 id1 对数的新列
df[, pairs := .N, by = id1]
# 获取具有多个对的所有值
too_many_pairs <- as.matrix(df[pairs > 1, .(id1, id2)])
# 解决方案
df[id1 %in% id2 & id2 %in% id1 & !(id1 %in% too_many_pairs)]
>    id1 id2
> 1: 678 126
> 2: 126 678
> 3: 890 130
> 4: 130 890
解释解决方案：
解决方案的第一部分 `id1 %in% id2 & id2 %in% id1` 表示只保留这样的行：其 id1 的值也出现在 id2 列中，并且其 id2 的值也出现在 id1 列中。

解决方案的第二部分 `!(id1 %in% too_many_pairs)` 表示删除 id1 出现在 too_many_pairs 中的所有行，即涉及多个配对的 id。

library(dplyr)
library(tidyr)

# NOTE(review): add_rownames(), gather() and spread() are older verbs; current
# dplyr/tidyr releases recommend tibble::rownames_to_column(), pivot_longer()
# and pivot_wider(). The calls are kept as posted so the output below still
# corresponds to the code.

# make df a data.frame instead of a matrix
data.frame(df) %>%
  # add row index
  add_rownames() %>%
  # melt to long form
  gather(id, val, -rowname) %>%
  # filter down to values repeated an even number of times
  group_by(val) %>%
  filter(n() %% 2 == 0) %>%
  # filter down to rows with two values
  group_by(rowname) %>%
  filter(n() == 2) %>%
  # spread back to wide form
  spread(id, val)
# Source: local data frame [4 x 3]
# Groups: rowname [4]
#
#   rowname   id1   id2
#     (chr) (dbl) (dbl)
# 1      10   890   130
# 2      13   130   890
# 3       6   678   126
# 4       8   126   678

这里有一个Hadleyverse方法：

library(dplyr)
library(tidyr)

# NOTE(review): add_rownames(), gather() and spread() are older verbs; current
# dplyr/tidyr releases recommend tibble::rownames_to_column(), pivot_longer()
# and pivot_wider(). The calls are kept as posted so the output below still
# corresponds to the code.

# make df a data.frame instead of a matrix
data.frame(df) %>%
  # add row index
  add_rownames() %>%
  # melt to long form
  gather(id, val, -rowname) %>%
  # filter down to values repeated an even number of times
  group_by(val) %>%
  filter(n() %% 2 == 0) %>%
  # filter down to rows with two values
  group_by(rowname) %>%
  filter(n() == 2) %>%
  # spread back to wide form
  spread(id, val)
# Source: local data frame [4 x 3]
# Groups: rowname [4]
#
#   rowname   id1   id2
#     (chr) (dbl) (dbl)
# 1      10   890   130
# 2      13   130   890
# 3       6   678   126
# 4       8   126   678

回答得好。加一，回答得好。加一。