根据R中的条件添加元素rep的列和nb

根据R中的条件添加元素rep的列和nb,r,dataframe,dplyr,R,Dataframe,Dplyr,我有一个数据帧 Groups Names COL1 COL2 COL3 COL4 1 G1 SP1 1 0.400 0.500 Sequence1 2 G1 SP1 1 0.004 0.005 Sequence2 3 G1 SP1 0 0.004 0.005 Sequence3 4 G1 SP2 0 0.400 0.005 Sequence123 5 G1 SP2

我有一个数据帧

   Groups Names COL1  COL2  COL3        COL4
1      G1   SP1    1 0.400 0.500   Sequence1
2      G1   SP1    1 0.004 0.005   Sequence2
3      G1   SP1    0 0.004 0.005   Sequence3
4      G1   SP2    0 0.400 0.005 Sequence123
5      G1   SP2    0 0.004 0.500  Sequence14
6      G1   SP3    0 0.005 0.006  Sequence15
7      G1   SP5    1 0.400 0.006  Sequence16
8      G1   SP6    1 0.008 0.002  Sequence20
10     G2   Sp1    0 0.004 0.005  Sequence17
11     G2   SP1    0 0.050 0.600  Sequence18
12     G2   SP1    0 0.400 0.600   Sequence3
13     G2   SP2    0 0.004 0.005  Sequence22
14     G2   SP2    0 0.004 0.005  Sequence23
15     G2   SP5    0 0.004 0.005  Sequence16
16     G2   SP6    0 0.003 0.002  Sequence21
17     G2   SP7    0 0.560 0.760  Sequence67
18     G3   SP5    0 0.87  0.767  Sequence16
我想添加一个新列
COL5

其中,对于每个组中的每个名称,如果它的某个序列也出现在其他组的同一名称下(即存在共享序列),就在该行填入共享该序列的组数;否则填0。例如,让我们看看G1

SP1
具有
Sequence3
,该序列出现在
G1
G2
中,因此我为
第3行和第12行填入了重复次数(此处为2)

SP5
相同,它在
G1
中有
Sequence16
,在
G2
G3
中有
Sequence16
,(此处重复次数为3)

因此,对于我应该得到的所有数据帧:

   Groups Names COL1  COL2  COL3        COL4 COL5
1      G1   SP1    1 0.400 0.500   Sequence1 0
2      G1   SP1    1 0.004 0.005   Sequence2 0
3      G1   SP1    0 0.004 0.005   Sequence3 2
4      G1   SP2    0 0.400 0.005 Sequence123 0
5      G1   SP2    0 0.004 0.500  Sequence14 0
6      G1   SP3    0 0.005 0.006  Sequence15 0
7      G1   SP5    1 0.400 0.006  Sequence16 3
8      G1   SP6    1 0.008 0.002  Sequence20 0
10     G2   Sp1    0 0.004 0.005  Sequence17 0
11     G2   SP1    0 0.050 0.600  Sequence18 0
12     G2   SP1    0 0.400 0.600   Sequence3 2
13     G2   SP2    0 0.004 0.005  Sequence22 0
14     G2   SP2    0 0.004 0.005  Sequence23 0
15     G2   SP5    0 0.004 0.005  Sequence16 3
16     G2   SP6    0 0.003 0.002  Sequence21 0
17     G2   SP7    0 0.560 0.760  Sequence67 0
18     G3   SP5    0 0.87  0.767  Sequence16 3
以下是
dput

# Reproducible input: the data frame shown above, in `dput()` form.
# Each column has 17 entries (one per row name) and every factor has
# unique levels, so the object can be rebuilt by running this statement.
test_df <- structure(list(
  Groups = structure(
    c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L),
    .Label = c("G1", "G2", "G3"), class = "factor"
  ),
  Names = structure(
    c(2L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 1L, 2L, 2L, 3L, 3L, 5L, 6L, 7L, 5L),
    .Label = c("Sp1", "SP1", "SP2", "SP3", "SP5", "SP6", "SP7"),
    class = "factor"
  ),
  COL1 = c(1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
  COL2 = c(0.4, 0.004, 0.004, 0.4, 0.004, 0.005, 0.4, 0.008,
           0.004, 0.05, 0.4, 0.004, 0.004, 0.004, 0.003, 0.56, 0.87),
  COL3 = c(0.5, 0.005, 0.005, 0.005, 0.5, 0.006, 0.006, 0.002,
           0.005, 0.6, 0.6, 0.005, 0.005, 0.005, 0.002, 0.76, 0.767),
  COL4 = structure(
    # The last code (5L = "Sequence16") belongs to row "18" (G3 / SP5).
    c(1L, 8L, 13L, 2L, 3L, 4L, 5L, 9L, 6L, 7L, 13L, 11L, 12L, 5L, 10L,
      14L, 5L),
    .Label = c("Sequence1", "Sequence123", "Sequence14", "Sequence15",
               "Sequence16", "Sequence17", "Sequence18", "Sequence2",
               "Sequence20", "Sequence21", "Sequence22", "Sequence23",
               "Sequence3", "Sequence67"),
    class = "factor"
  )
), class = "data.frame", row.names = c(
  "1", "2", "3", "4", "5", "6", "7", "8", "10", "11", "12", "13", "14",
  "15", "16", "17", "18"
))

我们可以为每个
COL4
值计算唯一
组的数量
,如果该数量大于1,则赋值为该数量,否则赋值为0

library(dplyr)

# For every (COL4, Names) pair, count the distinct Groups it occurs in.
# COL5 is that count when the pair is shared by more than one group,
# otherwise 0.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1) n_groups else 0
  }) %>%
  # Drop the grouping so later verbs do not silently run per group.
  ungroup()


#   Groups Names  COL1  COL2  COL3 COL4         COL5
#   <chr>  <chr> <int> <dbl> <dbl> <chr>       <dbl>
# 1 G1     SP1       1 0.4   0.5   Sequence1       0
# 2 G1     SP1       1 0.004 0.005 Sequence2       0
# 3 G1     SP1       0 0.004 0.005 Sequence3       2
# 4 G1     SP2       0 0.4   0.005 Sequence123     0
# 5 G1     SP2       0 0.004 0.5   Sequence14      0
# 6 G1     SP3       0 0.005 0.006 Sequence15      0
# 7 G1     SP5       1 0.4   0.006 Sequence16      3
# 8 G1     SP6       1 0.008 0.002 Sequence20      0
# 9 G2     Sp1       0 0.004 0.005 Sequence17      0
#10 G2     SP1       0 0.05  0.6   Sequence18      0
#11 G2     SP1       0 0.4   0.6   Sequence3       2
#12 G2     SP2       0 0.004 0.005 Sequence22      0
#13 G2     SP2       0 0.004 0.005 Sequence23      0
#14 G2     SP5       0 0.004 0.005 Sequence16      3
#15 G2     SP6       0 0.003 0.002 Sequence21      0
#16 G2     SP7       0 0.56  0.76  Sequence67      0
#17 G3     SP5       0 0.87  0.767 Sequence16      3
在基础R(base R)中:

# Number of distinct Groups for each (COL4, Names) combination. `ave()`
# returns values of the same type as its input (character here), hence
# the conversion back to integer.
test_df$COL5 <- as.integer(ave(
  as.character(test_df$Groups), test_df$COL4, test_df$Names,
  FUN = function(grp) length(unique(grp))
))
# A count of 1 means the sequence is not shared between groups: report 0.
test_df$COL5 <- replace(test_df$COL5, test_df$COL5 == 1, 0)

test_df$COL5

我们可以为每个
COL4
值计算唯一
组的数量
,如果该数量大于1,则赋值为该数量,否则赋值为0

library(dplyr)

# For every (COL4, Names) pair, count the distinct Groups it occurs in.
# COL5 is that count when the pair is shared by more than one group,
# otherwise 0.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1) n_groups else 0
  }) %>%
  # Drop the grouping so later verbs do not silently run per group.
  ungroup()


#   Groups Names  COL1  COL2  COL3 COL4         COL5
#   <chr>  <chr> <int> <dbl> <dbl> <chr>       <dbl>
# 1 G1     SP1       1 0.4   0.5   Sequence1       0
# 2 G1     SP1       1 0.004 0.005 Sequence2       0
# 3 G1     SP1       0 0.004 0.005 Sequence3       2
# 4 G1     SP2       0 0.4   0.005 Sequence123     0
# 5 G1     SP2       0 0.004 0.5   Sequence14      0
# 6 G1     SP3       0 0.005 0.006 Sequence15      0
# 7 G1     SP5       1 0.4   0.006 Sequence16      3
# 8 G1     SP6       1 0.008 0.002 Sequence20      0
# 9 G2     Sp1       0 0.004 0.005 Sequence17      0
#10 G2     SP1       0 0.05  0.6   Sequence18      0
#11 G2     SP1       0 0.4   0.6   Sequence3       2
#12 G2     SP2       0 0.004 0.005 Sequence22      0
#13 G2     SP2       0 0.004 0.005 Sequence23      0
#14 G2     SP5       0 0.004 0.005 Sequence16      3
#15 G2     SP6       0 0.003 0.002 Sequence21      0
#16 G2     SP7       0 0.56  0.76  Sequence67      0
#17 G3     SP5       0 0.87  0.767 Sequence16      3
在基础R(base R)中:

# Number of distinct Groups for each (COL4, Names) combination. `ave()`
# returns values of the same type as its input (character here), hence
# the conversion back to integer.
test_df$COL5 <- as.integer(ave(
  as.character(test_df$Groups), test_df$COL4, test_df$Names,
  FUN = function(grp) length(unique(grp))
))
# A count of 1 means the sequence is not shared between groups: report 0.
test_df$COL5 <- replace(test_df$COL5, test_df$COL5 == 1, 0)
test_df$COL5

我们也可以

library(dplyr)

# Same result without an explicit if/else: the logical `n_groups > 1`
# acts as a 0/1 multiplier, so unshared pairs get 0 and shared pairs
# keep their group count.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)  # computed once per (COL4, Names) pair
    n_groups * (n_groups > 1)
  }) %>%
  # Drop the grouping so later verbs do not silently run per group.
  ungroup()

或使用
data.table

library(data.table)

# Convert test_df to a data.table by reference, then add COL5 in place:
# the number of distinct Groups per (COL4, Names) pair, or 0 when the
# pair occurs in a single group only.
setDT(test_df)
test_df[, COL5 := {
  n_groups <- uniqueN(Groups)
  n_groups * (n_groups > 1)
}, by = .(COL4, Names)]
我们也可以这样做

library(dplyr)

# Same result without an explicit if/else: the logical `n_groups > 1`
# acts as a 0/1 multiplier, so unshared pairs get 0 and shared pairs
# keep their group count.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)  # computed once per (COL4, Names) pair
    n_groups * (n_groups > 1)
  }) %>%
  # Drop the grouping so later verbs do not silently run per group.
  ungroup()

或使用
data.table

library(data.table)

# Convert test_df to a data.table by reference, then add COL5 in place:
# the number of distinct Groups per (COL4, Names) pair, or 0 when the
# pair occurs in a single group only.
setDT(test_df)
test_df[, COL5 := {
  n_groups <- uniqueN(Groups)
  n_groups * (n_groups > 1)
}, by = .(COL4, Names)]

您不考虑代码中的“名称”列吗?我的意思是Sequence21在G1和G2中都有,但在不同的物种中(SP1和SP6)哦,我明白了。我认为显示的输入和输出数据帧是不同的。例如,Sequence21在输入中只有一次,但在输出中有两次。也许,在
group_by
中添加
Names
列可以解决这个问题。事实上,我稍微改变了一下问题,现在不是只添加1或2,我想添加rep的数量(例如,最少2个),如果没有rep(我更新了表),则为零。为什么
Sequence15
2?它只存在一次。是否不考虑代码中的“名称”列?我的意思是Sequence21在G1和G2中都有,但在不同的物种中(SP1和SP6)哦,我明白了。我认为显示的输入和输出数据帧是不同的。例如,Sequence21在输入中只有一次,但在输出中有两次。也许,在
group_by
中添加
Names
列可以解决这个问题。事实上,我稍微改变了一下问题,现在不是只添加1或2,我想添加rep的数量(例如,最少2个),如果没有rep(我更新了表),则为零。为什么
Sequence15
2?它只存在一次。