根据R中的条件添加元素rep的列和nb
我有一个数据帧根据R中的条件添加元素rep的列和nb,r,dataframe,dplyr,R,Dataframe,Dplyr,我有一个数据帧 Groups Names COL1 COL2 COL3 COL4 1 G1 SP1 1 0.400 0.500 Sequence1 2 G1 SP1 1 0.004 0.005 Sequence2 3 G1 SP1 0 0.004 0.005 Sequence3 4 G1 SP2 0 0.400 0.005 Sequence123 5 G1 SP2
Groups Names COL1 COL2 COL3 COL4
1 G1 SP1 1 0.400 0.500 Sequence1
2 G1 SP1 1 0.004 0.005 Sequence2
3 G1 SP1 0 0.004 0.005 Sequence3
4 G1 SP2 0 0.400 0.005 Sequence123
5 G1 SP2 0 0.004 0.500 Sequence14
6 G1 SP3 0 0.005 0.006 Sequence15
7 G1 SP5 1 0.400 0.006 Sequence16
8 G1 SP6 1 0.008 0.002 Sequence20
10 G2 Sp1 0 0.004 0.005 Sequence17
11 G2 SP1 0 0.050 0.600 Sequence18
12 G2 SP1 0 0.400 0.600 Sequence3
13 G2 SP2 0 0.004 0.005 Sequence22
14 G2 SP2 0 0.004 0.005 Sequence23
15 G2 SP5 0 0.004 0.005 Sequence16
16 G2 SP6 0 0.003 0.002 Sequence21
17 G2 SP7 0 0.560 0.760 Sequence67
18 G3 SP5 0 0.87 0.767 Sequence16
我想添加一个新列COL5
其中,如果某个名称(Names)的序列(COL4)在多个组(Groups)之间共享,就填入共享该序列的组的数量(rep数),否则填0。
例如,让我们看看G1
SP1
具有Sequence3
,该序列出现在G1
和G2
中,因此我为第3行和第12行设置了rep的数量。这里(2)
与SP5
相同,它在G1
中有Sequence16
,在G2
和G3
中也有Sequence16
,(此处rep的数量为3)
因此,对于我应该得到的所有数据帧:
Groups Names COL1 COL2 COL3 COL4 COL5
1 G1 SP1 1 0.400 0.500 Sequence1 0
2 G1 SP1 1 0.004 0.005 Sequence2 0
3 G1 SP1 0 0.004 0.005 Sequence3 2
4 G1 SP2 0 0.400 0.005 Sequence123 0
5 G1 SP2 0 0.004 0.500 Sequence14 0
6 G1 SP3 0 0.005 0.006 Sequence15 0
7 G1 SP5 1 0.400 0.006 Sequence16 3
8 G1 SP6 1 0.008 0.002 Sequence20 0
10 G2 Sp1 0 0.004 0.005 Sequence17 0
11 G2 SP1 0 0.050 0.600 Sequence18 0
12 G2 SP1 0 0.400 0.600 Sequence3 2
13 G2 SP2 0 0.004 0.005 Sequence22 0
14 G2 SP2 0 0.004 0.005 Sequence23 0
15 G2 SP5 0 0.004 0.005 Sequence16 3
16 G2 SP6 0 0.003 0.002 Sequence21 0
17 G2 SP7 0 0.560 0.760 Sequence67 0
18 G3 SP5 0 0.87 0.767 Sequence16 3
以下是dput
:
# Reproducible input data, i.e. the output of dput(test_df).
# Every column holds 17 values (one per row name) and every factor has
# unique levels, so the object is a well-formed data.frame.
test_df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L), .Label = c("G1", "G2", "G3"), class = "factor"),
Names = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 5L, 6L, 1L, 2L,
2L, 3L, 3L, 5L, 6L, 7L, 5L), .Label = c("Sp1", "SP1", "SP2",
"SP3", "SP5", "SP6", "SP7"), class = "factor"), COL1 = c(1L,
1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), COL2 = c(0.4, 0.004, 0.004, 0.4, 0.004, 0.005, 0.4, 0.008,
0.004, 0.05, 0.4, 0.004, 0.004, 0.004, 0.003, 0.56, 0.87), COL3 = c(0.5,
0.005, 0.005, 0.005, 0.5, 0.006, 0.006, 0.002, 0.005, 0.6,
0.6, 0.005, 0.005, 0.005, 0.002, 0.76, 0.767), COL4 = structure(c(1L,
8L, 13L, 2L, 3L, 4L, 5L, 9L, 6L, 7L, 13L, 11L, 12L, 5L, 10L,
14L, 5L), .Label = c("Sequence1", "Sequence123", "Sequence14",
"Sequence15", "Sequence16", "Sequence17", "Sequence18", "Sequence2",
"Sequence20", "Sequence21", "Sequence22", "Sequence23", "Sequence3",
"Sequence67"), class = "factor")), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "10", "11", "12", "13", "14",
"15", "16", "17", "18"))
我们可以为每个COL4
和Names组合计算唯一组(Groups)的数量
,如果该数量大于1,则赋值为该数量,否则赋值为0
library(dplyr)
# COL5: for each (COL4, Names) combination, the number of distinct Groups it
# occurs in when that number exceeds 1, otherwise 0.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1) n_groups else 0
  }) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
# Groups Names COL1 COL2 COL3 COL4 COL5
# <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
# 1 G1 SP1 1 0.4 0.5 Sequence1 0
# 2 G1 SP1 1 0.004 0.005 Sequence2 0
# 3 G1 SP1 0 0.004 0.005 Sequence3 2
# 4 G1 SP2 0 0.4 0.005 Sequence123 0
# 5 G1 SP2 0 0.004 0.5 Sequence14 0
# 6 G1 SP3 0 0.005 0.006 Sequence15 0
# 7 G1 SP5 1 0.4 0.006 Sequence16 3
# 8 G1 SP6 1 0.008 0.002 Sequence20 0
# 9 G2 Sp1 0 0.004 0.005 Sequence17 0
#10 G2 SP1 0 0.05 0.6 Sequence18 0
#11 G2 SP1 0 0.4 0.6 Sequence3 2
#12 G2 SP2 0 0.004 0.005 Sequence22 0
#13 G2 SP2 0 0.004 0.005 Sequence23 0
#14 G2 SP5 0 0.004 0.005 Sequence16 3
#15 G2 SP6 0 0.003 0.002 Sequence21 0
#16 G2 SP7 0 0.56 0.76 Sequence67 0
#17 G3 SP5 0 0.87 0.767 Sequence16 3
在base R中:
# COL5: number of distinct Groups per (COL4, Names) combination, with a count
# of 1 (sequence not shared between groups) reported as 0.
test_df$COL5 <- local({
  # ave() broadcasts the per-combination count back onto every row; because
  # its input is character, the counts come back as strings.
  groups_per_seq <- ave(
    as.character(test_df$Groups), test_df$COL4, test_df$Names,
    FUN = function(grp) length(unique(grp))
  )
  counts <- as.integer(groups_per_seq)
  # Assigning the double 0 also turns the integer vector into a double one.
  counts[counts == 1] <- 0
  counts
})
test_df$COL5
我们可以为每个COL4
和Names组合计算唯一组(Groups)的数量
,如果该数量大于1,则赋值为该数量,否则赋值为0
library(dplyr)
# COL5: for each (COL4, Names) combination, the number of distinct Groups it
# occurs in when that number exceeds 1, otherwise 0.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(COL5 = {
    n_groups <- n_distinct(Groups)
    if (n_groups > 1) n_groups else 0
  }) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
# Groups Names COL1 COL2 COL3 COL4 COL5
# <chr> <chr> <int> <dbl> <dbl> <chr> <dbl>
# 1 G1 SP1 1 0.4 0.5 Sequence1 0
# 2 G1 SP1 1 0.004 0.005 Sequence2 0
# 3 G1 SP1 0 0.004 0.005 Sequence3 2
# 4 G1 SP2 0 0.4 0.005 Sequence123 0
# 5 G1 SP2 0 0.004 0.5 Sequence14 0
# 6 G1 SP3 0 0.005 0.006 Sequence15 0
# 7 G1 SP5 1 0.4 0.006 Sequence16 3
# 8 G1 SP6 1 0.008 0.002 Sequence20 0
# 9 G2 Sp1 0 0.004 0.005 Sequence17 0
#10 G2 SP1 0 0.05 0.6 Sequence18 0
#11 G2 SP1 0 0.4 0.6 Sequence3 2
#12 G2 SP2 0 0.004 0.005 Sequence22 0
#13 G2 SP2 0 0.004 0.005 Sequence23 0
#14 G2 SP5 0 0.004 0.005 Sequence16 3
#15 G2 SP6 0 0.003 0.002 Sequence21 0
#16 G2 SP7 0 0.56 0.76 Sequence67 0
#17 G3 SP5 0 0.87 0.767 Sequence16 3
在base R中:
# COL5: number of distinct Groups per (COL4, Names) combination, with a count
# of 1 (sequence not shared between groups) reported as 0.
test_df$COL5 <- local({
  # ave() broadcasts the per-combination count back onto every row; because
  # its input is character, the counts come back as strings.
  groups_per_seq <- ave(
    as.character(test_df$Groups), test_df$COL4, test_df$Names,
    FUN = function(grp) length(unique(grp))
  )
  counts <- as.integer(groups_per_seq)
  # Assigning the double 0 also turns the integer vector into a double one.
  counts[counts == 1] <- 0
  counts
})
test_df$COL5
我们也可以
library(dplyr)
# COL5: distinct Groups per (COL4, Names) combination, zeroed when the
# sequence occurs in only one group.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(
    # Count once, then multiply by the logical mask (FALSE acts as 0).
    COL5 = n_distinct(Groups),
    COL5 = COL5 * (COL5 > 1)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
或使用data.table
library(data.table)
# Convert test_df to a data.table by reference and add COL5 in place:
# distinct Groups per (COL4, Names), zeroed when there is only one group.
setDT(test_df)[
  ,
  COL5 := {
    n_groups <- uniqueN(Groups)
    n_groups * (n_groups > 1)
  },
  by = .(COL4, Names)
]
我们也可以这样做
library(dplyr)
# COL5: distinct Groups per (COL4, Names) combination, zeroed when the
# sequence occurs in only one group.
test_df %>%
  group_by(COL4, Names) %>%
  mutate(
    # Count once, then multiply by the logical mask (FALSE acts as 0).
    COL5 = n_distinct(Groups),
    COL5 = COL5 * (COL5 > 1)
  ) %>%
  # Drop the grouping so later verbs do not silently operate per group.
  ungroup()
或使用data.table
library(data.table)
# Convert test_df to a data.table by reference and add COL5 in place:
# distinct Groups per (COL4, Names), zeroed when there is only one group.
setDT(test_df)[
  ,
  COL5 := {
    n_groups <- uniqueN(Groups)
    n_groups * (n_groups > 1)
  },
  by = .(COL4, Names)
]
您不考虑代码中的“名称”列吗?我的意思是Sequence21在G1和G2中都有,但在不同的物种中(SP1和SP6)哦,我明白了。我认为显示的输入和输出数据帧是不同的。例如,Sequence21在输入中只有一次,但在输出中有两次。也许,在groupby
中添加Names
列可以解决这个问题。事实上,我稍微改变了一下问题,现在不是只添加1或2,我想添加rep的数量(例如,最少2个),如果没有rep(我更新了表),则为零。为什么Sequence15
2?它只存在一次。是否不考虑代码中的“名称”列?我的意思是Sequence21在G1和G2中都有,但在不同的物种中(SP1和SP6)哦,我明白了。我认为显示的输入和输出数据帧是不同的。例如,Sequence21在输入中只有一次,但在输出中有两次。也许,在groupby
中添加Names
列可以解决这个问题。事实上,我稍微改变了一下问题,现在不是只添加1或2,我想添加rep的数量(例如,最少2个),如果没有rep(我更新了表),则为零。为什么Sequence15
2?它只存在一次。