按多个列名对dataframe进行子集,并返回前n个命中

按多个列名对dataframe进行子集,并返回前n个命中,r,R,这是我的玩具数据框,真实的可能有40K-1M记录和五个附加列 animal1 version1 animal2 version2 sim 53 20154620 TRUSEQ.v1 20104647 F250v1 0.3663569 854 20145687 TRUSEQ.v1 20105551 F250v1 0.5732854 3662 20154620 TR

这是我的玩具数据框,真实的可能有40K-1M记录和五个附加列

        animal1     version1    animal2     version2    sim             
53      20154620    TRUSEQ.v1   20104647    F250v1  0.3663569
854     20145687    TRUSEQ.v1   20105551    F250v1  0.5732854
3662    20154620    TRUSEQ.v1   20114509    F250v1  0.3374918
4063    20154620    TRUSEQ.v1   20114578    F250v1  0.3732692
4464    20154620    TRUSEQ.v1   20114595    F250v1  0.3772367
5262    20144516    TRUSEQ.v1   20115051    770k.v1 0.6034206
5663    20144516    TRUSEQ.v1   20115051    F250v1  0.6164795
5664    20145008    TRUSEQ.v1   20115051    F250v1  0.3146651
6064    20144516    TRUSEQ.v1   20115059    F250v1  0.3043295
6471    20165119    F250v1      20115096    F250v1  0.388435
9677    20154620    TRUSEQ.v1   20118095    F250v1  0.3079702
11281   20154620    TRUSEQ.v1   20134529    F250v1  0.3188631
12486   20165119    F250v1      20135032    F250v1  0.6091486
13282   20144516    TRUSEQ.v1   20135047    F250v1  0.3098507
14090   20165119    F250v1      20135072    F250v1  0.3025007
14892   20165119    F250v1      20135122    F250v1  0.345238
对于每个 animal1,我需要按最高的 sim 值选出前 3 个唯一的 animal2 值所对应的所有行。下面是我想要的结果:

        animal1     version1    animal2     version2    sim
5663    20144516    TRUSEQ.v1   20115051    F250v1  0.6164795
5262    20144516    TRUSEQ.v1   20115051    770k.v1 0.6034206
13282   20144516    TRUSEQ.v1   20135047    F250v1  0.3098507
6064    20144516    TRUSEQ.v1   20115059    F250v1  0.3043295
5664    20145008    TRUSEQ.v1   20115051    F250v1  0.3146651
854     20145687    TRUSEQ.v1   20105551    F250v1  0.5732854
4464    20154620    TRUSEQ.v1   20114595    F250v1  0.3772367
4063    20154620    TRUSEQ.v1   20114578    F250v1  0.3732692
53      20154620    TRUSEQ.v1   20104647    F250v1  0.3663569
12486   20165119    F250v1      20135032    F250v1  0.6091486
6471    20165119    F250v1      20115096    F250v1  0.388435
14892   20165119    F250v1      20135122    F250v1  0.345238

因此,在子集中,每个 animal1 可能有 1 到 20 个观察值,但将有……

尽管我对动物“20144516”的预期结果有疑问,这里有几个解决方案,使用 `dat` 作为上面的样本数据(包含在底部,用于再现性)。我提供的是 base R 和 `dplyr` 的方法,不过正如 @Balter 的评论所建议的那样,可能也有一个直接的 `data.table` 方法。

Base R
(不幸的是,行名在结果中被改动了。如果它们有意义,我建议您先将 `rownames(dat)` 存入一列并保留在那里。)

dplyr
您也可以使用
dplyr

library(dplyr)
# For each animal1, keep the three animal2 partners with the highest sim,
# counting every animal2 only once (through its best-scoring row).
dat %>%
  # Best row per (animal1, animal2) pair, so a repeated partner counts once.
  group_by(animal1, animal2) %>%
  top_n(1, wt = sim) %>%
  # Re-group by animal1 alone and keep its three highest-sim partners.
  # top_n() keeps ties, so a group may return more than three rows.
  group_by(animal1) %>%
  top_n(3, wt = sim) %>%
  ungroup() %>%
  # top_n() only filters and leaves rows in their input order; sort
  # explicitly so the result is listed from highest to lowest sim.
  arrange(desc(sim))
# # A tibble: 11 × 5
#     animal1  version1  animal2 version2       sim
#       <int>    <fctr>    <int>   <fctr>     <dbl>
# 1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 2  20165119    F250v1 20135032   F250v1 0.6091486
# 3  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 4  20165119    F250v1 20115096   F250v1 0.3884350
# 5  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 6  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 8  20165119    F250v1 20135122   F250v1 0.3452380
# 9  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 10 20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 11 20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
这种差异随着不同动物数量的增加而增加。(这是因为 `Reduce` 方法会为每个不同的动物调用一次 `rbind`,而 `do.call` 总共只调用一次 `rbind`。)

此处使用的示例数据:

# Reproducible sample data: 16 pairwise comparisons, one row per
# (animal1, animal2, version2) record with its similarity score `sim`.
# The original (non-consecutive) row names are preserved.
dat <- data.frame(
  animal1 = c(
    20154620L, 20145687L, 20154620L, 20154620L, 20154620L, 20144516L,
    20144516L, 20145008L, 20144516L, 20165119L, 20154620L, 20154620L,
    20165119L, 20144516L, 20165119L, 20165119L
  ),
  # Factor given as integer codes mapped onto its two labels.
  version1 = factor(
    c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L),
    levels = 1:2,
    labels = c("F250v1", "TRUSEQ.v1")
  ),
  animal2 = c(
    20104647L, 20105551L, 20114509L, 20114578L, 20114595L, 20115051L,
    20115051L, 20115051L, 20115059L, 20115096L, 20118095L, 20134529L,
    20135032L, 20135047L, 20135072L, 20135122L
  ),
  version2 = factor(
    c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
    levels = 1:2,
    labels = c("770k.v1", "F250v1")
  ),
  sim = c(
    0.3663569, 0.5732854, 0.3374918, 0.3732692, 0.3772367, 0.6034206,
    0.6164795, 0.3146651, 0.3043295, 0.388435, 0.3079702, 0.3188631,
    0.6091486, 0.3098507, 0.3025007, 0.345238
  ),
  row.names = c(
    "53", "854", "3662", "4063", "4464", "5262", "5663", "5664", "6064",
    "6471", "9677", "11281", "12486", "13282", "14090", "14892"
  )
)

dat

以下代码会保留同一 `animal1`-`animal2` 组合的多条记录,前提是其中较低的那条记录的 `sim` 值也“能进入前 3 名”。如果我误解了,请告诉我。

library(dplyr)
# Step 1: for every animal1, find the sim cutoff implied by its three best
# distinct animal2 partners.
selected <- dat %>%
  arrange(animal1, animal2, desc(sim)) %>%
  group_by(animal1, animal2) %>%
  # After the sort, the first row of each pair is that pair's highest sim.
  slice(1) %>%
  group_by(animal1) %>%
  top_n(3, sim) %>%
  summarise(threshold = min(sim))

# Step 2: keep every original row whose sim reaches its animal1's cutoff, so
# repeated animal1-animal2 pairs survive when they score above the third
# distinct partner.
out <- dat %>%
  inner_join(selected, by = "animal1") %>%
  filter(sim >= threshold) %>%
  select(-threshold) %>%
  arrange(animal1, animal2, desc(sim))
> out
    animal1  version1  animal2 version2       sim
1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
2  20144516 TRUSEQ.v1 20115051  770k.v1 0.6034206
3  20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
4  20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
5  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
6  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
8  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
9  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
10 20165119    F250v1 20115096   F250v1 0.3884350
11 20165119    F250v1 20135032   F250v1 0.6091486
12 20165119    F250v1 20135122   F250v1 0.3452380
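library(dplyr)
selected <- dat %>% 
  arrange(animal1,animal2,desc(sim)) %>%
  group_by(animal1,animal2) %>% 
  mutate(rank=row_number()) %>%
  filter(rank==1) %>% ungroup() %>%
  group_by(animal1) %>%
  top_n(3,sim) %>% 
  summarise(threshold = min(sim))

out <- dat %>% 
  inner_join(selected, by = c("animal1"="animal1")) %>%
  filter(sim>=threshold) %>%
  arrange(animal1,animal2,desc(sim)) %>%
  select(-threshold)
> out
    animal1  version1  animal2 version2       sim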
1 20144516 TRUSEQ.v1 20115051 F250v1 0.6164795
2 20144516 TRUSEQ.v1 20115051 770k.v1 0.6034206
3 20144516 TRUSEQ.v1 20115059 F250v1 0.3043295
4 20144516 TRUSEQ.v1 20135047 F250v1 0.3098507
5 20145008 TRUSEQ.v1 20115051 F250v1 0.3146651
6 20145687 TRUSEQ.v1 20105551 F250v1 0.5732854
7 20154620 TRUSEQ.v1 20104647 F250v1 0.3663569
8 20154620 TRUSEQ.v1 20114578 F250v1 0.3732692
9 20154620 TRUSEQ.v1 20114595 F250v1 0.3772367
10 20165119 F250v1 20115096 F250v1 0.3884350
11 20165119 F250v1 20135032 F250v1 0.6091486
12 20165119 F250v1 20135122 F250v1 0.3452380

我认为 data.table 软件包有这样的功能。
你说你想要“最高 sim 的前三个唯一的 animal2 值”,但是你的预期输出中 20144516 有 4 行,这是笔误还是未说明的要求?
animal1=20144516 的三个唯一的 animal2 值是 20115051、20135047 和 20115059,所以应该是对的;只是 animal2=20115051 有两条记录。
据我所知,该解决方案先为每个 animal1 计算一个 sim 阈值(即各 animal1-animal2 组合的最高 sim 中,排在前 n=3 位的最小值),再按该阈值筛选。是这样吗?效果非常好!
是的,没错。如果冗余的组合恰好高于第三个唯一值,那么这似乎符合您想要包含冗余组合的意图。希望能有帮助。
# Keep, for each animal1, the three rows with the highest sim.
# head() just returns the first rows of a group in their existing order, so
# each group is sorted by descending sim before the first three are taken;
# without the sort the result would be whichever three rows come first in dat.
# by() splits dat per animal1 and do.call() stacks all pieces with a single
# rbind() call.
dat3 <- do.call(
  rbind,
  by(
    dat,
    list(dat$animal1),
    function(grp) head(grp[order(grp$sim, decreasing = TRUE), ], n = 3)
  )
)
dat3
#                 animal1  version1  animal2 version2       sim
# 20144516.5663  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 20144516.5262  20144516 TRUSEQ.v1 20115051  770k.v1 0.6034206
# 20144516.13282 20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 20145008       20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 20145687       20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 20154620.4464  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 20154620.4063  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 20154620.53    20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 20165119.12486 20165119    F250v1 20135032   F250v1 0.6091486
# 20165119.6471  20165119    F250v1 20115096   F250v1 0.3884350
# 20165119.14892 20165119    F250v1 20135122   F250v1 0.3452380
library(dplyr)
# For each animal1, keep the three animal2 partners with the highest sim,
# counting every animal2 only once (through its best-scoring row).
dat %>%
  # Best row per (animal1, animal2) pair, so a repeated partner counts once.
  group_by(animal1, animal2) %>%
  top_n(1, wt = sim) %>%
  # Re-group by animal1 alone and keep its three highest-sim partners.
  # top_n() keeps ties, so a group may return more than three rows.
  group_by(animal1) %>%
  top_n(3, wt = sim) %>%
  ungroup() %>%
  # top_n() only filters and leaves rows in their input order; sort
  # explicitly so the result is listed from highest to lowest sim.
  arrange(desc(sim))
# # A tibble: 11 × 5
#     animal1  version1  animal2 version2       sim
#       <int>    <fctr>    <int>   <fctr>     <dbl>
# 1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 2  20165119    F250v1 20135032   F250v1 0.6091486
# 3  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 4  20165119    F250v1 20115096   F250v1 0.3884350
# 5  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 6  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 8  20165119    F250v1 20135122   F250v1 0.3452380
# 9  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 10 20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 11 20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
library(microbenchmark)
# Split dat into one single-row piece per (animal1, animal2) pair, then time
# the two ways of stacking those pieces back into one data frame.
x <- by(
  data = dat,
  INDICES = list(dat$animal1, dat$animal2),
  FUN = head,
  n = 1
)
microbenchmark(
  # One rbind() call that receives every piece at once.
  docall = do.call(what = rbind, args = x),
  # Pairwise fold: rbind() is called again for each additional piece.
  reduce = Reduce(f = rbind, x = x)
)
# Unit: milliseconds
#    expr       min        lq      mean    median        uq       max neval
#  docall  1.418577  1.493335  1.809469  1.551136  1.731466  5.216277   100
#  reduce 11.119961 11.829614 13.302388 12.727255 13.401535 26.897520   100
# Reproducible sample data: 16 pairwise comparisons, one row per
# (animal1, animal2, version2) record with its similarity score `sim`.
# The original (non-consecutive) row names are preserved.
dat <- data.frame(
  animal1 = c(
    20154620L, 20145687L, 20154620L, 20154620L, 20154620L, 20144516L,
    20144516L, 20145008L, 20144516L, 20165119L, 20154620L, 20154620L,
    20165119L, 20144516L, 20165119L, 20165119L
  ),
  # Factor given as integer codes mapped onto its two labels.
  version1 = factor(
    c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L),
    levels = 1:2,
    labels = c("F250v1", "TRUSEQ.v1")
  ),
  animal2 = c(
    20104647L, 20105551L, 20114509L, 20114578L, 20114595L, 20115051L,
    20115051L, 20115051L, 20115059L, 20115096L, 20118095L, 20134529L,
    20135032L, 20135047L, 20135072L, 20135122L
  ),
  version2 = factor(
    c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
    levels = 1:2,
    labels = c("770k.v1", "F250v1")
  ),
  sim = c(
    0.3663569, 0.5732854, 0.3374918, 0.3732692, 0.3772367, 0.6034206,
    0.6164795, 0.3146651, 0.3043295, 0.388435, 0.3079702, 0.3188631,
    0.6091486, 0.3098507, 0.3025007, 0.345238
  ),
  row.names = c(
    "53", "854", "3662", "4063", "4464", "5262", "5663", "5664", "6064",
    "6471", "9677", "11281", "12486", "13282", "14090", "14892"
  )
)
library(dplyr)
# Step 1: for every animal1, find the sim cutoff implied by its three best
# distinct animal2 partners.
selected <- dat %>%
  arrange(animal1, animal2, desc(sim)) %>%
  group_by(animal1, animal2) %>%
  # After the sort, the first row of each pair is that pair's highest sim.
  slice(1) %>%
  group_by(animal1) %>%
  top_n(3, sim) %>%
  summarise(threshold = min(sim))

# Step 2: keep every original row whose sim reaches its animal1's cutoff, so
# repeated animal1-animal2 pairs survive when they score above the third
# distinct partner.
out <- dat %>%
  inner_join(selected, by = "animal1") %>%
  filter(sim >= threshold) %>%
  select(-threshold) %>%
  arrange(animal1, animal2, desc(sim))
> out
    animal1  version1  animal2 version2       sim
1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
2  20144516 TRUSEQ.v1 20115051  770k.v1 0.6034206
3  20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
4  20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
5  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
6  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
8  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
9  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
10 20165119    F250v1 20115096   F250v1 0.3884350
11 20165119    F250v1 20135032   F250v1 0.6091486
12 20165119    F250v1 20135122   F250v1 0.3452380