R 如何在不比较每个值的情况下有条件地选择每个组的最高值?
标签:r, if-statement, data.table
我掌握的数据如下:
Group Gene Score direct_count secondary_count
1 AQP11 0.5566507 4 5
1 CLNS1A 0.2811747 0 2
1 RSF1 0.5469924 3 6
2 CFDP1 0.4186066 1 2
2 CHST6 0.4295135 1 3
3 ACE 0.634 1 1
3 NOS2 0.6345 1 1
4 Gene1 0.7 0 1
4 Gene2 0.61 1 0
4 Gene3 0.62 0 1
我按 Group 列对基因进行分组,然后根据以下条件在每组中选出最佳基因:
使用 dplyr 和 data.table 时,
您不需要任何复杂的 ifelse 条件。
dplyr 解决方案
输入
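dt %>%
group_by(Group) %>%
filter((max(Score) - Score) < 0.05) %>%
slice_max(direct_count, n = 1) %>%
slice_max(secondary_count, n = 1) %>%
ungroup()
#> # A tibble: 4 x 5
#> Group Gene Score direct_count secondary_count
#> <int> <chr> <dbl> <int> <int>
#> 1 1 AQP11 0.557 4 5
#> 2 2 CHST6 0.430 1 3
#> 3 3 ACE 0.634 1 1
#> 4 3 NOS2 0.634 1 1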
data.table 解决方案
library(data.table)
# Three-stage per-group cascade; each stage collects the surviving global row
# numbers (.I) per Group and subsets dt by them. Ties survive every stage.
# Stage 1: keep rows whose Score is within 0.05 of the group's top Score.
dt <- dt[
  dt[, .I[0.05 > (max(Score) - Score)], by = "Group"][["V1"]]
]
# Stage 2: of the remaining rows, keep those with the group's top direct_count.
dt <- dt[
  dt[, .I[max(direct_count) == direct_count], by = "Group"][["V1"]]
]
# Stage 3: of the remaining rows, keep those with the top secondary_count.
dt <- dt[
  dt[, .I[max(secondary_count) == secondary_count], by = "Group"][["V1"]]
]
dt
#> Group Gene Score direct_count secondary_count
#> 1: 1 AQP11 0.5566507 4 5
#> 2: 2 CHST6 0.4295135 1 3
#> 3: 3 ACE 0.6340000 1 1
#> 4: 3 NOS2 0.6345000 1 1
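library(data.table)
dt <- dt[dt[, .I[(max(Score) - Score) < 0.05], by = Group]$V1]
dt <- dt[dt[, .I[direct_count == max(direct_count)], by = Group]$V1]
dt <- dt[dt[, .I[secondary_count == max(secondary_count)], by = Group]$V1]
dt
#> Group Gene Score direct_count secondary_count
#> 1: 1 AQP11 0.5566507 4 5
#> 2: 2 CHST6 0.4295135 1 3
#> 3: 3 ACE 0.6340000 1 1
#> 4: 3 NOS2 0.6345000 1 1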
关于你的编辑:针对问题末尾提出的具体情况,按照你所写的规则,这两种方法都会选择 CHST6。
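dt %>%
group_by(Group) %>%
filter((max(Score) - Score) < 0.05) %>%
slice_max(direct_count, n = 1) %>%
slice_max(secondary_count, n = 1) %>%
ungroup()
#> # A tibble: 1 x 5
#> Group Gene Score direct_count secondary_count
#> <int> <chr> <dbl> <dbl> <dbl>
#> 1 2 CHST6 0.599 1 6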
########## data.table
library(data.table)
dt
# Select the best gene(s) within each Group using a three-stage cascade:
#   1. keep rows whose Score is within 0.05 of the group's top Score;
#   2. of those, keep rows with the highest direct_count;
#   3. of those, keep rows with the highest secondary_count.
# Rows that are still tied after stage 3 are all returned.
#
# Row positions are derived from the column values themselves. Taking
# which.max() of a dist() object would yield a position in the vector of
# pairwise distances, which is not a row number and can point at the wrong
# row (or past the end of the group).
df <- setDT(df)
new_df <- df[,
  {
    # Positions (within the group) of rows close enough to the top Score.
    keep <- which((max(Score) - Score) < 0.05)
    # Tie-break only among the survivors of the previous stage.
    keep <- keep[direct_count[keep] == max(direct_count[keep])]
    keep <- keep[secondary_count[keep] == max(secondary_count[keep])]
    .SD[keep]
  },
  by = Group]
# dput()-style literal of the example input: ten genes spread over four groups,
# one column per line.
structure(
  list(
    Group = rep(1:4, times = c(3L, 2L, 2L, 3L)),
    Gene = c(
      "AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6",
      "ACE", "NOS2", "Gene1", "Gene2", "Gene3"
    ),
    Score = c(
      0.5566507, 0.2811747, 0.5269924, 0.4186066, 0.4295135,
      0.634, 0.6345, 0.7, 0.62, 0.61
    ),
    direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L, 0L, 1L, 0L),
    secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L, 0L, 0L, 1L)
  ),
  row.names = c(NA, -10L),
  class = c("data.table", "data.frame")
)
Group Gene Score direct_count secondary_count
1 2 CFDP1 0.5517401 1 62
2 2 CHST6 0.5989186 1 6
3 2 RNU6-758P 0.5644914 0 1
4 2 Gene1 0.5672916 0 1
5 2 TMEM170A 0.6167083 0 2
# dput()-style literal of the five-row, single-group example (all Group 2),
# one column per line. The count columns are doubles here, not integers.
structure(
  list(
    Group = rep(2L, 5L),
    Gene = c("CFDP1", "CHST6", "RNU6-758P", "Gene1", "TMEM170A"),
    Score = c(
      0.551740109920502,
      0.598918557167053,
      0.564491391181946,
      0.567291617393494,
      0.616708278656006
    ),
    direct_count = c(1, 1, 0, 0, 0),
    secondary_count = c(62, 6, 1, 1, 2)
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame")
)
# Example input used by the solutions: seven genes spread over three groups,
# written as a dput()-style literal with one column per line.
dt <- structure(
  list(
    Group = rep(1:3, times = c(3L, 2L, 2L)),
    Gene = c("AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6", "ACE", "NOS2"),
    Score = c(
      0.5566507, 0.2811747, 0.5269924, 0.4186066, 0.4295135, 0.634, 0.6345
    ),
    direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L),
    secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L)
  ),
  row.names = c(NA, -7L),
  class = c("data.table", "data.frame")
)
library(dplyr)
# Same cascade as a sequence of named steps; local() keeps the intermediates
# out of the workspace while its value is still printed at top level.
local({
  by_group <- group_by(dt, Group)
  # Keep rows whose Score is within 0.05 of the group's top Score.
  near_top <- filter(by_group, (max(Score) - Score) < 0.05)
  # slice_max() keeps ties, so equal rows survive both tie-break stages.
  top_direct <- slice_max(near_top, direct_count, n = 1)
  top_secondary <- slice_max(top_direct, secondary_count, n = 1)
  ungroup(top_secondary)
})
#> # A tibble: 4 x 5
#> Group Gene Score direct_count secondary_count
#> <int> <chr> <dbl> <int> <int>
#> 1 1 AQP11 0.557 4 5
#> 2 2 CHST6 0.430 1 3
#> 3 3 ACE 0.634 1 1
#> 4 3 NOS2 0.634 1 1
library(data.table)
# Run the three filtering stages on named intermediates inside local() and
# assign only the final table back to dt. Each stage subsets by the global row
# numbers (.I) that satisfy the per-Group condition, so ties are preserved.
dt <- local({
  # Rows whose Score is within 0.05 of the group's top Score.
  near_top <- dt[
    dt[, .I[(max(Score) - Score) < 0.05], by = Group]$V1
  ]
  # Of those, rows carrying the group's highest direct_count.
  top_direct <- near_top[
    near_top[, .I[direct_count == max(direct_count)], by = Group]$V1
  ]
  # Of those, rows carrying the group's highest secondary_count.
  top_direct[
    top_direct[, .I[secondary_count == max(secondary_count)], by = Group]$V1
  ]
})
dt
#> Group Gene Score direct_count secondary_count
#> 1: 1 AQP11 0.5566507 4 5
#> 2: 2 CHST6 0.4295135 1 3
#> 3: 3 ACE 0.6340000 1 1
#> 4: 3 NOS2 0.6345000 1 1