Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/76.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
R 如何在不比较每个值的情况下有条件地选择每个组的最高值?_R_If Statement_Data.table - Fatal编程技术网

R 如何在不比较每个值的情况下有条件地选择每个组的最高值?

R 如何在不比较每个值的情况下有条件地选择每个组的最高值?,r,if-statement,data.table,R,If Statement,Data.table,我掌握的数据如下: Group Gene Score direct_count secondary_count 1 AQP11 0.5566507 4 5 1 CLNS1A 0.2811747 0 2 1 RSF1 0.5469924 3 6 2 CFDP1 0.4186066

我掌握的数据如下:

  Group Gene      Score     direct_count   secondary_count 
    1   AQP11    0.5566507       4               5
    1   CLNS1A   0.2811747       0               2
    1   RSF1     0.5469924       3               6
    2   CFDP1    0.4186066       1               2
    2   CHST6    0.4295135       1               3
    3   ACE      0.634           1               1
    3   NOS2     0.6345          1               1
    4   Gene1    0.7             0               1
    4   Gene2    0.61            1               0
    4   Gene3    0.62            0               1          
我通过 Group 列对基因进行分组,然后根据条件选择每组最佳基因:

  • 如果得分最高的基因与组中所有其他基因之间的得分差异大于0.05,则选择得分最高的基因


  • 如果一组中顶端基因和任何其他基因之间的得分差异小于0.05,则在这些基因中选择 direct_count 最高的基因;如果 direct_count 相同,则选择 secondary_count 最高的基因

    可以使用两种不同的解决方案实现最终目标:使用 dplyr 或 data.table。

    您不需要任何复杂的 ifelse 条件。

    解决方案 输入

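    library(dplyr)

    dt %>%
      group_by(Group) %>%
      filter((max(Score) - Score) < 0.05) %>%
      slice_max(direct_count, n = 1) %>%
      slice_max(secondary_count, n = 1) %>%
      ungroup()
    #> # A tibble: 4 x 5
    #>   Group Gene  Score direct_count secondary_count
    #>   <int> <chr> <dbl>        <int>           <int>
    #> 1     1 AQP11 0.557            4               5
    #> 2     2 CHST6 0.430            1               3
    #> 3     3 ACE   0.634            1               1
    #> 4     3 NOS2  0.634            1               1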
    
    数据表

    library(data.table)
    
    # Narrow `dt` in three passes. Each pass computes, per Group, the row
    # numbers (.I) to keep and then subsets `dt` by those row numbers.

    # Pass 1: rows whose Score is within 0.05 of their group's top Score.
    dt <- dt[
      dt[, .I[(max(Score) - Score) < 0.05], by = Group][["V1"]]
    ]

    # Pass 2: among the remaining rows, those with the group's highest
    # direct_count.
    dt <- dt[
      dt[, .I[direct_count == max(direct_count)], by = Group][["V1"]]
    ]

    # Pass 3: among the remaining rows, those with the group's highest
    # secondary_count.
    dt <- dt[
      dt[, .I[secondary_count == max(secondary_count)], by = Group][["V1"]]
    ]

    dt
    #>    Group  Gene     Score direct_count secondary_count
    #> 1:     1 AQP11 0.5566507            4               5
    #> 2:     2 CHST6 0.4295135            1               3
    #> 3:     3   ACE 0.6340000            1               1
    #> 4:     3  NOS2 0.6345000            1               1
    
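    library(data.table)

    dt <- dt[dt[, .I[(max(Score) - Score) < 0.05], by = Group]$V1]
    dt <- dt[dt[, .I[direct_count == max(direct_count)], by = Group]$V1]
    dt <- dt[dt[, .I[secondary_count == max(secondary_count)], by = Group]$V1]
    dt
    #>    Group  Gene     Score direct_count secondary_count
    #> 1:     1 AQP11 0.5566507            4               5
    #> 2:     2 CHST6 0.4295135            1               3
    #> 3:     3   ACE 0.6340000            1               1
    #> 4:     3  NOS2 0.6345000            1               1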
    

    你的编辑 与问题末尾的具体问题相关:这两种方法根据您编写的规则选择CHST6

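    library(dplyr)

    dt %>%
      group_by(Group) %>%
      filter((max(Score) - Score) < 0.05) %>%
      slice_max(direct_count, n = 1) %>%
      slice_max(secondary_count, n = 1) %>%
      ungroup()
    #> # A tibble: 1 x 5
    #>   Group Gene  Score direct_count secondary_count
    #>   <int> <chr> <dbl>        <dbl>           <dbl>
    #> 1     2 CHST6 0.599            1               6

    ########## data.table
    library(data.table)

    dt <- dt[dt[, .I[(max(Score) - Score) < 0.05], by = Group]$V1]
    dt <- dt[dt[, .I[direct_count == max(direct_count)], by = Group]$V1]
    dt <- dt[dt[, .I[secondary_count == max(secondary_count)], by = Group]$V1]
    dt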
    
    # Select the best gene(s) within each Group:
    #   1. keep the genes whose Score is within 0.05 of the group's top Score
    #      (if the top gene leads every other gene by 0.05 or more, it is the
    #      only one kept);
    #   2. among those, keep the genes with the highest direct_count;
    #   3. among those, keep the genes with the highest secondary_count.
    # Genes still tied after step 3 are all returned.
    #
    # The previous version took which.max() of the pairwise dist() vector and
    # used it as a row index. That vector has one entry per PAIR of genes, so
    # the index did not identify the top-scoring row and could point past the
    # end of the group. Its tie-breakers also looked at every gene in the
    # group, including ones already ruled out by Score.
    df <- setDT(df)
    new_df <- df[,
      {
        # Step 1: candidates close enough to the top score. The top-scoring
        # row always satisfies this, so `best` is never all FALSE.
        best <- (max(Score) - Score) < 0.05
        # Step 2: highest direct_count among the current candidates only.
        best <- best & direct_count == max(direct_count[best])
        # Step 3: highest secondary_count among the current candidates only.
        best <- best & secondary_count == max(secondary_count[best])
        .SD[best]
      },
      by = Group]
    
    
    # Example input (dput form): ten genes in four groups, one column per line.
    structure(
      list(
        Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L),
        Gene = c(
          "AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6", "ACE", "NOS2",
          "Gene1", "Gene2", "Gene3"
        ),
        Score = c(
          0.5566507, 0.2811747, 0.5269924, 0.4186066, 0.4295135,
          0.634, 0.6345, 0.7, 0.62, 0.61
        ),
        direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L, 0L, 1L, 0L),
        secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L, 0L, 0L, 1L)
      ),
      row.names = c(NA, -10L),
      class = c("data.table", "data.frame")
    )
    
      Group Gene         Score      direct_count     secondary_count
    1   2    CFDP1        0.5517401        1                  62
    2   2    CHST6        0.5989186        1                   6
    3   2    RNU6-758P    0.5644914        0                   1
    4   2    Gene1        0.5672916        0                   1
    5   2    TMEM170A     0.6167083        0                   2
    
    # Example input (dput form): five genes, all in group 2, one column per
    # line. The count columns are doubles here, not integers.
    structure(
      list(
        Group = c(2L, 2L, 2L, 2L, 2L),
        Gene = c("CFDP1", "CHST6", "RNU6-758P", "Gene1", "TMEM170A"),
        Score = c(
          0.551740109920502,
          0.598918557167053,
          0.564491391181946,
          0.567291617393494,
          0.616708278656006
        ),
        direct_count = c(1, 1, 0, 0, 0),
        secondary_count = c(62, 6, 1, 1, 2)
      ),
      row.names = c(NA, -5L),
      class = c("data.table", "data.frame")
    )
    
    # Example input: seven genes in three groups, stored as a list of columns
    # carrying the data.table / data.frame class attributes.
    dt <- structure(
      list(
        Group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L),
        Gene = c("AQP11", "CLNS1A", "RSF1", "CFDP1", "CHST6", "ACE", "NOS2"),
        Score = c(
          0.5566507, 0.2811747, 0.5269924, 0.4186066, 0.4295135, 0.634, 0.6345
        ),
        direct_count = c(4L, 0L, 3L, 1L, 1L, 1L, 1L),
        secondary_count = c(5L, 2L, 6L, 2L, 3L, 1L, 1L)
      ),
      row.names = c(NA, -7L),
      class = c("data.table", "data.frame")
    )
    
    library(dplyr)
    
    # Per Group: keep the genes within 0.05 of the top Score, then keep the
    # highest direct_count, then the highest secondary_count. Ties are kept
    # at each step, so genes still tied at the end are all returned.
    dt %>%
      group_by(Group) %>%
      filter(max(Score) - Score < 0.05) %>%
      slice_max(order_by = direct_count, n = 1, with_ties = TRUE) %>%
      slice_max(order_by = secondary_count, n = 1, with_ties = TRUE) %>%
      ungroup()
    #> # A tibble: 4 x 5
    #>   Group Gene  Score direct_count secondary_count
    #>   <int> <chr> <dbl>        <int>           <int>
    #> 1     1 AQP11 0.557            4               5
    #> 2     2 CHST6 0.430            1               3
    #> 3     3 ACE   0.634            1               1
    #> 4     3 NOS2  0.634            1               1
    
    library(data.table)
    
    # Three successive filters on `dt`. For each one, the inner call returns
    # the per-Group row numbers (.I) that pass, and the outer call keeps
    # exactly those rows.

    # Filter 1: Score within 0.05 of the group's maximum Score.
    dt <- dt[
      dt[, .I[(max(Score) - Score) < 0.05], by = Group][["V1"]]
    ]

    # Filter 2: direct_count equal to the maximum among the rows still present
    # in the group.
    dt <- dt[
      dt[, .I[direct_count == max(direct_count)], by = Group][["V1"]]
    ]

    # Filter 3: secondary_count equal to the maximum among the rows still
    # present in the group.
    dt <- dt[
      dt[, .I[secondary_count == max(secondary_count)], by = Group][["V1"]]
    ]

    dt
    #>    Group  Gene     Score direct_count secondary_count
    #> 1:     1 AQP11 0.5566507            4               5
    #> 2:     2 CHST6 0.4295135            1               3
    #> 3:     3   ACE 0.6340000            1               1
    #> 4:     3  NOS2 0.6345000            1               1