基于两列值的计算筛选数据表的优雅方法是什么?[R]
我有一个数据表,我们叫它 lung:
> lung
variant_id transcript_id is_NL counts nrows
1: chr10_129450960_T_C_b38 chr10_129467297_129536240 0 33029 458
2: chr10_129450960_T_C_b38 chr10_129467297_129536240 1 3477 54
3: chr10_129450960_T_C_b38 chr10_129467297_129536240 2 130 3
4: chr10_129450960_T_C_b38 chr10_129536378_129563778 0 51 458
5: chr10_129450960_T_C_b38 chr10_129536378_129563778 1 8 54
---
500148: chr9_34699703_G_C_b38 chr9_34649082_34649409 1 4214 57
500149: chr9_34699703_G_C_b38 chr9_34649082_34649409 2 171 2
500150: chr9_34699703_G_C_b38 chr9_34649565_34650368 0 48713 456
500151: chr9_34699703_G_C_b38 chr9_34649565_34650368 1 4932 57
500152: chr9_34699703_G_C_b38 chr9_34649565_34650368 2 208 2
我想对它进行过滤:当 is_NL == 0 时,只保留 counts/nrows < 50 的行(50 只是任意选取的阈值);当 is_NL 为 1 或 2 时,只保留 counts/nrows > 50 的行。
到目前为止,我只想到了下面这种写法:
> lung[which(lung[is_NL == 0][,counts]/lung[is_NL == 0][,nrows] < 50),]
variant_id transcript_id is_NL counts nrows
1: chr10_129450960_T_C_b38 chr10_129467297_129536240 1 3477 54
2: chr10_129450960_T_C_b38 chr10_129536378_129563778 0 51 458
3: chr10_129450960_T_C_b38 chr10_129536378_129563778 1 8 54
4: chr10_129450960_T_C_b38 chr10_129536378_129707894 0 37918 458
5: chr10_129450960_T_C_b38 chr10_129701913_129707894 0 188 458
---
147877: chr17_45825156_G_A_b38 chr17_46148240_46152903 2 17 20
147878: chr17_45825156_G_A_b38 chr17_46152967_46156773 0 3 336
147879: chr17_45825156_G_A_b38 chr17_46152967_46169530 0 5 336
147880: chr17_45825156_G_A_b38 chr17_46152967_46169530 1 137 159
147881: chr17_45825156_G_A_b38 chr17_46156896_46170854 0 18 336
> lung[which(lung[is_NL > 0]$counts/lung[is_NL > 0]$nrows > 50),]
variant_id transcript_id is_NL counts nrows
1: chr10_129450960_T_C_b38 chr10_129467297_129536240 0 33029 458
2: chr10_129450960_T_C_b38 chr10_129536378_129563778 1 8 54
3: chr10_129450960_T_C_b38 chr10_129701913_129707894 1 24 54
4: chr10_129450960_T_C_b38 chr10_129701913_129707894 2 2 3
5: chr10_129450960_T_C_b38 chr10_129708044_129715519 2 0 3
---
50195: chr17_46025930_T_C_b38 chr17_46039885_46050532 0 14129 337
50196: chr17_46025930_T_C_b38 chr17_46050705_46066536 0 14106 337
50197: chr17_46025930_T_C_b38 chr17_46050705_46066536 1 6658 158
50198: chr17_46025930_T_C_b38 chr17_46050705_46066536 2 809 20
50199: chr17_46025930_T_C_b38 chr17_46066733_46067548 0 12842 337
> lung[which(lung[is_NL == 0][,counts]/lung[is_NL == 0][,nrows] < 50),]
variant_id transcript_id is_NL counts nrows
1: chr10_129450960_T_C_b38 chr10_129467297_129536240 1 3477 54
2: chr10_129450960_T_C_b38 chr10_129536378_129563778 0 51 458
3: chr10_129450960_T_C_b38 chr10_129536378_129563778 1 8 54
4: chr10_129450960_T_C_b38 chr10_129536378_129707894 0 37918 458
5: chr10_129450960_T_C_b38 chr10_129701913_129707894 0 188 458
---
147877: chr17_45825156_G_A_b38 chr17_46148240_46152903 2 17 20
147878: chr17_45825156_G_A_b38 chr17_46152967_46156773 0 3 336
147879: chr17_45825156_G_A_b38 chr17_46152967_46169530 0 5 336
147880: chr17_45825156_G_A_b38 chr17_46152967_46169530 1 137 159
147881: chr17_45825156_G_A_b38 chr17_46156896_46170854 0 18 336
> lung[which(lung[is_NL > 0]$counts/lung[is_NL > 0]$nrows > 50),]
variant_id transcript_id is_NL counts nrows
1: chr10_129450960_T_C_b38 chr10_129467297_129536240 0 33029 458
2: chr10_129450960_T_C_b38 chr10_129536378_129563778 1 8 54
3: chr10_129450960_T_C_b38 chr10_129701913_129707894 1 24 54
4: chr10_129450960_T_C_b38 chr10_129701913_129707894 2 2 3
5: chr10_129450960_T_C_b38 chr10_129708044_129715519 2 0 3
---
50195: chr17_46025930_T_C_b38 chr17_46039885_46050532 0 14129 337
50196: chr17_46025930_T_C_b38 chr17_46050705_46066536 0 14106 337
50197: chr17_46025930_T_C_b38 chr17_46050705_46066536 1 6658 158
50198: chr17_46025930_T_C_b38 chr17_46050705_46066536 2 809 20
50199: chr17_46025930_T_C_b38 chr17_46066733_46067548 0 12842 337
通过查看 is_NL 列可以看出,这种写法并不起作用。我当然可以先把数据拆分成两个子表,分别应用与 50 的比较过滤条件,然后再想办法把它们合并起来,但我觉得应该有一种我还不知道的更简单的方法来实现这一点。
我会使用 tidyverse 创建一个标志:
lung %>%
mutate(FLG = if_else(is_NL == 0 & counts/nrows < 50, 1
if_else(is_NL in (1,2) &counts/nrows >50, 1,0))) %>%
filter(FLG == 1)
lung %>%
mutate(FLG = if_else(is_NL == 0 & counts/nrows < 50, 1
if_else(is_NL in (1,2) &counts/nrows >50, 1,0))) %>%
filter(FLG == 1)
如果你想用base R,那么
lung[which((is_NL == 0 & counts/nrows < 50)|(is_NL in (1,2) &counts/nrows >50)),]
lung[which((is_NL==0&counts/nrows<50)|(is_NL in(1,2)&counts/nrows>50)),]
我将使用tidyverse
创建一个标志:
lung %>%
mutate(FLG = if_else(is_NL == 0 & counts/nrows < 50, 1
if_else(is_NL in (1,2) &counts/nrows >50, 1,0))) %>%
filter(FLG == 1)
lung %>%
mutate(FLG = if_else(is_NL == 0 & counts/nrows < 50, 1
if_else(is_NL in (1,2) &counts/nrows >50, 1,0))) %>%
filter(FLG == 1)
如果你想用base R,那么
lung[which((is_NL == 0 & counts/nrows < 50)|(is_NL in (1,2) &counts/nrows >50)),]
lung[which((is_NL==0&counts/nrows<50)|(is_NL in(1,2)&counts/nrows>50)),]
在base R中,您可以执行以下操作:
lung[with(lung, (is_NL == 0 & counts/nrows < 50) |
(is_NL %in% c(1,2) & counts/nrows > 50)),]
# output
variant_id transcript_id is_NL counts nrows
2 chr10_129450960_T_C_b38 chr10_129467297_129536240 1 3477 54
4 chr10_129450960_T_C_b38 chr10_129536378_129563778 0 51 458
在base R中,您可以执行以下操作:
lung[with(lung, (is_NL == 0 & counts/nrows < 50) |
(is_NL %in% c(1,2) & counts/nrows > 50)),]
# output
variant_id transcript_id is_NL counts nrows
2 chr10_129450960_T_C_b38 chr10_129467297_129536240 1 3477 54
4 chr10_129450960_T_C_b38 chr10_129536378_129563778 0 51 458
使用 data.table:
library(data.table)
setDT(lung)[!is_NL & counts/.N < 50|(is_NL %in% c(1, 2) & counts/.N > 50)]
library(data.table)
setDT(lung)[!is_NL & counts/.N < 50|(is_NL %in% c(1, 2) & counts/.N > 50)]
使用 data.table:
library(data.table)
setDT(lung)[!is_NL & counts/.N < 50|(is_NL %in% c(1, 2) & counts/.N > 50)]
library(data.table)
setDT(lung)[!is_NL & counts/.N < 50|(is_NL %in% c(1, 2) & counts/.N > 50)]