R:仅保留数据框中与特定模式长度匹配的行
我有一个如下所示的数据框(标签:r、dplyr):
# Example data: 25 single-residue rows (start == end == position), each with
# a score, a one-letter residue code and an epitope flag ("E" or ".").
# The object carries the classes "data.table" and "data.frame".
df <- structure(
  list(
    Sequence = rep("Sequence", 25),
    start = as.numeric(1:25),
    end = as.numeric(1:25),
    score = c(
      -0.205, -0.229, -0.115, -0.427, -0.327,
      -0.543, -0.717, -0.923, -1.241, -1.471,
      -1.737, -1.717, -1.247, -1.137, -0.689,
      -0.731, -0.337, 0.091, 0.579, 0.93,
      0.575, 0.128, -0.036, -0.186, -0.259
    ),
    # One character per row, split out of a compact string.
    residue = strsplit("MDARMRELSFKVVLLGEGRVGKTSL", "")[[1]],
    epitope = strsplit("EEE.....E..EE.....EEEEEE.", "")[[1]]
  ),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -25L)
)
我们可以使用 `data.table` 中的 `rleid`。将 "data.frame" 转换为 "data.table"(`setDT(df)`),按 `epitope` 的游程长度 id 分组,创建一个逻辑向量(`.N >= 5 & epitope == "E"`),提取行索引(`.I`),并据此对数据取子集:
library(data.table)

# Turn df into a data.table before querying it.
setDT(df)

# Group rows by the run-length id of `epitope`. Within each run, `.I` gives
# the row numbers, kept only when the run is "E" and has at least 5 rows
# (`.N`). The collected indices (column V1) are then used to subset df.
df[df[, .I[epitope == "E" & .N >= 5], by = rleid(epitope)]$V1]
# Sequence start end score residue epitope
#1: Sequence 19 19 0.579 R E
#2: Sequence 20 20 0.930 V E
#3: Sequence 21 21 0.575 G E
#4: Sequence 22 22 0.128 K E
#5: Sequence 23 23 -0.036 T E
#6: Sequence 24 24 -0.186 S E
或者我们可以在 `base R` 中使用类似的方法(基于 `rle`):
# Base R alternative: keep only rows that belong to a run of at least
# 5 consecutive "E" values in `epitope`.
# NOTE(review): the definition of `rl` was missing from the original snippet;
# it is reconstructed here as the run-length encoding of `epitope == "E"`,
# which is what the logical use of `values` below requires.
rl <- rle(df$epitope == "E")

# A run survives only if it is a run of TRUE ("E") and is long enough.
rl$values <- rl$values & rl$lengths >= 5

# Expand the per-run flags back to one logical per row and subset.
df[inverse.rle(rl), ]