R 按组排除连续行上的非重叠范围
我有如下数据框:R 按组排除连续行上的非重叠范围,r,data.table,R,Data.table,我有如下数据框: df1 <- data.frame(Group = c("scaf1", "scaf1", "scaf1", "scaf2", "scaf2", "scaf2", "scaf3", "scaf3", "scaf4", "scaf4"), Start = c(10, 40, 90, 50, 80, 95, 600, 800, 70, 100), End = c(50, 70, 120, 70, 100,
# Sample data: ten (Group, Start, End) ranges spread over four scaffolds.
df1 <- data.frame(
  Group = rep(c("scaf1", "scaf2", "scaf3", "scaf4"), times = c(3, 3, 2, 2)),
  Start = c(10, 40, 90, 50, 80, 95, 600, 800, 70, 100),
  End = c(50, 70, 120, 70, 100, 150, 700, 850, 100, 145)
)
# Print the data frame.
df1
# Group Start End
# scaf1 10 50
# scaf1 40 70
# scaf1 90 120
# scaf2 50 70
# scaf2 80 100
# scaf2 95 150
# scaf3 600 700
# scaf3 800 850
# scaf4 70 100
# scaf4 100 145
我尝试执行以下命令,但失败:
# NOTE(review): this is the asker's failing attempt, kept verbatim.
# It cannot parse: the `.(` opened before `(start[2] < End[1])` is never
# closed before the final `]`. In addition, `start` and `group` do not match
# the column names `Start` and `Group` defined in df1.
setDT(df1)[ , .((start[2] < End[1])[-.N], by = group]
setDT(df1)[ , .((start[2] < End[1])[-.N], by = group]
提前谢谢。给你:
# Sample data: ten (Group, Start, End) ranges spread over four scaffolds.
df1 <- data.frame(
  Group = c("scaf1", "scaf1", "scaf1", "scaf2", "scaf2", "scaf2",
            "scaf3", "scaf3", "scaf4", "scaf4"),
  Start = c(10, 40, 90, 50, 80, 95, 600, 800, 70, 100),
  End = c(50, 70, 120, 70, 100, 150, 700, 850, 100, 145)
)

# Flag every row that overlaps the row directly before or after it within the
# same Group. Two consecutive rows overlap when the later row's Start is <= the
# earlier row's End, so ranges that merely touch count as overlapping.
#
# seq_len(n)[-1] is empty for 0- or 1-row input; 2:nrow(df1) would instead
# count down and index past the end of the data.
later <- seq_len(nrow(df1))[-1]
earlier <- later - 1L
overlapping <- which(
  df1$Group[later] == df1$Group[earlier] &
    df1$Start[later] <= df1$End[earlier]
)
flagged_rows <- c(later[overlapping], earlier[overlapping])
df1$filter <- seq_len(nrow(df1)) %in% flagged_rows

# df2 holds only the flagged rows, without the helper column; df1 keeps it.
df2 <- df1[df1$filter, , drop = FALSE]
df2$filter <- NULL
获取输出的另一种方法是使用GenomicRanges包定义的范围
library(GenomicRanges)

# Build one genomic range per row: Group is the sequence name and
# Start/End bound the interval.
df1_gr <- GRanges(
  seqnames = df1$Group,
  ranges = IRanges(start = df1$Start, end = df1$End)
)

# Compare the ranges against themselves and tabulate the hit pairs.
gr <- as.data.frame(findOverlaps(df1_gr))

# Every range trivially overlaps itself; keep only pairs of distinct rows.
is_distinct_pair <- gr$queryHits != gr$subjectHits
gr <- gr[is_distinct_pair, ]

# Rows of df1 that overlap at least one other row.
df1[gr$queryHits, ]
Group Start End
1: scaf1 10 50
2: scaf1 40 70
3: scaf2 80 100
4: scaf2 95 150
5: scaf4 70 100
6: scaf4 100 145
library(GenomicRanges)
#创建GRanges对象
df1_gr …
请检查您的预期输出,可能需要一些更正。既然第一行 10 50 在输出中,为什么 scaf2 上的 50 70 不在输出中?
正如 akrun 已经评论过的那样,您的输出似乎有误。但是您可以尝试以下方法:unlist(lapply(split(df1, df1$Group), function(x) c(TRUE, sapply(1:(length(x$Start)-1), function(y) x$Start[y+1] …
这一行发生了什么事?scaf2 50 70
和scaf3
?可能:df1 %>% group_by(group) %>% filter((Start >= lag(Start) & Start … 或者试试:setDT(df1)[df1[, .I[(Start >= shift(Start, fill = 0)] & Start …
> df2
Group Start End
1 scaf1 10 50
2 scaf1 40 70
5 scaf2 80 100
6 scaf2 95 150
9 scaf4 70 100
10 scaf4 100 145
library(GenomicRanges)

# Build one genomic range per row: Group is the sequence name and
# Start/End bound the interval.
df1_gr <- GRanges(
  seqnames = df1$Group,
  ranges = IRanges(start = df1$Start, end = df1$End)
)

# Compare the ranges against themselves and tabulate the hit pairs.
gr <- as.data.frame(findOverlaps(df1_gr))

# Every range trivially overlaps itself; keep only pairs of distinct rows.
is_distinct_pair <- gr$queryHits != gr$subjectHits
gr <- gr[is_distinct_pair, ]

# Rows of df1 that overlap at least one other row.
df1[gr$queryHits, ]
Group Start End
1: scaf1 10 50
2: scaf1 40 70
3: scaf2 80 100
4: scaf2 95 150
5: scaf4 70 100
6: scaf4 100 145