R 排除特定行下面的所有记录
(标签:r, dataframe)
我的数据包含三个变量(t、y、z)和四个唯一ID,每个ID都有多条记录。见下文:
# Example data: four IDs holding 7, 6, 5 and 6 records respectively.
ID <- rep(c(1, 2, 3, 4), times = c(7, 6, 5, 6))
# t counts 1..n within IDs 1-3 and is constant (2) for ID 4.
t <- c(1:7, 1:6, 1:5, rep(2, times = 6))
# y is constant within each ID.
y <- rep(c(6, 1, 6, 0.2), times = c(7, 6, 5, 6))
# z is the column to screen; it contains a negative value, NaN and Inf.
z <- c(
  5, 0, 0, 0, 1, 0, 0,     # ID 1
  0, 0, -1, 0, 0, 0,       # ID 2
  4, 2, NaN, 0, 1,         # ID 3
  0, 0, 1, Inf, Inf, Inf   # ID 4
)
dat1 <- data.frame(ID = ID, t = t, y = y, z = z)
这适用于您的示例数据,但可能不是最快的解决方案(需要先加载 dplyr):对每个ID,删除第一个负值、NaN 或 Inf 所在的行及其之后的所有行。
# Loop-based solution: for every ID, drop the first "bad" row (z negative,
# NaN or +/-Inf) together with every row of that ID that follows it.
# The result is stored in dat2.
library(dplyr)  # mutate(), filter() and select() below are the dplyr verbs

# Work on a copy so that dat1 itself is left untouched.
# seq_len() (rather than 1:nrow()) also copes with a 0-row data frame.
trimmed <- mutate(dat1, rownumber = seq_len(nrow(dat1)))

for (i in unique(trimmed$ID)) {
  id_rows <- filter(trimmed, ID == i)
  bad_rows <- filter(id_rows, z %in% c(NaN, Inf, -Inf) | z < 0)
  # "last row of this ID + 1" is the fallback when the ID has no bad row,
  # so that nothing is removed for that ID.
  first_miss <- min(
    bad_rows$rownumber,
    max(id_rows$rownumber) + 1,
    na.rm = TRUE
  )
  trimmed <- filter(trimmed, !(ID == i & rownumber >= first_miss))
}

# Drop the helper column again.
dat2 <- select(trimmed, -rownumber)
mutate(dat1, rownumber = 1:nrow(dat1)) -> dat1
for(i in unique(dat1$ID)) {
firstMiss = min(filter(dat1, ID==i &
(z %in% c(NaN, Inf, -Inf) |
z < 0))$rownumber,
max(filter(dat1, ID==i)$rownumber)+1,
na.rm=TRUE)
dat1 <- filter(dat1, !(ID==i & rownumber >= firstMiss))
}
dat2 <- select(dat1, -rownumber)
如果您的ID列按升序排序,则可以使用:
# For each ID keep only the rows that come before the first "bad" z
# (negative, NaN or +/-Inf). The cumulative sum of the bad-row flag stays at 0
# until the first bad row is reached.
# The per-ID results are written back by row position via `split<-`, so the
# mask lines up with dat1 even when ID is unsorted or interleaved
# (unlist(tapply(...)) returns values in factor-level order instead).
keep <- logical(nrow(dat1))
split(keep, dat1$ID) <- lapply(
  split(dat1$z, dat1$ID),
  function(x) cumsum(x < 0 | x %in% c(NaN, -Inf, Inf)) == 0
)
dat2 <- dat1[keep, ]
dat2
ID t y z
1 1 1 6.0 5
2 1 2 6.0 0
3 1 3 6.0 0
4 1 4 6.0 0
5 1 5 6.0 1
6 1 6 6.0 0
7 1 7 6.0 0
8 2 1 1.0 0
9 2 2 1.0 0
14 3 1 6.0 4
15 3 2 6.0 2
19 4 2 0.2 0
20 4 2 0.2 0
21 4 2 0.2 1
另一种做法是用 ave 按组处理,
并使用 cumsum 计数器
识别第一个异常值所在的行及其之后需要删除的所有行:
# Flag rows whose z is non-finite or negative, count those flags cumulatively
# within each ID, and keep the rows where the running count is still zero.
dat1[
  with(dat1, ave(!is.finite(z) | z < 0, ID, FUN = cumsum)) == 0,
]
dat1[with(dat1, ave(z < 0 | (!is.finite(z)), ID, FUN=cumsum) == 0),]
快速检查两种结果是否一致(忽略未对齐的行名):
# Sanity check: the ave()-based filter must give the same rows as dat2.
# Attributes are ignored because the row names of the two results differ.
all.equal(
  target = dat2,
  current = dat1[
    with(dat1, ave(!is.finite(z) | z < 0, ID, FUN = cumsum)) == 0,
  ],
  check.attributes = FALSE
)
#[1] TRUE
all.equal(
dat2,
dat1[with(dat1, ave(z < 0 | (!is.finite(z)), ID, FUN=cumsum) == 0),],
check.attributes=FALSE
)
#[1] TRUE
纯属娱乐,再给出一个 dplyr 版本:
# dplyr version: within each ID keep only the rows that come before the first
# "bad" z (negative, NaN, NA or +/-Inf).
library(dplyr)
dat1 %>%
  group_by(ID) %>%
  # The running count of bad rows is 0 up to, but not including, the first
  # bad row. Testing z itself (instead of lead(z)) also removes a group whose
  # very first row is already bad; a lead()-based lookup never inspects row 1.
  filter(cumsum(z < 0 | !is.finite(z)) == 0)
# The result is still grouped by ID; append ungroup() if that is not wanted.
# # A tibble: 14 x 4
# # Groups: ID [4]
# ID t y z
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 6.0 5
# 2 1 2 6.0 0
# 3 1 3 6.0 0
# 4 1 4 6.0 0
# 5 1 5 6.0 1
# 6 1 6 6.0 0
# 7 1 7 6.0 0
# 8 2 1 1.0 0
# 9 2 2 1.0 0
# 10 3 1 6.0 4
# 11 3 2 6.0 2
# 12 4 2 0.2 0
# 13 4 2 0.2 0
# 14 4 2 0.2 1
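library(dplyr)
dat1 %>% group_by(ID) %>%
mutate(non_positive = min(which(lead(z,1) < 0 |
!is.finite(lead(z,1)) | row_number() == n()))) %>%
filter(row_number() <= non_positive) %>%
select(-non_positive)
# # A tibble: 14 x 4
# # Groups: ID [4]
# ID t y z
# <dbl> <dbl> <dbl> <dbl>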
# 1 1 1 6.0 5
# 2 1 2 6.0 0
# 3 1 3 6.0 0
# 4 1 4 6.0 0
# 5 1 5 6.0 1
# 6 1 6 6.0 0
# 7 1 7 6.0 0
# 8 2 1 1.0 0
# 9 2 2 1.0 0
# 10 3 1 6.0 4
# 11 3 2 6.0 2
# 12 4 2 0.2 0
# 13 4 2 0.2 0
# 14 4 2 0.2 1
评论:这依赖于ID已按顺序排列。请在行顺序被打乱的 dat1 上试一试。
回复:是的——tapply 会把ID当作因子并按因子水平排序。我已在上面添加了一条说明。谢谢,您使用 ave 的解决方案更好。
评论:用 cummin 的写法基本相同:
dat1[as.logical(ave(dat1$z, dat1$ID, FUN=function(x) cummin(is.finite(x) & x>=0))),]
。如果让参数匹配得更严谨一些,这段代码还可以再整理一下。
# dplyr version: within each ID keep only the rows that come before the first
# "bad" z (negative, NaN, NA or +/-Inf).
library(dplyr)
dat1 %>%
  group_by(ID) %>%
  # The running count of bad rows is 0 up to, but not including, the first
  # bad row. Testing z itself (instead of lead(z)) also removes a group whose
  # very first row is already bad; a lead()-based lookup never inspects row 1.
  filter(cumsum(z < 0 | !is.finite(z)) == 0)
# The result is still grouped by ID; append ungroup() if that is not wanted.
# # A tibble: 14 x 4
# # Groups: ID [4]
# ID t y z
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 6.0 5
# 2 1 2 6.0 0
# 3 1 3 6.0 0
# 4 1 4 6.0 0
# 5 1 5 6.0 1
# 6 1 6 6.0 0
# 7 1 7 6.0 0
# 8 2 1 1.0 0
# 9 2 2 1.0 0
# 10 3 1 6.0 4
# 11 3 2 6.0 2
# 12 4 2 0.2 0
# 13 4 2 0.2 0
# 14 4 2 0.2 1