R 从数据集查询
我有两个数据集,如下所示R 从数据集查询,r,merge,dplyr,R,Merge,Dplyr,我有两个数据集,如下所示 dat1 <- read.table(header=TRUE, text=" ID log Dist ab7 1.1 2 ab8 1.6 1.5 ab21 3 1 ab3 2.05 1.09 ab300 1.5 0.45 ab4 1.78 1.11
dat1 <- read.table(header=TRUE, text="
ID log Dist
ab7 1.1 2
ab8 1.6 1.5
ab21 3 1
ab3 2.05 1.09
ab300 1.5 0.45
ab4 1.78 1.11
ab10 1.9 2
ab501 1.5 0.2
")
dat1
ID log Dist
1 ab7 1.10 2.00
2 ab8 1.60 1.50
3 ab21 3.00 1.00
4 ab3 2.05 1.09
5 ab300 1.50 0.45
6 ab4 1.78 1.11
7 ab10 1.90 2.00
8 ab501 1.50 0.20
dat2 <- read.table(header=TRUE, text="
ID LFrom LTo It1 It2 It3 It4
ab7 1 1.05 47 152 259 140
ab7 1.05 1.96 29 45 39 30
ab7 1.96 2.35 59 65 47 40
ab7 2.35 4.45 27 36 31 37
ab7 4.45 5 58 60 60 56
ab8 1.1 2.1 88 236 251 145
ab8 2.1 3.1 51 66 47 43
ab8 3.1 3.5 31 63 46 37
ab8 3.5 3.8 58 35 31 51
ab8 3.8 3.9 29 40 30 48
ab21 1.2 2.1 72 263 331 147
ab3 1 2 71 207 290 242
ab3 2 3 22 38 64 46
ab3 3 4 35 35 43 61
ab3 4 4.5 42 37 33 53
ab300 1 2 54 65 51 67
ab4 1.2 2.1 67 38 54 24
ab4 2.1 2.3 67 30 20 50
ab4 2.3 9.1 67 27 34 39
ab10 1.1 2 64 56 21 34
ab501 1 2 47 152 259 140
")
dat2
ID LFrom LTo It1 It2 It3 It4
1 ab7 1.00 1.05 47 152 259 140
2 ab7 1.05 1.96 29 45 39 30
3 ab7 1.96 2.35 59 65 47 40
4 ab7 2.35 4.45 27 36 31 37
5 ab7 4.45 5.00 58 60 60 56
6 ab8 1.10 2.10 88 236 251 145
7 ab8 2.10 3.10 51 66 47 43
8 ab8 3.10 3.50 31 63 46 37
9 ab8 3.50 3.80 58 35 31 51
10 ab8 3.80 3.90 29 40 30 48
11 ab21 1.20 2.10 72 263 331 147
12 ab3 1.00 2.00 71 207 290 242
13 ab3 2.00 3.00 22 38 64 46
14 ab3 3.00 4.00 35 35 43 61
15 ab3 4.00 4.50 42 37 33 53
16 ab300 1.00 2.00 54 65 51 67
17 ab4 1.20 2.10 67 38 54 24
18 ab4 2.10 2.30 67 30 20 50
19 ab4 2.30 9.10 67 27 34 39
20 ab10 1.10 2.00 64 56 21 34
21 ab501 1.00 2.00 47 152 259 140
以下代码工作:
dat3 <- data.frame()
for(i in 1:nrow(dat1)){
d <- dat1[i,]
# filter dat2 with ID
d2 <- dat2[dat2$ID == d$ID,]
# filter dat2 with range interference
r1 <- c(d$log, d$log + d$Dist)
d2 <- d2[apply(d2[,2:3], 1, function(x){r1[2] > x[1] & x[2] > r1[1]}),]
# filter dat2 with range and collect data to dat3
if(nrow(d2) > 0){
r2 <- range(d2[,2:3])
if(r1[1] >= r2[1] & r1[2] <= r2[2])
dat3 <- rbind(dat3, data.frame(ID=d$ID, log=d$log, Dist=d$Dist, d2[,-1]))
}
}
rownames(dat3) <- 1:nrow(dat3)
print(dat3)
dat3Dplyr解决方案:
dat2 %>%
group_by(ID) %>%
mutate(log = dat1$log[dat1$ID == unique(ID)]) %>%
mutate(Dist = dat1$Dist[dat1$ID == unique(ID)]) %>%
mutate(LFromMin = min(LFrom)) %>%
mutate(LToMax = max(LTo)) %>%
mutate(upper = log+Dist) %>%
filter(log > LFromMin & upper<LToMax) %>%
filter(LFrom >= LFrom[which(LFrom-log== max((LFrom-log)[LFrom-log < 0]))]) %>%
filter(LTo <= LTo[which(LTo-upper == min((LTo - upper)[LTo-upper>0]))]) %>%
select(c(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4))
dat2%>%
分组依据(ID)%>%
变异(log=dat1$log[dat1$ID==unique(ID)])%>%
变异(Dist=dat1$Dist[dat1$ID==unique(ID)])%>%
突变(LFromMin=min(LFrom))%>%
变异(LToMax=max(LTo))%>%
变异(上限=对数+距离)%>%
过滤器(对数>最小值和上限%
过滤器(LFrom>=LFrom[其中(LFrom log==max((LFrom log)[LFrom log<0]))]%>%
过滤器(LTo 0]))]]%>%
选择(c(ID、日志、距离、LFrom、LTo、It1、It2、It3、It4))
dat2 %>%
group_by(ID) %>%
mutate(log = dat1$log[dat1$ID == unique(ID)]) %>%
mutate(Dist = dat1$Dist[dat1$ID == unique(ID)]) %>%
mutate(LFromMin = min(LFrom)) %>%
mutate(LToMax = max(LTo)) %>%
mutate(upper = log+Dist) %>%
filter(log > LFromMin & upper<LToMax) %>%
filter(LFrom >= LFrom[which(LFrom-log== max((LFrom-log)[LFrom-log < 0]))]) %>%
filter(LTo <= LTo[which(LTo-upper == min((LTo - upper)[LTo-upper>0]))]) %>%
select(c(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4))