R 从数据集查询

R 从数据集查询,r,merge,dplyr,R,Merge,Dplyr,我有两个数据集,如下所示 dat1 <- read.table(header=TRUE, text=" ID log Dist ab7 1.1 2 ab8 1.6 1.5 ab21 3 1 ab3 2.05 1.09 ab300 1.5 0.45 ab4 1.78 1.11

我有两个数据集,如下所示:

# Sample data: one row per ID with its `log` and `Dist` values.
dat1 <- read.table(
  text = "
    ID     log   Dist
    ab7    1.1   2
    ab8    1.6   1.5
    ab21   3     1
    ab3    2.05  1.09
    ab300  1.5   0.45
    ab4    1.78  1.11
    ab10   1.9   2
    ab501  1.5   0.2
  ",
  header = TRUE
)

dat1
     ID  log Dist
1   ab7 1.10 2.00
2   ab8 1.60 1.50
3  ab21 3.00 1.00
4   ab3 2.05 1.09
5 ab300 1.50 0.45
6   ab4 1.78 1.11
7  ab10 1.90 2.00
8 ab501 1.50 0.20


# Sample data: per-ID intervals (LFrom to LTo) with four item counts each.
dat2 <- read.table(
  text = "
    ID     LFrom  LTo   It1  It2  It3  It4
    ab7    1      1.05  47   152  259  140
    ab7    1.05   1.96  29   45   39   30
    ab7    1.96   2.35  59   65   47   40
    ab7    2.35   4.45  27   36   31   37
    ab7    4.45   5     58   60   60   56
    ab8    1.1    2.1   88   236  251  145
    ab8    2.1    3.1   51   66   47   43
    ab8    3.1    3.5   31   63   46   37
    ab8    3.5    3.8   58   35   31   51
    ab8    3.8    3.9   29   40   30   48
    ab21   1.2    2.1   72   263  331  147
    ab3    1      2     71   207  290  242
    ab3    2      3     22   38   64   46
    ab3    3      4     35   35   43   61
    ab3    4      4.5   42   37   33   53
    ab300  1      2     54   65   51   67
    ab4    1.2    2.1   67   38   54   24
    ab4    2.1    2.3   67   30   20   50
    ab4    2.3    9.1   67   27   34   39
    ab10   1.1    2     64   56   21   34
    ab501  1      2     47   152  259  140
  ",
  header = TRUE
)

dat2

      ID LFrom  LTo It1 It2 It3 It4
1    ab7  1.00 1.05  47 152 259 140
2    ab7  1.05 1.96  29  45  39  30
3    ab7  1.96 2.35  59  65  47  40
4    ab7  2.35 4.45  27  36  31  37
5    ab7  4.45 5.00  58  60  60  56
6    ab8  1.10 2.10  88 236 251 145
7    ab8  2.10 3.10  51  66  47  43
8    ab8  3.10 3.50  31  63  46  37
9    ab8  3.50 3.80  58  35  31  51
10   ab8  3.80 3.90  29  40  30  48
11  ab21  1.20 2.10  72 263 331 147
12   ab3  1.00 2.00  71 207 290 242
13   ab3  2.00 3.00  22  38  64  46
14   ab3  3.00 4.00  35  35  43  61
15   ab3  4.00 4.50  42  37  33  53
16 ab300  1.00 2.00  54  65  51  67
17   ab4  1.20 2.10  67  38  54  24
18   ab4  2.10 2.30  67  30  20  50
19   ab4  2.30 9.10  67  27  34  39
20  ab10  1.10 2.00  64  56  21  34
21 ab501  1.00 2.00  47 152 259 140
以下代码可以正常运行:

# Build dat3: for every row of dat1, gather the dat2 intervals of the same ID
# that overlap the query range [log, log + Dist]. The intervals are kept only
# when, taken together, their LFrom/LTo range spans the whole query range.
matched <- lapply(seq_len(nrow(dat1)), function(i) {
  d <- dat1[i, ]
  lower <- d$log
  upper <- d$log + d$Dist

  # dat2 intervals that belong to the same ID ...
  cand <- dat2[dat2$ID == d$ID, ]
  # ... and overlap the query range (each range starts before the other ends)
  cand <- cand[upper > cand$LFrom & cand$LTo > lower, ]
  if (nrow(cand) == 0) {
    return(NULL)
  }

  # Keep the intervals only if their combined range covers the query range.
  span <- range(cand$LFrom, cand$LTo)
  if (lower >= span[1] && upper <= span[2]) {
    data.frame(ID = d$ID, log = lower, Dist = d$Dist, cand[, -1])
  } else {
    NULL
  }
})

# Bind once at the end instead of growing dat3 with rbind() inside a loop.
matched <- Filter(Negate(is.null), matched)
dat3 <- if (length(matched) > 0) do.call(rbind, matched) else data.frame()

# NULL resets the row names to 1..n and, unlike 1:nrow(dat3), is also valid
# when no row matched at all.
rownames(dat3) <- NULL
print(dat3)
dat3

dplyr 解决方案:

library(dplyr)

# For every ID, keep the dat2 intervals that overlap the query range
# [log, log + Dist] taken from dat1, but only when those intervals together
# span the whole query range. Boundary handling matches the base-R loop:
# intervals that merely touch the query range at an end point are dropped.
dat2 %>%
  # attach log/Dist per ID; IDs that are absent from dat1 are dropped
  inner_join(dat1, by = "ID") %>%
  mutate(upper = log + Dist) %>%
  # two ranges overlap when each one starts before the other one ends
  filter(LFrom < upper, LTo > log) %>%
  group_by(ID, log, Dist) %>%
  # the overlapping intervals must cover the complete query range
  filter(min(LFrom) <= log, max(LTo) >= upper) %>%
  ungroup() %>%
  select(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4)
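dat2 %>%
  group_by(ID) %>%
  mutate(log = dat1$log[dat1$ID == unique(ID)]) %>%
  mutate(Dist = dat1$Dist[dat1$ID == unique(ID)]) %>%
  mutate(LFromMin = min(LFrom)) %>%
  mutate(LToMax = max(LTo)) %>%
  mutate(upper = log+Dist) %>%
  filter(log > LFromMin & upper<LToMax) %>%
  filter(LFrom >= LFrom[which(LFrom-log== max((LFrom-log)[LFrom-log < 0]))]) %>%
  filter(LTo <= LTo[which(LTo-upper == min((LTo - upper)[LTo-upper>0]))]) %>%
  select(c(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4))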
library(dplyr)

# For every ID, keep the dat2 intervals that overlap the query range
# [log, log + Dist] taken from dat1, but only when those intervals together
# span the whole query range. Boundary handling matches the base-R loop:
# intervals that merely touch the query range at an end point are dropped.
dat2 %>%
  # attach log/Dist per ID; IDs that are absent from dat1 are dropped
  inner_join(dat1, by = "ID") %>%
  mutate(upper = log + Dist) %>%
  # two ranges overlap when each one starts before the other one ends
  filter(LFrom < upper, LTo > log) %>%
  group_by(ID, log, Dist) %>%
  # the overlapping intervals must cover the complete query range
  filter(min(LFrom) <= log, max(LTo) >= upper) %>%
  ungroup() %>%
  select(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4)