R 基于现有时间间隔创建所有可能时间间隔的表

R 基于现有时间间隔创建所有可能时间间隔的表,r,R,目前有两个(大量)表具有“开始”和“结束”日期。我想合并这两个表,这样我就有了所有可能的“从”和“到”日期集,可以从原始日期形成。例如,如果int1==0:6,int2==3:9,那么我需要三个间隔:0:2,3:6,7:9 我尝试过使用foverlaps并手动创建所有可能的日期间隔,然后将数据合并到该表中。下面的代码显示了这些失败的玩具数据尝试。下面的预期输出应该清楚地表明我想要完成什么 现有的表非常庞大(数百万个id,每个id都有多组日期) 我正在尝试第三种方法。。。创建一个空表,每个id每行

目前有两个(大量)表具有“开始”和“结束”日期。我想合并这两个表,这样我就有了所有可能的“从”和“到”日期集,可以从原始日期形成。例如,如果int1==0:6,int2==3:9,那么我需要三个间隔:0:2,3:6,7:9

我尝试过使用foverlaps并手动创建所有可能的日期间隔,然后将数据合并到该表中。下面的代码显示了这些失败的玩具数据尝试。下面的预期输出应该清楚地表明我想要完成什么

现有的表非常庞大(数百万个id,每个id都有多组日期)

我正在尝试第三种方法。。。创建一个空表,每个id每行有1天(作为“到”和“从”日期)。这个方法的问题是,考虑到我需要覆盖的ID数量和年份,它的速度非常慢。已经快20个小时了,我的基表还在创建中。之后,计划是使用foverlaps在现有表上合并

我对这个问题感到非常恼火,如果能得到任何帮助,我将不胜感激

# load packages
library(data.table)
library(lubridate)
# create data
dt1<- data.table(id = rep(1111, 4),
           from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
           to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
           progs = c("a1", "b1", "c1", "d1"))
setkey(dt1, id, from_date, to_date)    

dt2<- data.table(id = rep(1111, 4),
           from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
           to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
           progs = c("a2", "b2", "c2", "d2"))
setkey(dt2, id, from_date, to_date)    

# expected (hoped for) output
id  from_date   to_date progs1  prog2
1111    1/1/2016    1/31/2016   a1  NA
1111    2/1/2016    2/28/2016   a1  a2
1111    2/29/2016   3/15/2016   a1  NA
1111    3/31/2016   3/31/2016   b1  NA
1111    4/1/2016    9/1/2016    b1  b2
1111    9/2/2016    9/2/2016    c1  b2
1111    9/3/2016    9/30/2016   d1  b2
1111    10/1/2016   10/31/2016  NA  d1
1111    11/1/2016   11/30/2016  d1  c2
1111    12/1/2016   12/15/2016  d1  NA
1111    12/16/2016  12/31/2016  NA  d2

# failed attempt #1: using foverlaps
overlaps <- foverlaps(x=dt1, y=dt2, 
                by.x = c("id", "from_date", "to_date"),
                by.y = c("id", "from_date", "to_date"), 
                type = "any", 
                mult ="all")
# this does not give every time interval    

# failed attempt #2... super convoluted method
# try to make every possible time interval ----
dt <- rbind(dt1[, .(id, from_date)], dt2[, .(id, from_date)]) 
dt.temp <- rbind(dt1[, .(id, to_date)], dt2[, .(id, to_date)]) # get table with to_dates
setnames(dt.temp, "to_date", "from_date") 
dt <- rbind(dt, dt.temp)
rm(dt.temp)
dt <- unique(dt)
setorder(dt, -from_date)
dt[, to_date := as.Date(c(NA, from_date[-.N]), origin = "1970-01-01"), by = "id"]
setorder(dt, from_date)
dt <- dt[!is.na(to_date)] # the last 'from_date' is actually the final to_date, so it doesn't begin a time interval
dt[, counter := 1:.N, by = id] # create indicator so we can know which interval is the first interval for each id
dt[counter != 1, from_date := as.integer(from_date + 1)] # to prevent overlap with previous interval
dt[, counter := NULL]
setkey(dt, id, from_date, to_date)    

# merge on dt1 ----
dt <- foverlaps(dt, dt1, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt1 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)    

# merge on dt2 ----
dt <- foverlaps(dt, dt2, type = "any", mult = "all")
dt[, from_date := i.from_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, to_date := i.to_date] # when dt2 didn't match, the from_date is NA. fill with i.from_date
dt[, c("i.from_date", "i.to_date") := NULL] # no longer needed
setkey(dt, id, from_date, to_date)    

setnames(dt, c("i.progs", "progs"), c("progs1", "progs2"))    

# Collapse data if dates are contiguous and data are the same ----
# Create unique ID for data chunks ----
dt[, group := .GRP, by = c("id", "progs1", "progs2")] # create group id
dt[, group := cumsum( c(0, diff(group)!=0) )] # in situation like a:a:a:b:b:b:b:a:a:a, want to distinguish first set of "a" from second set of "a"    

# Create unique ID for contiguous times within a given data chunk ----
setkey(dt, id, from_date)
dt[, prev_to_date := c(NA, to_date[-.N]), by = "group"]
dt[, diff.prev := from_date - prev_to_date] # difference between from_date & prev_to_date will be 1 (day) if they are contiguous
dt[diff.prev != 1, diff.prev := NA] # set to NA if difference is not 1 day, i.e., it is not contiguous, i.e., it starts a new contiguous chunk
dt[is.na(diff.prev), contig.id := .I] # Give a unique number for each start of a new contiguous chunk (i.e., section starts with NA)
setkey(dt, group, from_date) # need to order the data so the following line will work.
dt[, contig.id  := contig.id[1], by=  .( group , cumsum(!is.na(contig.id))) ] # fill forward by group
dt[, c("prev_to_date", "diff.prev") := NULL] # drop columns that were just intermediates    

# Collapse rows where data chunks are constant and time is contiguous ----      
dt[, from_date := min(from_date), by = c("group", "contig.id")]
dt[, to_date := max(to_date), by = c("group", "contig.id")]
dt[, c("group", "contig.id") := NULL]
dt <- unique(dt)      

# the end result is incorrect table
id  from_date   to_date progs2  progs1
1111    1/1/2016    2/28/2016   a2  a1
1111    2/29/2016   3/15/2016   NA  a1
1111    3/16/2016   3/31/2016   NA  b1
1111    4/1/2016    9/1/2016    b2  b1
1111    9/2/2016    9/2/2016    b2  c1
1111    9/3/2016    9/30/2016   b2  d1
1111    10/1/2016   11/30/2016  c2  d1
1111    12/1/2016   12/15/2016  d2  d1
1111    12/16/2016  12/31/2016  d2  NA
#加载包
库(数据表)
图书馆(lubridate)
#创建数据

dt1不是100%确定您要做什么,但是,有一个名为crossing的函数,它可以让您在多个向量之间进行所有排列


> library(tidyr)
> a <- c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")
> b <- c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")
> c <- rep(1111, 4)
> crossing(a, b,c)

# A tibble: 16 x 3
   a          b              c
   <chr>      <chr>      <dbl>
 1 2016-01-01 2016-03-15  1111
 2 2016-01-01 2016-09-01  1111
 3 2016-01-01 2016-09-02  1111
 4 2016-01-01 2016-12-15  1111
 5 2016-03-31 2016-03-15  1111
 6 2016-03-31 2016-09-01  1111
 7 2016-03-31 2016-09-02  1111
 8 2016-03-31 2016-12-15  1111
 9 2016-09-02 2016-03-15  1111
10 2016-09-02 2016-09-01  1111
11 2016-09-02 2016-09-02  1111
12 2016-09-02 2016-12-15  1111
13 2016-09-03 2016-03-15  1111
14 2016-09-03 2016-09-01  1111
15 2016-09-03 2016-09-02  1111
16 2016-09-03 2016-12-15  1111

>图书馆(tidyr)
>a、b、c交叉口(a、b、c)
#一个tibble:16 x 3
a、b、c
1 2016-01-01 2016-03-15  1111
2 2016-01-01 2016-09-01  1111
3 2016-01-01 2016-09-02  1111
4 2016-01-01 2016-12-15  1111
5 2016-03-31 2016-03-15  1111
6 2016-03-31 2016-09-01  1111
7 2016-03-31 2016-09-02  1111
8 2016-03-31 2016-12-15  1111
9 2016-09-02 2016-03-15  1111
10 2016-09-02 2016-09-01  1111
11 2016-09-02 2016-09-02  1111
12 2016-09-02 2016-12-15  1111
13 2016-09-03 2016-03-15  1111
14 2016-09-03 2016-09-01  1111
15 2016-09-03 2016-09-02  1111
16 2016-09-03 2016-12-15  1111

如果你想达到的目标是这样的话,这会不会是你想要达到的目标?

我想我理解你的意思,试试这个——从R开始:

library("data.table")
dt1<- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                 to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                 progs1 = c("a1", "b1", "c1", "d1"))

dt2 <- data.table(id = rep(1111, 4),
                 from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                 to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                 progs2 = c("a2", "b2", "c2", "d2"))

# Full outer join: 

dt3 <- merge(dt1, dt2, by = intersect(colnames(dt1), colnames(dt2)), all = TRUE)
库(“data.table”)

dt1它并不漂亮,但这里有一个混合的tidyverse/data.table解决方案。它将事物分解为多个组件:

  • dt1和dt2之间所有可能数据组合的完全连接 (凭身份证)
  • 确定每行上出现的重叠类型(有7个排列)并设置重叠日期(\u o后缀)
  • 重叠类型需要不同数量的行来映射合并的开始和结束日期。展开数据框以提供每个重叠类型所需的行数
  • 根据重叠类型创建组合日期(\u c后缀)
  • 确定组合日期范围适用于哪个数据集(enroll_type=dt1、dt2或两者),然后从单个源(dt1/dt2)中删除完全由“两者”的enroll_类型覆盖的行
  • 由于前面是按ID+日期排序的,因此可以使用提前/延迟来截断组合日期,这样就不会有任何日期包含在多个startdate_c-enddate_c span中
  • 您可能会找到方法使其更加优雅和高效

          library(data.table)
        library(tidyr)
    
        #create test data ----
        dt1<- data.table(id = rep(1111, 4),
                         from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                         to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                         progs = c("a1", "b1", "c1", "d1"))
        setkey(dt1, id, from_date, to_date)    
    
        dt2<- data.table(id = rep(1111, 4),
                         from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                         to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                         progs = c("a2", "b2", "c2", "d2"))
        setkey(dt2, id, from_date, to_date)    
    
        # create all possible matches between time segments ----
        dt <- setDT(mutate(dt1) %>% full_join(., dt2, by = c("id")) )
        #dt[, c("progs.y", "progs.x") := NULL]
        #setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "startdate_dt2", "enddate_dt2"))
        setnames(dt, names(dt), c("id", "startdate_dt1", "enddate_dt1", "progs1", "startdate_dt2", "enddate_dt2", "progs2"))
    
        # set up intervals ----
        temp <- dt %>%
          mutate(overlap_type = case_when(
            # First ID the non-matches
            is.na(startdate_dt1) | is.na(startdate_dt2) ~ 0,
            # Then figure out which overlapping date comes first
            # Exactly the same dates
            startdate_dt1 == startdate_dt2 & enddate_dt1 == enddate_dt2 ~ 1,
            # dt1 before dt2 (or exactly the same dates)
            startdate_dt1 <= startdate_dt2 & startdate_dt2 <= enddate_dt1 & 
              enddate_dt1 <= enddate_dt2 ~ 2,
            # dt2 before dt1
            startdate_dt2 <= startdate_dt1 & startdate_dt1 <= enddate_dt2 & 
              enddate_dt2 <= enddate_dt1 ~ 3,
            # dt2 dates competely within dt1 dates or vice versa
            startdate_dt2 >= startdate_dt1 & enddate_dt2 <= enddate_dt1 ~ 4,
            startdate_dt1 >= startdate_dt2 & enddate_dt1 <= enddate_dt2 ~ 5,
            # dt1 coverage only before dt2 (or dt2 only after dt1)
            startdate_dt1 < startdate_dt2 & enddate_dt1 < startdate_dt2 ~ 6,
            # dt1 coverage only after dt2 (or dt2 only before dt1)
            startdate_dt1 > enddate_dt2 & enddate_dt1 > enddate_dt2 ~ 7,
            # Any rows that are left
            TRUE ~ 8),
            # Calculate overlapping dates
            startdate_o = as.Date(case_when(
              overlap_type %in% c(1, 2, 4) ~ startdate_dt2,
              overlap_type %in% c(3, 5) ~ startdate_dt1), origin = "1970-01-01"),
            enddate_o = as.Date(ifelse(overlap_type %in% c(1:5),
                                       pmin(enddate_dt2, enddate_dt1),
                                       NA), origin = "1970-01-01"),
            # Need to duplicate rows to separate out non-overlapping dt1 and dt2 periods
            repnum = case_when(
              overlap_type %in% c(2:5) ~ 3,
              overlap_type %in% c(6:7) ~ 2,
              TRUE ~ 1)
          ) %>%
          select(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, 
                 startdate_o, enddate_o, overlap_type, repnum) %>%
          arrange(id, startdate_dt1, startdate_dt2, startdate_o, 
                  enddate_dt1, enddate_dt2, enddate_o)
    
    
        ### Expand out rows to separate out overlaps ----
        temp_ext <- temp[rep(seq(nrow(temp)), temp$repnum), 1:ncol(temp)]
    
        ## process expanded ----
        temp_ext <- temp_ext %>% 
          group_by(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2) %>% 
          mutate(rownum_temp = row_number()) %>%
          ungroup() %>%
          arrange(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o, 
                  enddate_o, overlap_type, rownum_temp) %>%
          mutate(
            # Remove non-overlapping dates
            startdate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) | 
                                           (overlap_type == 7 & rownum_temp == 1), 
                                         NA, startdate_dt1), origin = "1970-01-01"), 
            enddate_dt1 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 2) | 
                                         (overlap_type == 7 & rownum_temp == 1), 
                                       NA, enddate_dt1), origin = "1970-01-01"),
            startdate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) | 
                                           (overlap_type == 7 & rownum_temp == 2), 
                                         NA, startdate_dt2), origin = "1970-01-01"), 
            enddate_dt2 = as.Date(ifelse((overlap_type == 6 & rownum_temp == 1) | 
                                         (overlap_type == 7 & rownum_temp == 2), 
                                       NA, enddate_dt2), origin = "1970-01-01")) %>%
          distinct(id, startdate_dt1, enddate_dt1, startdate_dt2, enddate_dt2, startdate_o, 
                   enddate_o, overlap_type, rownum_temp, .keep_all = TRUE) %>%
          # Remove first row if start dates are the same or dt1 is only one day
          filter(!(overlap_type %in% c(2:5) & rownum_temp == 1 & 
                     (startdate_dt1 == startdate_dt2 | startdate_dt1 == enddate_dt1))) %>%
          # Remove third row if enddates are the same
          filter(!(overlap_type %in% c(2:5) & rownum_temp == 3 & enddate_dt1 == enddate_dt2))
    
        ##  Calculate the finalized date columms----
        ### Calculate finalized date columns
        temp_ext <- temp_ext %>%
          # Set up combined dates
          mutate(
            # Start with rows with only dt1 or dt2, or when both sets of dates are identical
            startdate_c = as.Date(
              case_when(
                (!is.na(startdate_dt1) & is.na(startdate_dt2)) | overlap_type == 1 ~ startdate_dt1,
                !is.na(startdate_dt2) & is.na(startdate_dt1) ~ startdate_dt2), origin = "1970-01-01"),
            enddate_c = as.Date(
              case_when(
                (!is.na(enddate_dt1) & is.na(enddate_dt2)) | overlap_type == 1 ~ enddate_dt1,
                !is.na(enddate_dt2) & is.na(enddate_dt1) ~ enddate_dt2), origin = "1970-01-01"),
            # Now look at overlapping rows and rows completely contained within the other data's dates
            startdate_c = as.Date(
              case_when(
                overlap_type %in% c(2, 4) & rownum_temp == 1 ~ startdate_dt1,
                overlap_type %in% c(3, 5) & rownum_temp == 1 ~ startdate_dt2,
                overlap_type %in% c(2:5) & rownum_temp == 2 ~ startdate_o,
                overlap_type %in% c(2:5) & rownum_temp == 3 ~ enddate_o + 1,
                TRUE ~ startdate_c), origin = "1970-01-01"),
            enddate_c = as.Date(
              case_when(
                overlap_type %in% c(2:5) & rownum_temp == 1 ~ lead(startdate_o, 1) - 1,
                overlap_type %in% c(2:5) & rownum_temp == 2 ~ enddate_o,
                overlap_type %in% c(2, 5) & rownum_temp == 3 ~ enddate_dt2,
                overlap_type %in% c(3, 4) & rownum_temp == 3 ~ enddate_dt1,
                TRUE ~ enddate_c), origin = "1970-01-01"),
            # Deal with the last line for each person if it's part of an overlap
            startdate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
                                           overlap_type %in% c(2:5) & 
                                           enddate_dt1 != enddate_dt2, 
                                         lag(enddate_o, 1) + 1, 
                                         startdate_c), origin = "1970-01-01"),
            enddate_c = as.Date(ifelse((id != lead(id, 1) | is.na(lead(id, 1))) &
                                         overlap_type %in% c(2:5), 
                                       pmax(enddate_dt1, enddate_dt2, na.rm = TRUE), 
                                       enddate_c), origin = "1970-01-01")
          ) %>%
          arrange(id, startdate_c, enddate_c, startdate_dt1, startdate_dt2, 
                  enddate_dt1, enddate_dt2, overlap_type) %>%
          mutate(
            # Identify which type of enrollment this row represents
            enroll_type = 
              case_when(
                (overlap_type == 2 & rownum_temp == 1) | 
                  (overlap_type == 3 & rownum_temp == 3) |
                  (overlap_type == 6 & rownum_temp == 1) | 
                  (overlap_type == 7 & rownum_temp == 2) |
                  (overlap_type == 4 & rownum_temp %in% c(1, 3)) |
                  (overlap_type == 0 & is.na(startdate_dt2)) ~ "dt1",
                (overlap_type == 3 & rownum_temp == 1) | 
                  (overlap_type == 2 & rownum_temp == 3) |
                  (overlap_type == 6 & rownum_temp == 2) | 
                  (overlap_type == 7 & rownum_temp == 1) | 
                  (overlap_type == 5 & rownum_temp %in% c(1, 3)) |
                  (overlap_type == 0 & is.na(startdate_dt1)) ~ "dt2",
                overlap_type == 1 | (overlap_type %in% c(2:5) & rownum_temp == 2) ~ "both",
                TRUE ~ "x"
              ),
            # Drop rows from enroll_type == h/m when they are fully covered by an enroll_type == b
            drop = 
              case_when(
                id == lag(id, 1) & !is.na(lag(id, 1)) & 
                  startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
                  enddate_c >= lag(enddate_c, 1) & !is.na(lag(enddate_c, 1)) & 
                  # Fix up quirk from dt1 data where two rows present for the same day
                  !(lag(enroll_type, 1) != "dt2" & lag(enddate_dt1, 1) == lag(startdate_dt1, 1)) &
                  enroll_type != "both" ~ 1,
                id == lead(id, 1) & !is.na(lead(id, 1)) & 
                  startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
                  enddate_c <= lead(enddate_c, 1) & !is.na(lead(enddate_c, 1)) & 
                  # Fix up quirk from dt1 data where two rows present for the same day
                  !(lead(enroll_type, 1) != "dt2" & lead(enddate_dt1, 1) == lead(startdate_dt1, 1)) &
                  enroll_type != "both" & lead(enroll_type, 1) == "both" ~ 1,
                # Fix up other oddities when the date range is only one day
                id == lag(id, 1) & !is.na(lag(id, 1)) & 
                  startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
                  startdate_c == enddate_c & !is.na(startdate_c) & 
                  ((enroll_type == "dt2" & lag(enroll_type, 1) %in% c("both", "dt1")) |
                     (enroll_type == "dt1" & lag(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
                id == lag(id, 1) & !is.na(lag(id, 1)) & 
                  startdate_c == lag(startdate_c, 1) & !is.na(lag(startdate_c, 1)) &
                  startdate_c == enddate_c & !is.na(startdate_c) &
                  startdate_dt1 == lag(startdate_dt1, 1) & enddate_dt1 == lag(enddate_dt1, 1) &
                  !is.na(startdate_dt1) & !is.na(lag(startdate_dt1, 1)) &
                  enroll_type != "both" ~ 1,
                id == lead(id, 1) & !is.na(lead(id, 1)) & 
                  startdate_c == lead(startdate_c, 1) & !is.na(lead(startdate_c, 1)) &
                  startdate_c == enddate_c & !is.na(startdate_c) &
                  ((enroll_type == "dt2" & lead(enroll_type, 1) %in% c("both", "dt1")) |
                     (enroll_type == "dt1" & lead(enroll_type, 1) %in% c("both", "dt2"))) ~ 1,
                # Drop rows where the enddate_c < startdate_c due to 
                # both data sources' dates ending at the same time
                enddate_c < startdate_c ~ 1,
                TRUE ~ 0
              )
          ) %>%
          filter(drop == 0 | is.na(drop)) %>%
          # Truncate remaining overlapping end dates
          mutate(enddate_c = as.Date(
            ifelse(id == lead(id, 1) & !is.na(lead(startdate_c, 1)) &
                     startdate_c < lead(startdate_c, 1) &
                     enddate_c >= lead(enddate_c, 1),
                   lead(startdate_c, 1) - 1,
                   enddate_c),
            origin = "1970-01-01")
          ) %>%
          select(-drop, -repnum, -rownum_temp) %>%
          # With rows truncated, now additional rows with enroll_type == h/m that 
          # are fully covered by an enroll_type == b
          # Also catches single day rows that now have enddate < startdate
          mutate(
            drop = case_when(
              id == lag(id, 1) & startdate_c == lag(startdate_c, 1) &
                enddate_c == lag(enddate_c, 1) & lag(enroll_type, 1) == "both" & 
                enroll_type != "both" ~ 1,
              id == lead(id, 1) & startdate_c == lead(startdate_c, 1) &
                enddate_c <= lead(enddate_c, 1) & lead(enroll_type, 1) == "both" ~ 1,
              id == lag(id, 1) & startdate_c >= lag(startdate_c, 1) &
                enddate_c <= lag(enddate_c, 1) & enroll_type != "both" &
                lag(enroll_type, 1) == "both" ~ 1,
              id == lead(id, 1) & startdate_c >= lead(startdate_c, 1) &
                enddate_c <= lead(enddate_c, 1) & enroll_type != "both" &
                lead(enroll_type, 1) == "both" ~ 1,
              TRUE ~ 0)
          ) %>%
          filter(drop == 0 | is.na(drop)) %>%
          select(id, startdate_c, enddate_c, enroll_type)
    
    库(data.table)
    图书馆(tidyr)
    #创建测试数据----
    dt1%
    不同(id,起始日期dt1,结束日期dt1,起始日期dt2,结束日期dt2,起始日期o,
    enddate\u o,重叠类型,rownum\u temp,.keep\u all=TRUE)%>%
    #如果开始日期相同或dt1仅为一天,则删除第一行
    过滤器(!(重叠类型%c中的%c(2:5)&rownum\u temp==1&
    (startdate_dt1==startdate_dt2 | startdate_dt1==enddate_dt1)))%>%
    #如果结束日期相同,请删除第三行
    过滤器(!(重叠类型%in%c(2:5)&rownum\u temp==3&enddate\u dt1==enddate\u dt2))
    ##计算最终确定的日期----
    ###计算最终确定的日期列
    温度外部%
    #设置组合日期
    变异(
    #从只有dt1或dt2的行开始,或者当两组日期相同时开始
    开始日期=截止日期(
    什么时候(
    (!is.na(startdate_dt1)和is.na(startdate_dt2))|重叠类型==1~startdate_dt1,
    !is.na(startdate_dt2)&is.na(startdate_dt1)~startdate_dt2),origin=“1970-01-01”),
    enddate_c=as.Date(
    什么时候(
    (!is.na(enddate_dt1)和is.na(enddate_dt2))|重叠类型==1~enddate_dt1,
    !is.na(enddate\u dt2)&is.na(enddate\u dt1)~enddate\u dt2),origin=“1970-01-01”),
    #现在看看重叠的行和完全包含在其他数据日期中的行
    开始日期=截止日期(
    什么时候(
    %c(2,4)中的重叠类型%&rownum\u temp==1~起始日期\u dt1,
    %c(3,5)中的重叠类型%&rownum\u temp==1~起始日期\u dt2,
    重叠类型%c(2:5)&rownum\u temp==2~开始日期,
    %c(2:5)中的重叠类型%s和rownum\u temp==3~enddate\u o+1,
    正确~startdate_c),origin=“1970-01-01”),
    enddate_c=as.Date(
    什么时候(
    重叠类型%c(2:5)&rownum\u temp==1~导程(起始日期,1)-1,
    %c(2:5)中的重叠类型%s&rownum\u temp==2~enddate\u o,
    %c(2,5)中的重叠类型%s&rownum\u temp==3~enddate\u dt2,
    %c(3,4)中的重叠类型%&rownum\u temp==3~enddate\u dt1,
    TRUE~enddate_c),origin=“1970-01-01”),
    #处理每个人的最后一行,如果它是重叠的一部分
    startdate_c=截止日期(如果其他((id!=lead(id,1)|是.na(lead(id,1)))&
    重叠类型%c(2:5)和
    enddate\u dt1!=enddate\u dt2,
    滞后(结束日期,1)+1,
    起始日期c),origin=“1970-01-01”),
    
    # load packages ----
      library(data.table)    
    
    # create data ----
      rm(list=ls())
      dt1<- data.table(id = rep(1111, 4),
                         from_date = as.Date(c("2016-01-01", "2016-03-31","2016-09-02", "2016-09-03")), 
                         to_date = as.Date(c("2016-03-15", "2016-09-01", "2016-09-02", "2016-12-15")), 
                         progs = c("a1", "b1", "c1", "d1"))
      setkey(dt1, id, from_date, to_date)    
    
      dt2<- data.table(id = rep(1111, 4),
                         from_date = as.Date(c("2016-02-01", "2016-04-01","2016-11-01", "2016-12-01")), 
                         to_date = as.Date(c("2016-02-28", "2016-09-30", "2016-11-30", "2016-12-31")), 
                         progs = c("a2", "b2", "c2", "d2"))
      setkey(dt2, id, from_date, to_date)        
    
    
    
    # Create table with 'intervals' of 1 day duration ----
      dt <- rbind(dt1[,1:3], dt2[,1:3])
      dt[, reps := (to_date - from_date) + 1] # identify the number of days per interval (add one because dates are inclusive)
      dt <- dt[rep(1:.N,reps)] # replicate each row to make 1 row per day of each interval
      dt[,counter:=(1:.N-1),by=c("id", "from_date")] # add a counter (aka index number) for each from date per id
      dt[, c("from_date", "to_date") := from_date + counter] # create intervals of 1 day
      dt[, c("reps", "counter") := NULL] # drop columns no longer needed
      dt <- unique(dt) # de-duplicate rows so each day only appears once
      setkey(dt, id, from_date)    
    
    # merge on dt1 ----
        dt <- foverlaps(x=dt, y=dt1, 
                        by.x = c("id", "from_date", "to_date"), 
                        by.y = c("id", "from_date", "to_date"), 
                        type = "any", mult = "all")
        dt <- dt[, c("from_date", "to_date") := NULL] # drop intervals from dt1 because will use the intervals from dt for merging on dt2 below
        setnames(dt, c("i.from_date", "i.to_date"), c("from_date", "to_date") )
        setcolorder(dt, c("id", "from_date", "to_date"))
        setkey(dt, id, from_date, to_date)    
    
    # merge on dt2 ----
        dt <- foverlaps(x=dt, y=dt2, 
                        by.x = c("id", "from_date", "to_date"), 
                        by.y = c("id", "from_date", "to_date"), 
                        type = "any", mult = "all")
        dt <- dt[, c("from_date", "to_date") := NULL] # drop intervals from dt2 because will use the intervals from dt for merging on dt2 below
        setnames(dt, c("i.from_date", "i.to_date"), c("from_date", "to_date") )
        setcolorder(dt, c("id", "from_date", "to_date"))
        setkey(dt, id, from_date, to_date)       
    
    # Collapse data if dates are contiguous and data are the same ----
        # Create unique ID for data chunks ----
        setnames(dt, c("i.progs", "progs"), c("progs1", "progs2"))
        dt[, group := .GRP, by = c("id", "progs1", "progs2")] # create group id
        dt[, group := cumsum( c(0, diff(group)!=0) )] # in situation like a:a:a:b:b:b:b:a:a:a, want to distinguish first set of "a" from second set of "a"    
    
        # Create unique ID for contiguous times within a given data chunk ----
        setkey(dt, id, from_date)
        dt[, prev_to_date := c(NA, to_date[-.N]), by = "group"]
        dt[, diff.prev := from_date - prev_to_date] # difference between from_date & prev_to_date will be 1 (day) if they are contiguous
        dt[diff.prev != 1, diff.prev := NA] # set to NA if difference is not 1 day, i.e., it is not contiguous, i.e., it starts a new contiguous chunk
        dt[is.na(diff.prev), contig.id := .I] # Give a unique number for each start of a new contiguous chunk (i.e., section starts with NA)
        setkey(dt, group, from_date) # need to order the data so the following line will work.
        dt[, contig.id  := contig.id[1], by=  .( group , cumsum(!is.na(contig.id))) ] # fill forward by group
        dt[, c("prev_to_date", "diff.prev") := NULL] # drop columns that were just intermediates    
    
        # Collapse rows where data chunks are constant and time is contiguous ----      
        dt[, from_date := min(from_date), by = c("group", "contig.id")]
        dt[, to_date := max(to_date), by = c("group", "contig.id")]
        dt[, c("group", "contig.id") := NULL]
        dt <- unique(dt)