data.table R:在特定时间段内发生两个事件的实例

data.table R:在特定时间段内发生两个事件的瞬间,r,join,data.table,R,Join,Data.table,我需要确定在特定时间段内发生两个事件的实例,如下所示。如果先发生事件a,则事件B必须在24小时内发生。另一方面,如果B首先出现,则需要在72小时内找到A。此外,当满足标准时,我需要“开始”时间,即第一个事件发生的时间 事件A structure(list(fake_id = c("1000686267", "1000686267", "1000686267", "1000686267", "1000686267", "1000686267", "1000686267", "1070640921

我需要找出两个事件在特定时间段内相继发生的实例,规则如下:如果事件A先发生,则事件B必须在其后24小时内发生;反之,如果事件B先发生,则事件A必须在其后72小时内发生。此外,当满足上述条件时,我还需要“开始”时间,即先发生的那个事件的时间。

事件A

# Sample data for event A, given as a `structure()` literal: an 18-row
# data.frame with a character `fake_id` column and a POSIXct `date` column
# (time zone UTC). Presumably this is the object the code below calls
# `event_a`; the literal itself is not assigned to a name.
structure(list(fake_id = c("1000686267", "1000686267", "1000686267", 
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921", 
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921", 
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414"
), date = structure(c(1515063600, 1514822400, 1514822400, 1514822400, 
1514822400, 1515146400, 1514901600, 1515330000, 1514822400, 1514822400, 
1514822400, 1514822400, 1517385600, 1516701600, 1515142800, 1515178800, 
1515178800, 1516557600), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA, 
-18L), class = "data.frame", .Names = c("fake_id", 
"date"))
事件B

# Sample data for event B, given as a `structure()` literal: a 30-row
# data.frame with a character `fake_id` column and a POSIXct `date` column
# (time zone UTC). Presumably this is the object the code below calls
# `event_b`; the literal itself is not assigned to a name.
structure(list(fake_id = c("1000686267", "1000686267", "1000686267", 
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267", 
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267", 
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921", 
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921", 
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414", 
"1184695414", "1184695414"), date = structure(c(1516795200, 1516795200, 
1516795200, 1516917600, 1517400000, 1517400000, 1515492000, 1515492000, 
1516190400, 1516190400, 1517410800, 1517410800, 1516921200, 1515070800, 
1515070800, 1515052800, 1516633200, 1517374800, 1515322800, 1515322800, 
1516525200, 1515232800, 1516543200, 1516550400, 1515189600, 1516543200, 
1516543200, 1515142800, 1515142800, 1515142800), class = c("POSIXct", 
"POSIXt"), tzone = "UTC")), row.names = c(NA, -30L), class = "data.frame", .Names = c("fake_id", 
"date"))
一些代码


 library(data.table)

 # Convert to data.table *before* selecting columns: `with = FALSE` is a
 # data.table argument, and the sample inputs are plain data.frames, for
 # which `[.data.frame` rejects it as an unused argument.
 event_a <- as.data.table(event_a)[, c("fake_id", "date"), with = FALSE]
 event_b <- as.data.table(event_b)[, c("fake_id", "date"), with = FALSE]

 # Constant marker columns, one per source table (dropped again below).
 event_a[, criteria_a := "criteria_a"]
 event_b[, criteria_b := "criteria_b"]

 # Key both tables on id + timestamp so the joins below can roll on `date`.
 setkeyv(event_a, c("fake_id", "date"))
 setkeyv(event_b, c("fake_id", "date"))

 # Rolling windows in seconds: 24 hours and 72 hours.
 join_window <- 60 * 60 * c(24, 72)

 # Rolling joins on the keys: each row of the inner table is matched to the
 # most recent row of the outer table at most `roll` seconds earlier.
 # NOTE(review): with the default `nomatch = NA` every row of the inner table
 # is kept even when nothing falls inside the window, so the windows do not
 # filter anything here; add `nomatch = 0L` if only matched rows are wanted.
 # Confirm against the intended output.
 event_subset_a <- event_a[event_b, roll = join_window[1]]
 event_subset_b <- event_b[event_a, roll = join_window[2]]

 # The two results hold the marker columns in a different order, so bind by
 # column name instead of by position.
 event_df <- rbind(event_subset_a, event_subset_b, use.names = TRUE)
 event_df[, c("criteria_a", "criteria_b") := NULL]

 # Sort by id + timestamp and drop duplicated (fake_id, date) rows.
 setkeyv(event_df, c("fake_id", "date"))
 event_df <- unique(event_df)

所需输出

      fake_id                date
1  1184695414 2018-01-05 09:00:00
2  1184695414 2018-01-05 19:00:00
3  1184695414 2018-01-05 22:00:00
4  1184695414 2018-01-21 14:00:00
5  1184695414 2018-01-21 16:00:00
6  1184695414 2018-01-21 18:00:00
7  1184695414 2018-01-23 10:00:00
      fake_id                date
1  1184695414 2018-01-05 09:00:00
2  1184695414 2018-01-21 14:00:00
3  1184695414 2018-01-23 10:00:00

起初我认为这个问题需要用非等值联接(non-equi join)来解决,但后来我意识到一个标准联接就足够了。

整个过程如下:

  • 消除重复的行
  • 连接两个表
  • 筛选出事件A先发生的情况,将其标记为“A型”,并确定开始时间(onset)
  • 筛选出事件B先发生的情况,将其标记为“B型”,并确定开始时间(onset)
  • 删除未标记的行
  • 修改列的名称,以便更容易理解哪些列属于哪个表

    消除重复的行 eventA

    这与 @PavoDive 的回答类似,但重点是在实际联接之前先创建非等值联接(non-equi join)的条件:

    library(data.table)
    setDT(event_a)
    setDT(event_b)

    # Matching window for the join: an event B pairs with an event A when it
    # lies strictly between 72 hours before A (B came first) and 24 hours
    # after A (A came first). POSIXct arithmetic is done in seconds.
    event_a[, `:=`(min_date = date - 72 * 60 * 60,
                   max_date = date + 24 * 60 * 60)]

    # Non-equi join of the de-duplicated tables: x = event B, i = event A.
    unique(event_b)[unique(event_a),
                    on = .(fake_id = fake_id,
                           date > min_date,
                           date < max_date),
                    # inner join: drop A events with no B inside the window
                    nomatch = 0L,
                    # spelled out because `T` is an ordinary, reassignable variable
                    allow.cartesian = TRUE,
                    # For the desired output, keep only fake_id and onset.
                    j = .(fake_id,
                          a_date = i.date,
                          b_date = x.date,
                          # onset is the timestamp of whichever event came first
                          onset = pmin(i.date, x.date),
                          # NA when both events share the same timestamp
                          first_type = ifelse(i.date == x.date,
                                              NA_character_,
                                              ifelse(i.date < x.date,
                                                     "A",
                                                     "B"))
                          )
                    ]
    
           fake_id              a_date              b_date               onset first_type
     1: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 2018-01-04 11:00:00          A
     2: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00          B
     3: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 2018-01-04 13:00:00          B
     4: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00          B
     5: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 2018-01-07 11:00:00          B
     6: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 2018-01-06 10:00:00          B
     7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 2018-01-31 05:00:00          B
     8: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00          B
     9: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00          B
    10: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 2018-01-05 09:00:00          A
    11: 1184695414 2018-01-05 09:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00       <NA>
    12: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 2018-01-05 19:00:00          A
    13: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00          B
    14: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00          B
    15: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00          B
    
    library(data.table)
    setDT(event_a)
    setDT(event_b)
    # 用于联接:事件B需要落在事件A之前72小时到之后24小时的范围内
    event_a[, `:=`(min_date = date - 72*60*60,
                   max_date = date + 24*60*60)]
    # 联接去重后的 data.table
    unique(event_b)[unique(event_a),
                    # 非等值联接条件
                    on = .(fake_id = fake_id,
                           date > min_date,
                           date < max_date),
                    nomatch = 0L,
                    allow.cartesian = TRUE,
                    # 选择列:若只需要所需输出,保留 fake_id 和 onset 即可
                    j = .(fake_id,
                          a_date = i.date,
                          b_date = x.date,
                          onset = pmin(i.date, x.date),
                          first_type = ifelse(i.date == x.date,
                                              NA_character_,
                                              ifelse(i.date < x.date,
                                                     'A',
                                                     'B'))
                          )
                    ]


    输出的不同之处在于第11行:该行中两个事件的时间相同。我的联接条件无法排除这种情况,因为 data.table 的非等值联接目前不支持“不等于”(!=)运算符。

    为什么要将 data.table 和 dplyr 语法混合在一起?
    如果您能显示预期的输出,那将非常有用。
    我现在不在电脑旁,但最近我对一个类似的问题给出了答案。您可能想看看它,看看是否可以应用其中的一些函数,特别是 data.table::foverlaps 和非等值联接(non-equi join)。
    @毛里塔尼亚,对不起!作为dplyr的忠实粉丝,这是习惯使然。@RonakShah,希望这能有所帮助。谢谢
    # Rename the timestamp columns (by reference) so that both survive the
    # join as separate columns.
    setnames(eventA, "date", "dateA")
    setnames(eventB, "date", "dateB")

    # Drop exact duplicate rows before pairing the events.
    eventA <- eventA[!duplicated(eventA), ]
    eventB <- eventB[!duplicated(eventB), ]

    # Pair every A event with every B event of the same id, then label pairs.
    # `on = "fake_id"` names the join column explicitly, so the join does not
    # depend on keys set elsewhere. The windows are written in seconds
    # (POSIXct arithmetic), which avoids needing lubridate for `dhours()`.
    eventA[eventB,
           on = "fake_id",
           allow.cartesian = TRUE][
              # A first, B no more than 24 hours later
              dateA < dateB & dateB <= dateA + 24 * 60 * 60,
              `:=` (type = "A",
                    onset = dateA)][
                        # B first, A no more than 72 hours later
                        dateB < dateA & dateA <= dateB + 72 * 60 * 60,
                        `:=` (type = "B",
                              onset = dateB)][
                                  # keep only the pairs labelled above
                                  !is.na(type), ][]
    
           fake_id               dateA               dateB type               onset
     1: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00    B 2018-01-04 08:00:00
     2: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00    B 2018-01-04 08:00:00
     3: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00    A 2018-01-04 11:00:00
     4: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00    B 2018-01-04 13:00:00
     5: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00    B 2018-01-06 10:00:00
     6: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00    B 2018-01-07 11:00:00
     7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00    B 2018-01-31 05:00:00
     8: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00    B 2018-01-05 09:00:00
     9: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00    A 2018-01-05 09:00:00
    10: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00    A 2018-01-05 19:00:00
    11: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00    B 2018-01-21 14:00:00
    12: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00    B 2018-01-21 14:00:00
    13: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00    B 2018-01-21 16:00:00
    14: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00    B 2018-01-21 16:00:00
    
    library(data.table)
    setDT(event_a)
    setDT(event_b)

    # Matching window for the join: an event B pairs with an event A when it
    # lies strictly between 72 hours before A (B came first) and 24 hours
    # after A (A came first). POSIXct arithmetic is done in seconds.
    event_a[, `:=`(min_date = date - 72 * 60 * 60,
                   max_date = date + 24 * 60 * 60)]

    # Non-equi join of the de-duplicated tables: x = event B, i = event A.
    unique(event_b)[unique(event_a),
                    on = .(fake_id = fake_id,
                           date > min_date,
                           date < max_date),
                    # inner join: drop A events with no B inside the window
                    nomatch = 0L,
                    # spelled out because `T` is an ordinary, reassignable variable
                    allow.cartesian = TRUE,
                    # For the desired output, keep only fake_id and onset.
                    j = .(fake_id,
                          a_date = i.date,
                          b_date = x.date,
                          # onset is the timestamp of whichever event came first
                          onset = pmin(i.date, x.date),
                          # NA when both events share the same timestamp
                          first_type = ifelse(i.date == x.date,
                                              NA_character_,
                                              ifelse(i.date < x.date,
                                                     "A",
                                                     "B"))
                          )
                    ]
    
           fake_id              a_date              b_date               onset first_type
     1: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 2018-01-04 11:00:00          A
     2: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00          B
     3: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 2018-01-04 13:00:00          B
     4: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00          B
     5: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 2018-01-07 11:00:00          B
     6: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 2018-01-06 10:00:00          B
     7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 2018-01-31 05:00:00          B
     8: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00          B
     9: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00          B
    10: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 2018-01-05 09:00:00          A
    11: 1184695414 2018-01-05 09:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00       <NA>
    12: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 2018-01-05 19:00:00          A
    13: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00          B
    14: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00          B
    15: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00          B