data.table R:查找在特定时间段内发生两个事件的实例
我需要确定在特定时间段内发生两个事件的实例,如下所示。如果事件A先发生,则事件B必须在24小时内发生。另一方面,如果事件B先发生,则事件A需要在72小时内发生。此外,当满足条件时,我需要“起始”(onset)时间,即第一个事件发生的时间。
标签:r, join, data.table
事件A
structure(list(fake_id = c("1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921",
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921",
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414"
), date = structure(c(1515063600, 1514822400, 1514822400, 1514822400,
1514822400, 1515146400, 1514901600, 1515330000, 1514822400, 1514822400,
1514822400, 1514822400, 1517385600, 1516701600, 1515142800, 1515178800,
1515178800, 1516557600), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-18L), class = "data.frame", .Names = c("fake_id",
"date"))
事件B
structure(list(fake_id = c("1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1000686267",
"1000686267", "1000686267", "1000686267", "1000686267", "1070640921",
"1070640921", "1070640921", "1070640921", "1070640921", "1070640921",
"1184695414", "1184695414", "1184695414", "1184695414", "1184695414",
"1184695414", "1184695414"), date = structure(c(1516795200, 1516795200,
1516795200, 1516917600, 1517400000, 1517400000, 1515492000, 1515492000,
1516190400, 1516190400, 1517410800, 1517410800, 1516921200, 1515070800,
1515070800, 1515052800, 1516633200, 1517374800, 1515322800, 1515322800,
1516525200, 1515232800, 1516543200, 1516550400, 1515189600, 1516543200,
1516543200, 1515142800, 1515142800, 1515142800), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -30L), class = "data.frame", .Names = c("fake_id",
"date"))
一些代码
library (data.table)
# Keep only the id and timestamp columns of each event table.
event_a <- data.table(event_a[, c("fake_id", "date"), with = FALSE])
event_b <- data.table(event_b[, c("fake_id", "date"), with = FALSE])

# Add a constant marker column to each table.
event_a[, criteria_a := "criteria_a"]
event_b[, criteria_b := "criteria_b"]

# Key both tables by id and then timestamp; the rolling joins below use
# these keys as the join columns.
setkey(event_a, fake_id, date)
setkey(event_b, fake_id, date)

# Rolling windows in seconds: 24 hours and 72 hours.
join_window <- c(24, 72) * 60 * 60

# Rolling joins in both directions, each limited to its own window.
event_subset_a <- event_a[event_b, roll = join_window[1]]
event_subset_b <- event_b[event_a, roll = join_window[2]]

# Stack the two results, drop the marker columns, and de-duplicate on
# (fake_id, date).
event_df <- rbind(event_subset_a, event_subset_b)
event_df[, c("criteria_a", "criteria_b") := NULL]
setkey(event_df, fake_id, date)
event_df <- unique(event_df)
所需输出
fake_id date
1 1184695414 2018-01-05 09:00:00
2 1184695414 2018-01-05 19:00:00
3 1184695414 2018-01-05 22:00:00
4 1184695414 2018-01-21 14:00:00
5 1184695414 2018-01-21 16:00:00
6 1184695414 2018-01-21 18:00:00
7 1184695414 2018-01-23 10:00:00
fake_id date
1 1184695414 2018-01-05 09:00:00
2 1184695414 2018-01-21 14:00:00
3 1184695414 2018-01-23 10:00:00
起初我认为这个问题需要用非等联接来解决,但后来我意识到一个标准联接就足够了。整个过程如下:
这类似于@PavoDive的答案,但重点是在实际联接之前先创建非等联接条件:
library (data.table)
setDT(event_a)
setDT(event_b)

# Search window around each A event, in seconds: a B event qualifies when it
# falls strictly between 72 hours before and 24 hours after the A event.
event_a[, `:=`(
  min_date = date - 72 * 60 * 60,
  max_date = date + 24 * 60 * 60
)]

# Non-equi join of the de-duplicated tables: for every A row (i), collect the
# B rows (x) of the same fake_id whose date lies inside that window.
unique(event_b)[
  unique(event_a),
  on = .(fake_id = fake_id, date > min_date, date < max_date),
  nomatch = 0L,            # drop A events with no B event in the window
  allow.cartesian = TRUE,  # one A event may pair with several B events
  # Only fake_id and onset are needed for the desired output; the remaining
  # columns are kept so each pairing is easy to inspect.
  j = .(
    fake_id,
    a_date = i.date,
    b_date = x.date,
    onset = pmin(i.date, x.date),  # time of whichever event came first
    first_type = ifelse(
      i.date == x.date,
      NA_character_,  # simultaneous events: neither one is first
      ifelse(i.date < x.date, "A", "B")
    )
  )
]
fake_id a_date b_date onset first_type
1: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 2018-01-04 11:00:00 A
2: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
3: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 2018-01-04 13:00:00 B
4: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
5: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 2018-01-07 11:00:00 B
6: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 2018-01-06 10:00:00 B
7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 2018-01-31 05:00:00 B
8: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
9: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B
10: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 2018-01-05 09:00:00 A
11: 1184695414 2018-01-05 09:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 <NA>
12: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 2018-01-05 19:00:00 A
13: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 B
14: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
15: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B
库(data.table)
setDT(事件a)
setDT(事件b)
#对于加入-eventB需要在-72到24小时内完成
事件a[,`:=`(最小日期=日期-72*60*60,
最大日期=日期+24*60*60]
#联接唯一数据表
唯一(事件_b)[唯一(事件_a),
#非等连接条件
on=(假身份证=假身份证,
日期>最小日期,
日期<最长日期),
nomatch=0升,
allow.cartesian=T,
#选择列-您将只包含所需输出的伪id和起始值
j=(假身份证,
a_date=i.date,
b_date=x.date,
起效时间=pmin(i.date,x.date),
first_type=ifelse(i.date==x.date,
NA_character_,
如果其他(i.日期
输出的不同之处在于第11行:两个事件的时间相同。我的联接条件没有捕获到这一点,因为data.table的非等联接当前不支持“不等于”条件。
为什么要将data.table和dplyr语法混合在一起?如果您能显示预期的输出,那将非常有用。我现在不在电脑旁,但最近我对一个类似的问题给出了答案。您可能想看看它,看看是否可以应用其中的一些函数,特别是data.table::foverlaps和非等联接。
@毛里塔尼亚,对不起!作为dplyr的忠实粉丝,这是习惯使然。@RonakShah,希望这能有所帮助。谢谢
# Give the two date columns distinct names so both survive the join.
setnames(eventA, "date", "dateA")
setnames(eventB, "date", "dateB")

# Remove exact duplicate rows so every event is paired only once.
eventA <- unique(eventA)
eventB <- unique(eventB)

# Pair each A event with each B event of the same fake_id. The join column is
# named explicitly so the join does not depend on keys set elsewhere.
eventA[eventB, on = "fake_id", allow.cartesian = TRUE][
  # A first: B must follow within 24 hours (POSIXct arithmetic is in seconds).
  dateA < dateB & dateB <= dateA + 24 * 60 * 60,
  `:=`(type = "A", onset = dateA)
][
  # B first: A must follow within 72 hours.
  dateB < dateA & dateA <= dateB + 72 * 60 * 60,
  `:=`(type = "B", onset = dateB)
][
  # Keep only the pairs that satisfied one of the two windows.
  !is.na(type)
][]
fake_id dateA dateB type onset
1: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 B 2018-01-04 08:00:00
2: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 B 2018-01-04 08:00:00
3: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 A 2018-01-04 11:00:00
4: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 B 2018-01-04 13:00:00
5: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 B 2018-01-06 10:00:00
6: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 B 2018-01-07 11:00:00
7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 B 2018-01-31 05:00:00
8: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 B 2018-01-05 09:00:00
9: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 A 2018-01-05 09:00:00
10: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 A 2018-01-05 19:00:00
11: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 B 2018-01-21 14:00:00
12: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 B 2018-01-21 14:00:00
13: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 B 2018-01-21 16:00:00
14: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 B 2018-01-21 16:00:00
library (data.table)
setDT(event_a)
setDT(event_b)

# Search window around each A event, in seconds: a B event qualifies when it
# falls strictly between 72 hours before and 24 hours after the A event.
event_a[, `:=`(
  min_date = date - 72 * 60 * 60,
  max_date = date + 24 * 60 * 60
)]

# Non-equi join of the de-duplicated tables: for every A row (i), collect the
# B rows (x) of the same fake_id whose date lies inside that window.
unique(event_b)[
  unique(event_a),
  on = .(fake_id = fake_id, date > min_date, date < max_date),
  nomatch = 0L,            # drop A events with no B event in the window
  allow.cartesian = TRUE,  # one A event may pair with several B events
  # Only fake_id and onset are needed for the desired output; the remaining
  # columns are kept so each pairing is easy to inspect.
  j = .(
    fake_id,
    a_date = i.date,
    b_date = x.date,
    onset = pmin(i.date, x.date),  # time of whichever event came first
    first_type = ifelse(
      i.date == x.date,
      NA_character_,  # simultaneous events: neither one is first
      ifelse(i.date < x.date, "A", "B")
    )
  )
]
fake_id a_date b_date onset first_type
1: 1000686267 2018-01-04 11:00:00 2018-01-04 13:00:00 2018-01-04 11:00:00 A
2: 1000686267 2018-01-04 11:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
3: 1000686267 2018-01-05 10:00:00 2018-01-04 13:00:00 2018-01-04 13:00:00 B
4: 1000686267 2018-01-05 10:00:00 2018-01-04 08:00:00 2018-01-04 08:00:00 B
5: 1070640921 2018-01-07 13:00:00 2018-01-07 11:00:00 2018-01-07 11:00:00 B
6: 1070640921 2018-01-07 13:00:00 2018-01-06 10:00:00 2018-01-06 10:00:00 B
7: 1070640921 2018-01-31 08:00:00 2018-01-31 05:00:00 2018-01-31 05:00:00 B
8: 1184695414 2018-01-23 10:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
9: 1184695414 2018-01-23 10:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B
10: 1184695414 2018-01-05 09:00:00 2018-01-05 22:00:00 2018-01-05 09:00:00 A
11: 1184695414 2018-01-05 09:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 <NA>
12: 1184695414 2018-01-05 19:00:00 2018-01-05 22:00:00 2018-01-05 19:00:00 A
13: 1184695414 2018-01-05 19:00:00 2018-01-05 09:00:00 2018-01-05 09:00:00 B
14: 1184695414 2018-01-21 18:00:00 2018-01-21 16:00:00 2018-01-21 16:00:00 B
15: 1184695414 2018-01-21 18:00:00 2018-01-21 14:00:00 2018-01-21 14:00:00 B