使用dplyr根据R中的类型和滚动日期进行计数和标记
我的问题类似于,我已经将其用作参考,但没有成功地将其运用到我需要做的事情中 我有如下数据:使用dplyr根据R中的类型和滚动日期进行计数和标记,r,dplyr,R,Dplyr,我的问题类似于,我已经将其用作参考,但没有成功地将其运用到我需要做的事情中 我有如下数据: a <- data.table("TYPE" = c("A", "A", "B", "B", "C", "C", "C", "C", "D", "D", "D", "D"), "DATE" = c("4/20/2018 11:47", "
a <- data.table("TYPE" = c("A", "A", "B", "B",
"C", "C", "C", "C",
"D", "D", "D", "D"),
"DATE" = c("4/20/2018 11:47",
"4/25/2018 7:21",
"4/15/2018 6:11",
"4/19/2018 4:22",
"4/15/2018 17:46",
"4/16/2018 11:59",
"4/20/2018 7:50",
"4/26/2018 2:55",
"4/27/2018 11:46",
"4/27/2018 13:03",
"4/20/2018 7:31",
"4/22/2018 9:45"),
"CLASS" = c(1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4))
现在我尝试使用TYPE
列和YMD
列来生成一个新列。以下是我试图达到的标准:1) 维护原始数据集中的所有列
2) 创建一个名为say
EVENTS
3) 对于每个
类型
,如果在30天内发生的次数超过n次
,则在事件
列中为每个类型
和YMD
添加Y
,否则。(注意这是针对n
唯一日期,因此必须在30天内有n
唯一天数才能符合条件)
如果n=4
,这将是预期的输出:
这是我的一个示例的结尾,但它不考虑唯一天数,也不保留表中的所有列:
a %>% mutate(DATE = as.POSIXct(DATE, format = "%m/%d/%Y %H:%M")) %>%
inner_join(.,., by="TYPE") %>%
group_by(TYPE, DATE.x) %>%
summarise(FLAG = as.integer(sum(abs((DATE.x-DATE.y)/(24*60*60))<=30)>=4))
新的更新预期输出为:
以下是dplyr的解决方案: 基于OP编辑的更新
library(dplyr)
library(lubridate)
a <- data.frame("TYPE" = c("A", "A", "B", "B",
"C", "C", "C", "C",
"D", "D", "D", "D",
"D", "D", "D", "D"),
"DATE" = c("4/20/2018 11:47",
"4/25/2018 7:21",
"4/15/2018 6:11",
"4/19/2018 4:22",
"4/15/2018 17:46",
"4/16/2018 11:59",
"4/20/2018 7:50",
"4/26/2018 2:55",
"4/27/2018 11:46",
"4/27/2018 13:03",
"4/20/2018 7:31",
"4/22/2018 9:45",
"6/01/2018 9:07",
"6/03/2018 12:34",
"6/07/2018 1:57",
"6/10/2018 2:22"),
"CLASS" = c(1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4))
# a function to flag rows that are 4th or more within window w
count_window <- function(df, date, w, type){
min_date <- date - w
df2 <- df %>% filter(TYPE == type, YMD >= min_date, YMD <= date)
out <- n_distinct(df2$YMD)
res <- ifelse(out >= 4, 1, 0)
return(res)
}
v_count_window <- Vectorize(count_window, vectorize.args = c("date","type"))
res <- a %>% mutate(DATE = as.POSIXct(DATE, format = "%m/%d/%Y %H:%M")) %>%
mutate(YMD = date(DATE)) %>%
arrange(TYPE, YMD) %>%
#group_by(TYPE) %>%
mutate(min_date = YMD - 30,
count = v_count_window(., YMD, 30, TYPE)) %>%
group_by(TYPE) %>%
mutate(FLAG = case_when(
any(count == 1) & YMD >= min_date[match(1,count)] ~ 1,
TRUE ~ 0
))%>%
select(nms,FLAG)
库(dplyr)
图书馆(lubridate)
a%
分组依据(类型)%>%
变异(标志=情况)(
任意(计数==1)&YMD>=最小日期[匹配(1,计数)]~1,
真~0
))%>%
选择(nms,标志)
我不知道如何在自定义函数中使用组,所以我将按类型筛选硬编码到函数中 使用
数据。表如下所示:
a[,DATE:=as.Date(a$DATE,format="%m/%d/%Y %H:%M")]
a <- a[order(TYPE, DATE), ]
fun1 <- function(x,n){ #Creating a function for any n
x[,.(DATE,CLASS, EVENTS=if((max(DATE)-min(DATE))<=30 #first condition
& (length(unique(DATE)))>=n) #second condition
1 else 0),by=TYPE]
}
fun1(a,4)
TYPE DATE CLASS EVENTS
1: A 2018-04-20 1 0
2: A 2018-04-25 2 0
3: B 2018-04-15 3 0
4: B 2018-04-19 4 0
5: C 2018-04-15 1 1
6: C 2018-04-16 2 1
7: C 2018-04-20 3 1
8: C 2018-04-26 4 1
9: D 2018-04-20 3 0
10: D 2018-04-22 4 0
11: D 2018-04-27 1 0
12: D 2018-04-27 2 0
a[,DATE:=as.DATE(a$DATE,format=“%m/%d/%Y%H:%m”)]
a是否有从原始数据集中选择所有列的快捷方式a
?(这样你就不必把它们都打出来了)?你可以做一些像nms这样的事情,不要小气,但如果它起作用,你能接受我的答案吗?除非您希望获得data.table版本?@KAS基于您的情况,否则您在“30天内”(D类中发生的时间范围为51天)2018年6月1日、2018年3月6日、2018年7月6日的D类事件不会超过4次,2018年6月10日符合30天滚动窗口中的1套,因为有4个不同的日期都发生在30天内。基本上,它需要从第一个日期开始,检查“从该日期起30天内是否有4个日期?是或否”,然后移动到下一个日期。因此,滚动日期窗口为30天。
library(dplyr)
library(lubridate)
a <- data.frame("TYPE" = c("A", "A", "B", "B",
"C", "C", "C", "C",
"D", "D", "D", "D",
"D", "D", "D", "D"),
"DATE" = c("4/20/2018 11:47",
"4/25/2018 7:21",
"4/15/2018 6:11",
"4/19/2018 4:22",
"4/15/2018 17:46",
"4/16/2018 11:59",
"4/20/2018 7:50",
"4/26/2018 2:55",
"4/27/2018 11:46",
"4/27/2018 13:03",
"4/20/2018 7:31",
"4/22/2018 9:45",
"6/01/2018 9:07",
"6/03/2018 12:34",
"6/07/2018 1:57",
"6/10/2018 2:22"),
"CLASS" = c(1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4,
1, 2, 3, 4))
# a function to flag rows that are 4th or more within window w
count_window <- function(df, date, w, type){
min_date <- date - w
df2 <- df %>% filter(TYPE == type, YMD >= min_date, YMD <= date)
out <- n_distinct(df2$YMD)
res <- ifelse(out >= 4, 1, 0)
return(res)
}
v_count_window <- Vectorize(count_window, vectorize.args = c("date","type"))
res <- a %>% mutate(DATE = as.POSIXct(DATE, format = "%m/%d/%Y %H:%M")) %>%
mutate(YMD = date(DATE)) %>%
arrange(TYPE, YMD) %>%
#group_by(TYPE) %>%
mutate(min_date = YMD - 30,
count = v_count_window(., YMD, 30, TYPE)) %>%
group_by(TYPE) %>%
mutate(FLAG = case_when(
any(count == 1) & YMD >= min_date[match(1,count)] ~ 1,
TRUE ~ 0
))%>%
select(nms,FLAG)
a[,DATE:=as.Date(a$DATE,format="%m/%d/%Y %H:%M")]
a <- a[order(TYPE, DATE), ]
fun1 <- function(x,n){ #Creating a function for any n
x[,.(DATE,CLASS, EVENTS=if((max(DATE)-min(DATE))<=30 #first condition
& (length(unique(DATE)))>=n) #second condition
1 else 0),by=TYPE]
}
fun1(a,4)
TYPE DATE CLASS EVENTS
1: A 2018-04-20 1 0
2: A 2018-04-25 2 0
3: B 2018-04-15 3 0
4: B 2018-04-19 4 0
5: C 2018-04-15 1 1
6: C 2018-04-16 2 1
7: C 2018-04-20 3 1
8: C 2018-04-26 4 1
9: D 2018-04-20 3 0
10: D 2018-04-22 4 0
11: D 2018-04-27 1 0
12: D 2018-04-27 2 0