在 R 中使用 lag 跟踪某个属性最近一次出现的值
我想跟踪某一列中特定属性最近一次出现的值,但始终无法让 lag() 按预期工作。例如,在一个交易数据表中,我希望为所购商品的某个特定子集(本例中为电钻,即名称中含 "Drill" 的商品)保留一个持续向后延续的“最近一次”标记。示例数据如下:
# Sample transactions for one customer over six purchase dates; a drill is
# bought in the first and in the fifth transaction.
Transactions <- data.table(
  Customer = rep("A01", 6),
  PurchaseDate = c(
    "1/1/2018", "1/2/2018", "1/3/2018",
    "1/4/2018", "1/5/2018", "1/6/2018"
  ),
  Purchase = c(
    "DrillA, Bit10", "Bit11", "Bit20",
    "Bit21", "DrillZ, Bit4", "Bit6"
  )
)
我尝试的写法(使用 lag())似乎只对紧跟在包含 “DrillA”/“DrillZ” 的交易之后的那一笔交易有效,无法让这个“标记”继续向后延续。
有没有更好的实现方式?非常感谢您的帮助。 数据:
# Example data: a single customer with six transactions; drills appear in the
# first and fifth purchases, the rest are bits only.
Transactions <- data.frame(
  Customer = rep("A01", 6),
  PurchaseDate = c(
    "1/1/2018", "1/2/2018", "1/3/2018",
    "1/4/2018", "1/5/2018", "1/6/2018"
  ),
  Purchase = c(
    "DrillA, Bit10", "Bit11", "Bit20",
    "Bit21", "DrillZ, Bit4", "Bit6"
  )
)
选项 2(使用 dplyr,以及一个从字符串中提取电钻名称的函数)
library(dplyr)
library(zoo)

# Extract the drill item from each comma-separated purchase string.
#
# x: character vector (or factor) of purchases such as "DrillA, Bit10".
# Returns an unnamed character vector the same length as `x`: the first item
# containing "Drill" in each element, or NA when the element has no drill.
GetDrill <- function(x) {
  vapply(
    strsplit(as.character(x), split = ",", fixed = TRUE),
    function(items) {
      # Items after the first carry the space that follows the comma.
      items <- trimws(items)
      drills <- items[grepl("Drill", items, fixed = TRUE)]
      if (length(drills) > 0) drills[1] else NA_character_
    },
    character(1)
  )
}

Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # na.rm = FALSE keeps leading NAs, so the result always has one value per
  # row even when a customer's first transaction contains no drill.
  mutate(LastDrill = na.locf(GetDrill(Purchase), na.rm = FALSE)) %>%
  ungroup()
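# # A tibble: 6 x 4
#   Customer PurchaseDate Purchase      LastDrill
#   <fct>    <fct>        <fct>         <chr>
# 1 A01      1/1/2018     DrillA, Bit10 DrillA
# 2 A01      1/2/2018     Bit11         DrillA
# 3 A01      1/3/2018     Bit20         DrillA
# 4 A01      1/4/2018     Bit21         DrillA
# 5 A01      1/5/2018     DrillZ, Bit4  DrillZ
# 6 A01      1/6/2018     Bit6          DrillZ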
# Attempt from the question: label each transaction with the drill named in
# its Purchase string, and otherwise fall back to the previous row's LastDrill
# via lag(), per customer.
Transactions %>%
arrange(Customer, PurchaseDate) %>%
group_by(Customer) %>%
# Seed LastDrill with empty strings so the next mutate() can reference it.
mutate(LastDrill = "") %>%
# NOTE(review): lag(LastDrill) presumably reads the seeded column as it stood
# before this mutate(), not the values being computed here, so the fallback
# is not recursive and the label is not carried forward over several rows.
mutate(LastDrill = case_when(grepl("DrillA", Purchase) ~ "DrillA",
grepl("DrillZ", Purchase) ~ "DrillZ",
TRUE ~ lag(LastDrill, 1, order_by=PurchaseDate)))
# Six transactions for customer A01 on consecutive dates; only the first and
# fifth include a drill.
Transactions <- data.frame(
  Customer = rep("A01", times = 6),
  PurchaseDate = paste0("1/", 1:6, "/2018"),
  Purchase = c(
    "DrillA, Bit10",
    "Bit11",
    "Bit20",
    "Bit21",
    "DrillZ, Bit4",
    "Bit6"
  )
)
library(tidyverse)
# Label every transaction with the most recent drill the customer bought.
# The Purchase strings are split into one item per row, rows are grouped by a
# running count of drills seen so far, and the original one-row-per-transaction
# shape is rebuilt at the end.
# NOTE(review): PurchaseDate is sorted as text here; convert it to Date first
# if the data can contain two-digit days or months.
Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # Split on the comma only, so item names containing spaces or punctuation
  # stay intact.
  separate_rows(Purchase, sep = ",\\s*") %>%
  # flag increases by one at each drill, so rows that share a flag value
  # belong to the same "current drill" run.
  mutate(flag = cumsum(grepl("Drill", Purchase))) %>%
  group_by(flag, .add = TRUE) %>%
  # Within a run the first row is the drill itself. Rows before a customer's
  # first drill have flag == 0 and get NA rather than a non-drill item.
  mutate(LastDrill = if_else(flag > 0, first(Purchase), NA_character_)) %>%
  ungroup() %>%
  select(-flag) %>%
  group_by(Customer, PurchaseDate, LastDrill) %>%
  # Collapse the items back into one comma-separated string per transaction.
  summarise(Purchase = paste0(Purchase, collapse = ", "), .groups = "drop") %>%
  # Restore the original column order, with the new column last.
  select(Customer, PurchaseDate, Purchase, LastDrill)
# Customer PurchaseDate Purchase LastDrill
# 1 A01 1/1/2018 DrillA, Bit10 DrillA
# 2 A01 1/2/2018 Bit11 DrillA
# 3 A01 1/3/2018 Bit20 DrillA
# 4 A01 1/4/2018 Bit21 DrillA
# 5 A01 1/5/2018 DrillZ, Bit4 DrillZ
# 6 A01 1/6/2018 Bit6 DrillZ
library(dplyr)
library(zoo)
# Extract the drill item from each comma-separated purchase string.
#
# x: character vector (or factor) of purchases such as "DrillA, Bit10".
# Returns an unnamed character vector the same length as `x`: the first item
# containing "Drill" in each element, or NA when the element has no drill.
GetDrill <- function(x) {
  vapply(
    strsplit(as.character(x), split = ",", fixed = TRUE),
    function(items) {
      # Items after the first carry the space that follows the comma; trim it
      # so "Bit4, DrillZ" yields "DrillZ" rather than " DrillZ".
      items <- trimws(items)
      drills <- items[grepl("Drill", items, fixed = TRUE)]
      if (length(drills) > 0) drills[1] else NA_character_
    },
    character(1)
  )
}
# Per customer, extract the drill named in each transaction and carry the last
# observed drill forward onto the following drill-free transactions.
# NOTE(review): PurchaseDate is sorted as text here; convert it to Date first
# if the data can contain two-digit days or months.
Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # na.locf() drops leading NAs by default, which makes the result shorter
  # than the group and breaks mutate() whenever a customer's first transaction
  # has no drill. na.rm = FALSE keeps those rows as NA.
  mutate(
    LastDrill = na.locf(GetDrill(as.character(Purchase)), na.rm = FALSE)
  ) %>%
  ungroup()
# # A tibble: 6 x 4
# Customer PurchaseDate Purchase LastDrill
# <fct> <fct> <fct> <chr>
# 1 A01 1/1/2018 DrillA, Bit10 DrillA
# 2 A01 1/2/2018 Bit11 DrillA
# 3 A01 1/3/2018 Bit20 DrillA
# 4 A01 1/4/2018 Bit21 DrillA
# 5 A01 1/5/2018 DrillZ, Bit4 DrillZ
# 6 A01 1/6/2018 Bit6 DrillZ