在R中跟踪具有滞后的属性_R_Dplyr

在R中跟踪具有滞后的属性

在R中跟踪具有滞后的属性,r,dplyr,R,Dplyr,我试图跟踪列中某个特定属性的最后一个实例，但我似乎无法让滞后为我工作。例如，如果我有一个事务数据库，我希望保留购买的特定项目子集（在本例中为钻取）的最后一个运行标志，以便： Transactions <- data.table(Customer=c("A01","A01","A01","A01","A01","A01"), PurchaseDate=c("1/1/2018","1/2/2018","1/3/2018","1/4/201

我想跟踪某一列中特定属性最近一次出现的值，但一直没能用 lag 实现。例如，我有一个交易数据库，希望针对所购买商品中的某个子集（本例中是电钻 Drill）保留一个“最近一次购买”的滚动标记，数据如下：

# Sample data: six purchases by customer "A01", dated 1/1/2018 through
# 1/6/2018; two of them include a drill (DrillA, DrillZ).
Transactions <- data.table(
  Customer     = rep("A01", 6),
  PurchaseDate = paste0("1/", 1:6, "/2018"),
  Purchase     = c("DrillA, Bit10", "Bit11", "Bit20",
                   "Bit21", "DrillZ, Bit4", "Bit6")
)

这段代码似乎只对紧跟在包含“DrillA/DrillZ”的交易之后的那一笔交易有效，但无法让这个“标记”一直延续到之后的交易。

有没有更好的方法来组织这个？非常感谢您的帮助

数据

# Sample data: six purchases by customer "A01", dated 1/1/2018 through
# 1/6/2018; two of them include a drill (DrillA, DrillZ).
Transactions <- data.frame(
  Customer     = rep("A01", 6),
  PurchaseDate = paste0("1/", 1:6, "/2018"),
  Purchase     = c("DrillA, Bit10", "Bit11", "Bit20",
                   "Bit21", "DrillZ, Bit4", "Bit6")
)

选项2（使用 dplyr 和一个提取电钻名称的函数）

library(dplyr)
library(zoo)

# Extract the drill item from each purchase string.
#
# x: character vector (or factor) of purchases; items are separated by commas.
# Returns a character vector the same length as x, named by the input
# strings, holding the first item that contains "Drill" or NA when a
# purchase contains no drill.
GetDrill <- function(x) {
  extract_one <- function(purchase) {
    # Trim each item so "Bit4, DrillZ" yields "DrillZ", not " DrillZ".
    items <- trimws(unlist(strsplit(purchase, split = ",", fixed = TRUE)))
    drills <- items[grepl("Drill", items, fixed = TRUE)]
    if (length(drills) > 0) drills[[1]] else NA_character_
  }
  vapply(as.character(x), extract_one, character(1))
}

# Carry each customer's most recent drill forward onto later purchases.
Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # na.rm = FALSE keeps leading NAs (purchases made before any drill); the
  # default drops them, which shortens the vector and makes mutate() fail.
  mutate(LastDrill = na.locf(GetDrill(as.character(Purchase)), na.rm = FALSE)) %>%
  ungroup()

# # A tibble: 6 x 4
#   Customer PurchaseDate Purchase      LastDrill
#   <fct>    <fct>        <fct>         <chr>
# 1 A01      1/1/2018     DrillA, Bit10 DrillA
# 2 A01      1/2/2018     Bit11         DrillA
# 3 A01      1/3/2018     Bit20         DrillA
# 4 A01      1/4/2018     Bit21         DrillA
# 5 A01      1/5/2018     DrillZ, Bit4  DrillZ
# 6 A01      1/6/2018     Bit6          DrillZ

# Asker's attempt: tag each purchase with the most recent drill bought by
# the same customer.
Transactions %>% 
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # Seed the column so the next mutate() can refer to it.
  mutate(LastDrill = "") %>%
  # NOTE(review): lag() below reads LastDrill as it stood before this
  # mutate() (all ""), not the values being computed here, so a drill
  # matched on an earlier row is not carried forward to later rows.
  mutate(LastDrill = case_when(grepl("DrillA", Purchase) ~ "DrillA",
                               grepl("DrillZ", Purchase) ~ "DrillZ",
                               TRUE ~ lag(LastDrill, 1, order_by=PurchaseDate)))

# Sample data: one customer ("A01") with six dated purchases, two of which
# include a drill (DrillA, DrillZ).
Transactions <- data.frame(
  Customer     = rep("A01", times = 6),
  PurchaseDate = sprintf("1/%d/2018", 1:6),
  Purchase     = c(
    "DrillA, Bit10", "Bit11", "Bit20", "Bit21", "DrillZ, Bit4", "Bit6"
  )
)

library(tidyverse)

# Label every purchase with the most recent drill bought by the same customer.
# NOTE(review): assumes a drill is listed first within its purchase string;
# otherwise the items before it fall into the previous drill's run.
Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # One purchased item per row, so every drill starts a new run of rows.
  separate_rows(Purchase) %>%
  # Running count of drills seen so far; rows sharing a count follow the
  # same drill.
  mutate(flag = cumsum(grepl("Drill", Purchase))) %>%
  # `.add = TRUE` keeps the Customer grouping (`add` is deprecated since
  # dplyr 1.0.0, and `T` can be reassigned).
  group_by(flag, .add = TRUE) %>%
  # The first item of a run is its drill; rows before any drill (flag == 0)
  # get NA instead of an unrelated item.
  mutate(LastDrill = if_else(flag > 0, first(Purchase), NA_character_)) %>%
  ungroup() %>%
  select(-flag) %>%
  # Collapse the items back into one row per original transaction.
  group_by(Customer, PurchaseDate, LastDrill) %>%
  summarise(Purchase = paste0(Purchase, collapse = ", ")) %>%
  ungroup()

#   Customer PurchaseDate      Purchase LastDrill
# 1      A01     1/1/2018 DrillA, Bit10    DrillA
# 2      A01     1/2/2018         Bit11    DrillA
# 3      A01     1/3/2018         Bit20    DrillA
# 4      A01     1/4/2018         Bit21    DrillA
# 5      A01     1/5/2018  DrillZ, Bit4    DrillZ
# 6      A01     1/6/2018          Bit6    DrillZ

library(dplyr)
library(zoo)

# vectorised function to extract the drill value from a string
# (uses comma to split the string)
# Extract the drill item from each purchase string.
#
# x: character vector (or factor) of purchases; items are separated by commas.
# Returns a character vector the same length as x, named by the input
# strings, holding the first item that contains "Drill" or NA when a
# purchase contains no drill.
GetDrill <- function(x) {
  extract_one <- function(purchase) {
    # Trim each item so "Bit4, DrillZ" yields "DrillZ", not " DrillZ".
    items <- trimws(unlist(strsplit(purchase, split = ",", fixed = TRUE)))
    drills <- items[grepl("Drill", items, fixed = TRUE)]
    if (length(drills) > 0) drills[[1]] else NA_character_
  }
  vapply(as.character(x), extract_one, character(1))
}


# Carry each customer's most recent drill forward onto later purchases.
Transactions %>%
  arrange(Customer, PurchaseDate) %>%
  group_by(Customer) %>%
  # na.rm = FALSE keeps leading NAs (purchases made before any drill); the
  # default drops them, which shortens the vector and makes mutate() fail.
  mutate(LastDrill = na.locf(GetDrill(as.character(Purchase)), na.rm = FALSE)) %>%
  ungroup()

# # A tibble: 6 x 4
#   Customer PurchaseDate Purchase      LastDrill
#   <fct>    <fct>        <fct>         <chr>    
# 1 A01      1/1/2018     DrillA, Bit10 DrillA   
# 2 A01      1/2/2018     Bit11         DrillA   
# 3 A01      1/3/2018     Bit20         DrillA   
# 4 A01      1/4/2018     Bit21         DrillA   
# 5 A01      1/5/2018     DrillZ, Bit4  DrillZ   
# 6 A01      1/6/2018     Bit6          DrillZ