如何使用R中的Dataframes根据datetimes统计随时间变化的事件并按条件分组
我面临的挑战是通过对两个不同的数据集进行数据争用来构建一个表 数据集A:具有在服装店购买的信息,变量为:客户名称、购买日期、代理和在t期间购买的产品如何使用R中的Dataframes根据datetimes统计随时间变化的事件并按条件分组,r,dataframe,datetime,group-by,data-wrangling,R,Dataframe,Datetime,Group By,Data Wrangling,我面临的挑战是通过对两个不同的数据集进行数据争用来构建一个表 数据集A:具有在服装店购买的信息,变量为:客户名称、购买日期、代理和在t期间购买的产品 df1 <- tibble::tribble( ~NAME, ~PRODUCT, ~AGENT, ~DATE_PURCHASE, "Karen", "M_14", "X_1", "8-25-20021 18:21:28", &quo
# Purchases as posted in the question, one element per purchase in each
# column. The year is spelled "20021" in every timestamp here; the strings
# are kept exactly as written.
df1 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alba", "Alex"),
  PRODUCT = c("M_14", "M_78", "M_71", "M_64", "M_57", "M_50", "M_43", "M_36"),
  AGENT = c("X_1", "X_3", "X_4", "X_4", "X_4", "X_1", "X_3", "X_2"),
  DATE_PURCHASE = c(
    "8-25-20021 18:21:28",
    "8-26-20021 18:11:06",
    "8-26-20021 18:21:01",
    "8-27-20021 20:21:59",
    "8-27-20021 20:21:02",
    "8-28-20021 20:21:03",
    "8-29-20021 20:21:04",
    "8-25-20021 20:21:05"
  )
)
这个结果是正确的,因为按记录来看,这是 Jean 的第三次来电;最近一次来电的类型是 CX_SERVICE(8-29-20021 20:21:59),她最近一次购买的产品是 M_64(8-27-20021 20:21:59)。我认为你的年份写错了,所以我把年份中多余的零删掉了(改为 2021 年)。我看到你正在使用 tibble,因此我将提供一个
tidyverse
方法来解决这个问题
所提供代码的思路是:先分别处理两个 tibble,然后通过它们共有的列 NAME 将它们连接起来。
这应该做到:
library(dplyr)
# Purchase log: one element per purchase in each column. Timestamps are
# month-day-year strings and are parsed later in the pipeline.
df1 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alba", "Alex"),
  PRODUCT = c("M_14", "M_78", "M_71", "M_64", "M_57", "M_50", "M_43", "M_36"),
  AGENT = c("X_1", "X_3", "X_4", "X_4", "X_4", "X_1", "X_3", "X_2"),
  DATE_PURCHASE = c(
    "8-25-2021 18:21:28",
    "8-26-2021 18:11:06",
    "8-26-2021 18:21:01",
    "8-27-2021 20:21:59",
    "8-27-2021 20:21:02",
    "8-28-2021 20:21:03",
    "8-29-2021 20:21:04",
    "8-25-2021 20:21:05"
  )
)
# Call log: one element per call in each column. Timestamps are
# month-day-year strings and are parsed later in the pipeline.
df2 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alex"),
  TYPE = c(
    "COMPLAIN", "CX_SERVICE", "COMPLAIN", "CX_SERVICE",
    "CX_SERVICE", "COMPLAIN", "CX_SERVICE"
  ),
  DATE_OF_CALL = c(
    "8-26-2021 18:21:28",
    "8-27-2021 18:11:06",
    "8-28-2021 18:21:01",
    "8-29-2021 20:21:59",
    "8-29-2021 20:21:02",
    "8-30-2021 20:21:03",
    "8-25-2021 21:21:05"
  )
)
# One row per customer: the latest purchase time and the product bought then.
# The outer parentheses print the result as well as assigning it.
(df1_mod <- df1 %>%
  mutate(DATE_PURCHASE = as.POSIXct(DATE_PURCHASE, format = "%m-%d-%Y %H:%M:%S")) %>%
  group_by(NAME) %>%
  summarise(
    # which.max() yields at most one index, so a customer with two purchases
    # sharing the latest timestamp still produces a single row; comparing
    # against max() with `==` would return one product per tied purchase.
    product = PRODUCT[which.max(DATE_PURCHASE)],
    DATE_PURCHASE = max(DATE_PURCHASE), # most recent purchase time
    .groups = "drop"
  ))
#> # A tibble: 5 x 3
#> NAME product DATE_PURCHASE
#> <chr> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04
#> 2 Alex M_36 2021-08-25 20:21:05
#> 3 Jean M_64 2021-08-27 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02
# One row per customer: number of calls, plus the type and time of the most
# recent call. The outer parentheses print the result as well as assigning it.
(df2_mod <- df2 %>%
  mutate(DATE_OF_CALL = as.POSIXct(DATE_OF_CALL, format = "%m-%d-%Y %H:%M:%S")) %>%
  group_by(NAME) %>%
  summarise(
    `x attempt` = n(), # number of calls = number of rows in the group
    # which.max() yields at most one index, so two calls sharing the latest
    # timestamp cannot produce two rows for the same customer. TYPE must be
    # taken before DATE_OF_CALL is collapsed to its maximum below.
    TYPE = TYPE[which.max(DATE_OF_CALL)],
    DATE_OF_CALL = max(DATE_OF_CALL), # most recent call time
    .groups = "drop"
  ))
#> # A tibble: 5 x 4
#> NAME `x attempt` TYPE DATE_OF_CALL
#> <chr> <int> <chr> <dttm>
#> 1 Alba 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith 1 CX_SERVICE 2021-08-29 20:21:02
# Attach each customer's call summary to their latest purchase, matched on NAME.
df1_mod %>% left_join(df2_mod, by = "NAME")
#> # A tibble: 5 x 6
#> NAME product DATE_PURCHASE `x attempt` TYPE DATE_OF_CALL
#> <chr> <chr> <dttm> <int> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex M_36 2021-08-25 20:21:05 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean M_64 2021-08-27 20:21:59 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02 1 CX_SERVICE 2021-08-29 20:21:02
Created on 2021-04-10 by the reprex package (v0.3.0)
库(dplyr)
df1%
汇总(产品=产品[购买日期==最大(购买日期)],#检索最近购买日期的产品
购买日期=最大(购买日期),#检索最近购买日期
.groups=“drop”))
#>#tibble:5 x 3
#>名称产品购买日期
#>
#>1阿尔巴穆43 2021-08-29 20:21:04
#>亚历克斯M_36 2021-08-25 20:21:05
#>3 Jean M_64 2021-08-27 20:21:59
#>4卡伦M_14 2021-08-2518:21:28
#>基思M_57 2021-08-27 20:21:02
(df2_mod%
变异(调用日期=as.POSIXct(调用日期,格式=“%m-%d-%Y%H:%m:%S”))%>%
分组单位(名称)%>%
摘要(`x trunt`=n(),#检索调用量,即n()(组中的行数)
TYPE=TYPE[DATE_OF_CALL==max(DATE_OF_CALL)],#从最近的通话中检索通话类型
通话日期=最大值(通话日期),#检索最近通话日期
.groups=“drop”))
#>#A tibble:5 x 4
#>NAME`x trunt`键入通话日期
#>
#>阿尔巴投诉2021-08-30 20:21:03
#>亚历克斯1 CX_服务2021-08-25 21:21:05
#>吉安3 CX_服务2021-08-29 20:21:59
#>凯伦1号投诉2021-08-26 18:21:28
#>基思1号CX_服务2021-08-29 20:21:02
左联接(df1、df2、by=“NAME”)
#>#tibble:5 x 6
#>名称产品日期\购买`x尝试`TYPE日期\电话
#>
#>阿尔巴穆43 2021-08-29 20:21:04投诉2021-08-30 20:21:03
#>Alex M_36 2021-08-25 20:21:05 CX_服务2021-08-25 21:21:05
#>3 Jean M_64 2021-08-27 20:21:59 3 CX_服务2021-08-29 20:21:59
#>4卡伦·穆14 2021-08-25 18:21:28投诉2021-08-26 18:21:28
#>Keith M_57 2021-08-27 20:21:02 CX_服务2021-08-29 20:21:02
由reprex软件包(v0.3.0)于2021年4月10日创建
谢谢。我认为问题要求的是每次通话输出一行,而上面的回答是每个客户返回一行。我的回答要冗长得多,但它会针对每一次通话返回此前最近一次通话和最近一次购买的信息。这可能不是你想要的,但我还是把它写了出来 :-)
library(tidyverse)
library(lubridate)
df1_…(代码片段在此处被截断)
我必须称赞你如此热心地帮助我解决问题。你不知道这对我有多重要,我有多感激。我已经为这个问题忙了好几天;我这就去你的主页,继续从你的其他回答中学习。非常感谢!
| NAME | x attempt | product | TYPE | DATE_CALL | DATE_PURCHASE |
| Jean | 3 | M_64 | CX_SERVICE | 8-29-20021 20:21:59 | 8-27-20021 20:21:59 |
library(dplyr)
# Purchase log: one element per purchase in each column. Timestamps are
# month-day-year strings and are parsed later in the pipeline.
df1 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alba", "Alex"),
  PRODUCT = c("M_14", "M_78", "M_71", "M_64", "M_57", "M_50", "M_43", "M_36"),
  AGENT = c("X_1", "X_3", "X_4", "X_4", "X_4", "X_1", "X_3", "X_2"),
  DATE_PURCHASE = c(
    "8-25-2021 18:21:28",
    "8-26-2021 18:11:06",
    "8-26-2021 18:21:01",
    "8-27-2021 20:21:59",
    "8-27-2021 20:21:02",
    "8-28-2021 20:21:03",
    "8-29-2021 20:21:04",
    "8-25-2021 20:21:05"
  )
)
# Call log: one element per call in each column. Timestamps are
# month-day-year strings and are parsed later in the pipeline.
df2 <- tibble::tibble(
  NAME = c("Karen", "Jean", "Jean", "Jean", "Keith", "Alba", "Alex"),
  TYPE = c(
    "COMPLAIN", "CX_SERVICE", "COMPLAIN", "CX_SERVICE",
    "CX_SERVICE", "COMPLAIN", "CX_SERVICE"
  ),
  DATE_OF_CALL = c(
    "8-26-2021 18:21:28",
    "8-27-2021 18:11:06",
    "8-28-2021 18:21:01",
    "8-29-2021 20:21:59",
    "8-29-2021 20:21:02",
    "8-30-2021 20:21:03",
    "8-25-2021 21:21:05"
  )
)
# One row per customer: the latest purchase time and the product bought then.
# The outer parentheses print the result as well as assigning it.
(df1_mod <- df1 %>%
  mutate(DATE_PURCHASE = as.POSIXct(DATE_PURCHASE, format = "%m-%d-%Y %H:%M:%S")) %>%
  group_by(NAME) %>%
  summarise(
    # which.max() yields at most one index, so a customer with two purchases
    # sharing the latest timestamp still produces a single row; comparing
    # against max() with `==` would return one product per tied purchase.
    product = PRODUCT[which.max(DATE_PURCHASE)],
    DATE_PURCHASE = max(DATE_PURCHASE), # most recent purchase time
    .groups = "drop"
  ))
#> # A tibble: 5 x 3
#> NAME product DATE_PURCHASE
#> <chr> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04
#> 2 Alex M_36 2021-08-25 20:21:05
#> 3 Jean M_64 2021-08-27 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02
# One row per customer: number of calls, plus the type and time of the most
# recent call. The outer parentheses print the result as well as assigning it.
(df2_mod <- df2 %>%
  mutate(DATE_OF_CALL = as.POSIXct(DATE_OF_CALL, format = "%m-%d-%Y %H:%M:%S")) %>%
  group_by(NAME) %>%
  summarise(
    `x attempt` = n(), # number of calls = number of rows in the group
    # which.max() yields at most one index, so two calls sharing the latest
    # timestamp cannot produce two rows for the same customer. TYPE must be
    # taken before DATE_OF_CALL is collapsed to its maximum below.
    TYPE = TYPE[which.max(DATE_OF_CALL)],
    DATE_OF_CALL = max(DATE_OF_CALL), # most recent call time
    .groups = "drop"
  ))
#> # A tibble: 5 x 4
#> NAME `x attempt` TYPE DATE_OF_CALL
#> <chr> <int> <chr> <dttm>
#> 1 Alba 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith 1 CX_SERVICE 2021-08-29 20:21:02
# Attach each customer's call summary to their latest purchase, matched on NAME.
df1_mod %>% left_join(df2_mod, by = "NAME")
#> # A tibble: 5 x 6
#> NAME product DATE_PURCHASE `x attempt` TYPE DATE_OF_CALL
#> <chr> <chr> <dttm> <int> <chr> <dttm>
#> 1 Alba M_43 2021-08-29 20:21:04 1 COMPLAIN 2021-08-30 20:21:03
#> 2 Alex M_36 2021-08-25 20:21:05 1 CX_SERVICE 2021-08-25 21:21:05
#> 3 Jean M_64 2021-08-27 20:21:59 3 CX_SERVICE 2021-08-29 20:21:59
#> 4 Karen M_14 2021-08-25 18:21:28 1 COMPLAIN 2021-08-26 18:21:28
#> 5 Keith M_57 2021-08-27 20:21:02 1 CX_SERVICE 2021-08-29 20:21:02
Created on 2021-04-10 by the reprex package (v0.3.0)
library(tidyverse)
library(lubridate)
# Simulated purchase log: 1000 rows, each with a random customer, product,
# agent and purchase time (UTC, years 2000-2019).
# Hours are drawn from 0:23 and minutes/seconds from 0:59 so every row is a
# valid clock time (an hour of 24 is not), and days stop at 28 so the date
# exists in every month. Every column is sampled 1000 times, so nothing
# relies on recycling a shorter vector.
df1_purchases <- data.frame(
  cust_name = paste0("Name_", sample(LETTERS, 1000, replace = TRUE)),
  product = paste0("Product_", sample(1:500, 1000, replace = TRUE)),
  agent = paste0("Agent_Name_", sample(LETTERS[1:5], 1000, replace = TRUE)),
  purchase_datetime = ISOdatetime(
    year = sample(2000:2019, 1000, replace = TRUE),
    month = sample(1:12, 1000, replace = TRUE),
    day = sample(1:28, 1000, replace = TRUE),
    hour = sample(0:23, 1000, replace = TRUE),
    min = sample(0:59, 1000, replace = TRUE),
    sec = sample(0:59, 1000, replace = TRUE),
    tz = "UTC"
  )
)
# Simulated call log: 1000 rows, each with a random customer, call type and
# call time (UTC, years 2000-2019).
# Hours are drawn from 0:23 so every row is a valid clock time (an hour of 24
# is not), and days stop at 28 so the date exists in every month. Every
# component is sampled exactly 1000 times; a longer sample would be recycled
# by data.frame() and inflate the number of rows.
df2_calls <- data.frame(
  cust_name = paste0("Name_", sample(LETTERS, 1000, replace = TRUE)),
  type = paste0(
    "calltype_",
    sample(c("Complaint", "Service"), 1000, replace = TRUE)
  ),
  call_datetime = ISOdatetime(
    year = sample(2000:2019, 1000, replace = TRUE),
    month = sample(1:12, 1000, replace = TRUE),
    day = sample(1:28, 1000, replace = TRUE),
    hour = sample(0:23, 1000, replace = TRUE),
    min = sample(0:59, 1000, replace = TRUE),
    sec = sample(0:59, 1000, replace = TRUE),
    tz = "UTC"
  )
)
# Empty accumulator (zero rows, zero columns); the per-call rows are appended
# to it further down.
output_df <- data.frame()
# Summarise one customer's history as of a given call.
#
# Arguments:
#   df2_calls           data frame of calls; reads `cust_name` and
#                       `call_datetime`.
#   df1_purchases       data frame of purchases; reads `cust_name`, `product`
#                       and `purchase_datetime`.
#   cust                customer name to look up.
#   this_call_datetime  time of the call being described; only strictly
#                       earlier calls and purchases count as "previous".
#
# Returns a one-row data frame with `cust_name`, `prev_calls` (number of
# earlier calls by this customer), `prev_purchase_item`, `prev_call_datetime`,
# `prev_purchase_datetime` and `this_call_datetime`. Fields for which no
# earlier record exists are NA.
f <- function(df2_calls, df1_purchases, cust, this_call_datetime) {
  # Both filters apply the customer AND the time condition to the same frame,
  # so counts and "most recent" lookups never mix in other customers' rows.
  # which() drops comparisons that evaluate to NA, so rows with a missing
  # name or timestamp are simply not treated as earlier records.
  earlier_calls <- df2_calls[
    which(
      df2_calls$cust_name == cust &
        df2_calls$call_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]
  earlier_purchases <- df1_purchases[
    which(
      df1_purchases$cust_name == cust &
        df1_purchases$purchase_datetime < this_call_datetime
    ), ,
    drop = FALSE
  ]

  # Position of the latest earlier record inside the *filtered* frames.
  # NA_integer_ is used when there is none: indexing a column with it yields
  # an NA of that column's own type, so the output columns keep one class
  # whether or not an earlier record exists.
  last_call <- NA_integer_
  if (nrow(earlier_calls) > 0) {
    last_call <- which.max(earlier_calls$call_datetime)
  }
  last_purchase <- NA_integer_
  if (nrow(earlier_purchases) > 0) {
    last_purchase <- which.max(earlier_purchases$purchase_datetime)
  }

  data.frame(
    cust_name = cust,
    prev_calls = nrow(earlier_calls),
    prev_purchase_item = earlier_purchases$product[last_purchase],
    prev_call_datetime = earlier_calls$call_datetime[last_call],
    prev_purchase_datetime = earlier_purchases$purchase_datetime[last_purchase],
    this_call_datetime = this_call_datetime
  )
}
# Build one summary row per call, then bind them all in a single step.
# Calling rbind() inside the loop copies the growing data frame on every
# iteration; seq_len() also yields an empty sequence when there are no calls,
# where 1:0 would iterate over c(1, 0).
number_of_call_rows <- nrow(df2_calls)
call_rows <- lapply(seq_len(number_of_call_rows), function(i) {
  f(
    df2_calls,
    df1_purchases,
    df2_calls$cust_name[i],
    df2_calls$call_datetime[i]
  )
})
# Starting the bind from output_df keeps whatever it already holds, exactly
# as appending to it row by row did.
output_df <- do.call(rbind, c(list(output_df), call_rows))
glimpse(output_df)