在R中随时间跟踪队列
我有一个用户ID和交易发生月份的示例数据集。我的目标是按月计算有多少原始用户进行了交易。换句话说,1月份有多少新用户在2月、3月和4月也进行了交易。2月份有多少新用户在3月和4月进行了交易,依此类推在R中随时间跟踪队列,r,dplyr,time-series,zoo,R,Dplyr,Time Series,Zoo,我有一个用户ID和交易发生月份的示例数据集。我的目标是按月计算有多少原始用户进行了交易。换句话说,1月份有多少新用户在2月、3月和4月也进行了交易。2月份有多少新用户在3月和4月进行了交易,依此类推 > data date user_id 1 Jan 2017 1 2 Jan 2017 2 3 Jan 2017 3 4 Jan 2017 4 5 Jan 2017 5 6 Feb 2017 1
> data
date user_id
1 Jan 2017 1
2 Jan 2017 2
3 Jan 2017 3
4 Jan 2017 4
5 Jan 2017 5
6 Feb 2017 1
7 Feb 2017 3
8 Feb 2017 5
9 Feb 2017 7
10 Feb 2017 9
11 Mar 2017 2
12 Mar 2017 4
13 Mar 2017 6
14 Mar 2017 8
15 Mar 2017 10
16 Apr 2017 1
17 Apr 2017 3
18 Apr 2017 6
19 Apr 2017 9
20 Apr 2017 12
此数据集的输出如下所示:
> output
Jan Feb Mar Apr
Jan 5 3 2 2
Feb NA 2 0 1
Mar NA NA 3 1
Apr NA NA NA 1
到目前为止,我能想到的唯一方法是分割数据集,然后计算每个月的唯一ID,这些ID在前几个月中不存在,但是这种方法很冗长,不适合有很多个月的大型数据集
subsets <-split(data, data$date, drop=TRUE)
for (i in 1:length(subsets)) {
assign(paste0("M", i), as.data.frame(subsets[[i]]))
}
M1_ids <- unique(M1$user_id)
M2_ids <- unique(M2$user_id)
M3_ids <- unique(M3$user_id)
M4_ids <- unique(M4$user_id)
M2_ids <- unique(setdiff(M2_ids, unique(M1_ids)))
M3_ids <- unique(setdiff(M3_ids, unique(c(M2_ids, M1_ids))))
M4_ids <- unique(setdiff(M4_ids, unique(c(M3_ids, M2_ids, M1_ids))))
以及样本数据:
> dput(data)
structure(list(date = structure(c(2017, 2017, 2017, 2017, 2017,
2017.08333333333, 2017.08333333333, 2017.08333333333, 2017.08333333333,
2017.08333333333, 2017.16666666667, 2017.16666666667, 2017.16666666667,
2017.16666666667, 2017.16666666667, 2017.25, 2017.25, 2017.25,
2017.25, 2017.25), class = "yearmon"), user_id = c(1L, 2L, 3L,
4L, 5L, 1L, 3L, 5L, 7L, 9L, 2L, 4L, 6L, 8L, 10L, 1L, 3L, 6L,
9L, 12L)), .Names = c("date", "user_id"), row.names = c(NA, -20L
), class = "data.frame")
下面是一个例子:
library(data.table)
library(zoo)
data <- structure(list(date = structure(c(2017, 2017, 2017, 2017, 2017,
2017.08333333333, 2017.08333333333, 2017.08333333333, 2017.08333333333,
2017.08333333333, 2017.16666666667, 2017.16666666667, 2017.16666666667,
2017.16666666667, 2017.16666666667, 2017.25, 2017.25, 2017.25,
2017.25, 2017.25), class = "yearmon"), user_id = c(1L, 2L, 3L,
4L, 5L, 1L, 3L, 5L, 7L, 9L, 2L, 4L, 6L, 8L, 10L, 1L, 3L, 6L,
9L, 12L)), .Names = c("date", "user_id"), row.names = c(NA, -20L
), class = "data.frame")
data <- data[c(1,1:nrow(data)),]
setDT(data)
(cohorts <- dcast(unique(data)[,cohort:=min(date),by=user_id],cohort~date))
# cohort Jan 2017 Feb 2017 Mrz 2017 Apr 2017
# 1: Jan 2017 5 3 2 2
# 2: Feb 2017 0 2 0 1
# 3: Mrz 2017 0 0 3 1
# 4: Apr 2017 0 0 0 1
m <- as.matrix(cohorts[,-1])
rownames(m) <- cohorts[[1]]
m[lower.tri(m)] <- NA
names(dimnames(m)) <- c("cohort", "yearmon")
m
# yearmon
# cohort Jan 2017 Feb 2017 Mrz 2017 Apr 2017
# Jan 2017 5 3 2 2
# Feb 2017 NA 2 0 1
# Mrz 2017 NA NA 3 1
# Apr 2017 NA NA NA 1
库(data.table)
图书馆(动物园)
数据这在Tidyverse函数中也是可能的:
library(tidyverse)
library(lubridate)
transactions <- tibble(
month=ymd(c("2017-01-01", "2017-01-01", "2017-02-01", "2017-02-01", "2017-03-01")),
user_id=c(1, 2, 1, 3, 3)
)
# Jan 1
# Jan 2
# Feb 1
# Feb 3
# Mar 1
# mark the cohort of the users
users <- transactions %>%
arrange(month, user_id) %>%
group_by(user_id) %>%
top_n(-1, month) %>%
# date of the first transaction
rename(cohort = month)
users
transactions %>%
group_by(month, user_id) %>%
distinct() %>%
left_join(users, by = 'user_id') %>%
xtabs(~ cohort + month, data = .)
# month
# cohort 2017-01-01 2017-02-01 2017-03-01
# 2017-01-01 2 1 0
# 2017-02-01 0 1 1
库(tidyverse)
图书馆(lubridate)
交易额%
分组依据(用户id)%>%
顶部n(-1,月)%>%
#第一笔交易的日期
重命名(队列=月份)
使用者
交易%>%
分组依据(月,用户id)%>%
不同的()%>%
左联合(用户,由='user\u id')%>%
xtabs(~队列+月份,数据=)
#月
#队列2017-01-01 2017-02-01 2017-03-01
# 2017-01-01 2 1 0
# 2017-02-01 0 1 1
像库(data.table)这样的东西怎么样;setDT(数据);dcast(数据[,队列:=min(日期),by=user\u id],队列~date)
?但是,如果用户在一个月内重复一个事务,这将起作用(例如,如果user\u id
1在一月份进行了两次交易,那么上面的代码在一月份计算为6。希望这是有意义的,有意义的。如果我答对了,您可以将数据框包装在unique
中。请参阅我的答案。如果您想要一个TIBLE作为输出,您可以使用:``transactions%>%group\u by(月,用户id)%%>%distinct()%%>%left\u join(用户,by='user\u id')%%>%group\u by(队列,月份)%%>%count(队列,月份)%%>%arrange(队列,月份)%%>%pivot\u更宽(name\u from=month,value\u from=n)```
library(tidyverse)
library(lubridate)
transactions <- tibble(
month=ymd(c("2017-01-01", "2017-01-01", "2017-02-01", "2017-02-01", "2017-03-01")),
user_id=c(1, 2, 1, 3, 3)
)
# Jan 1
# Jan 2
# Feb 1
# Feb 3
# Mar 1
# mark the cohort of the users
users <- transactions %>%
arrange(month, user_id) %>%
group_by(user_id) %>%
top_n(-1, month) %>%
# date of the first transaction
rename(cohort = month)
users
transactions %>%
group_by(month, user_id) %>%
distinct() %>%
left_join(users, by = 'user_id') %>%
xtabs(~ cohort + month, data = .)
# month
# cohort 2017-01-01 2017-02-01 2017-03-01
# 2017-01-01 2 1 0
# 2017-02-01 0 1 1