在R中随时间跟踪队列_R_Dplyr_Time Series_Zoo

在R中随时间跟踪队列

在R中随时间跟踪队列,r,dplyr,time-series,zoo,R,Dplyr,Time Series,Zoo,我有一个用户ID和交易发生月份的示例数据集。我的目标是按月计算有多少原始用户进行了交易。换句话说，1月份有多少新用户在2月、3月和4月也进行了交易。2月份有多少新用户在3月和4月进行了交易，依此类推 > data date user_id 1 Jan 2017 1 2 Jan 2017 2 3 Jan 2017 3 4 Jan 2017 4 5 Jan 2017 5 6 Feb 2017 1

我有一个用户ID和交易发生月份的示例数据集。我的目标是按月计算有多少原始用户进行了交易。换句话说，1月份有多少新用户在2月、3月和4月也进行了交易。2月份有多少新用户在3月和4月进行了交易，依此类推

> data
       date user_id
1  Jan 2017       1
2  Jan 2017       2
3  Jan 2017       3
4  Jan 2017       4
5  Jan 2017       5
6  Feb 2017       1
7  Feb 2017       3
8  Feb 2017       5
9  Feb 2017       7
10 Feb 2017       9
11 Mar 2017       2
12 Mar 2017       4
13 Mar 2017       6
14 Mar 2017       8
15 Mar 2017      10
16 Apr 2017       1
17 Apr 2017       3
18 Apr 2017       6
19 Apr 2017       9
20 Apr 2017      12

此数据集的输出如下所示：

> output
    Jan Feb Mar Apr
Jan   5   3   2   2
Feb  NA   2   0   1
Mar  NA  NA   3   1
Apr  NA  NA  NA   1

到目前为止，我能想到的唯一方法是分割数据集，然后计算每个月的唯一ID，这些ID在前几个月中不存在，但是这种方法很冗长，不适合有很多个月的大型数据集

# Manual approach from the question: split the transactions by month, then
# work out, month by month, which user ids have not appeared in any earlier
# month. Expects `data` to have a `date` column (month) and a `user_id` column.
# NOTE: the M1..M4 names below hard-code four months, which is exactly why
# this approach does not scale to longer periods.
subsets <- split(data, data$date, drop = TRUE)

# Expose each monthly subset as a data frame named M1, M2, ...
# seq_along() is used instead of 1:length() so that an empty split does not
# iterate over c(1, 0).
for (i in seq_along(subsets)) {
  assign(paste0("M", i), as.data.frame(subsets[[i]]))
}

# Distinct users who transacted in each month.
M1_ids <- unique(M1$user_id)
M2_ids <- unique(M2$user_id)
M3_ids <- unique(M3$user_id)
M4_ids <- unique(M4$user_id)

# Keep only the users that are new in each month. setdiff() already returns
# unique values, so no extra unique() calls are needed. From here on, M2_ids,
# M3_ids and M4_ids hold the *new* users of that month; together with M1_ids
# they still cover every user seen so far.
M2_ids <- setdiff(M2_ids, M1_ids)
M3_ids <- setdiff(M3_ids, c(M2_ids, M1_ids))
M4_ids <- setdiff(M4_ids, c(M3_ids, M2_ids, M1_ids))

以及样本数据：

> dput(data)
structure(list(date = structure(c(2017, 2017, 2017, 2017, 2017, 
2017.08333333333, 2017.08333333333, 2017.08333333333, 2017.08333333333, 
2017.08333333333, 2017.16666666667, 2017.16666666667, 2017.16666666667, 
2017.16666666667, 2017.16666666667, 2017.25, 2017.25, 2017.25, 
2017.25, 2017.25), class = "yearmon"), user_id = c(1L, 2L, 3L, 
4L, 5L, 1L, 3L, 5L, 7L, 9L, 2L, 4L, 6L, 8L, 10L, 1L, 3L, 6L, 
9L, 12L)), .Names = c("date", "user_id"), row.names = c(NA, -20L
), class = "data.frame")

下面是一个例子：

# Cohort table with data.table: rows are cohorts (the month of a user's first
# transaction), columns are transaction months, and each cell counts the
# users of that cohort who transacted in that month.
library(data.table)
library(zoo)

# Sample data from the question; `date` is a zoo "yearmon" column.
data <- structure(list(date = structure(c(2017, 2017, 2017, 2017, 2017, 
2017.08333333333, 2017.08333333333, 2017.08333333333, 2017.08333333333, 
2017.08333333333, 2017.16666666667, 2017.16666666667, 2017.16666666667, 
2017.16666666667, 2017.16666666667, 2017.25, 2017.25, 2017.25, 
2017.25, 2017.25), class = "yearmon"), user_id = c(1L, 2L, 3L, 
4L, 5L, 1L, 3L, 5L, 7L, 9L, 2L, 4L, 6L, 8L, 10L, 1L, 3L, 6L, 
9L, 12L)), .Names = c("date", "user_id"), row.names = c(NA, -20L
), class = "data.frame")

# Deliberately duplicate the first row to show that a user who transacts
# twice in the same month is still counted only once (see unique() below).
data <- data[c(1, seq_len(nrow(data))), ]
setDT(data)

# unique() removes repeated (date, user_id) rows, `cohort` is each user's
# first month, and dcast() counts users per cohort/month cell.
# value.var and fun.aggregate are spelled out so dcast() does not have to
# guess the value column or fall back to its default aggregate.
(cohorts <- dcast(
  unique(data)[, cohort := min(date), by = user_id],
  cohort ~ date,
  value.var = "user_id",
  fun.aggregate = length
))
# Month abbreviations in the printed output follow the session locale
# (the output below was produced under a German locale, hence "Mrz").
#      cohort Jan 2017 Feb 2017 Mrz 2017 Apr 2017
# 1: Jan 2017        5        3        2        2
# 2: Feb 2017        0        2        0        1
# 3: Mrz 2017        0        0        3        1
# 4: Apr 2017        0        0        0        1

# Convert to a matrix and blank out the cells below the diagonal: a cohort
# cannot have transactions in months before it first appeared.
m <- as.matrix(cohorts[, -1])
# Explicit as.character() so the row labels use the yearmon formatting
# instead of relying on implicit coercion when setting dimnames.
rownames(m) <- as.character(cohorts[[1]])
m[lower.tri(m)] <- NA
names(dimnames(m)) <- c("cohort", "yearmon")
m
#           yearmon
# cohort     Jan 2017 Feb 2017 Mrz 2017 Apr 2017
#   Jan 2017        5        3        2        2
#   Feb 2017       NA        2        0        1
#   Mrz 2017       NA       NA        3        1
#   Apr 2017       NA       NA       NA        1

library(data.table)
library(zoo)
这在 Tidyverse 函数中也是可能的：
# Cohort table with the tidyverse: rows are cohorts (the date of a user's
# first transaction), columns are transaction months, and each cell counts
# the distinct users of that cohort who transacted in that month.
library(tidyverse)
library(lubridate)

transactions <- tibble(
  month = ymd(c("2017-01-01", "2017-01-01", "2017-02-01", "2017-02-01", "2017-03-01")),
  user_id = c(1, 2, 1, 3, 3)
)
#  Jan  1
#  Jan  2
#  Feb  1
#  Feb  3
#  Mar  3

# Mark the cohort of each user: the date of their first transaction.
# summarise(min()) yields exactly one row per user. top_n(-1, month) keeps
# ties, so a user with repeated transactions in their first month would
# appear several times here and inflate the counts after the join.
users <- transactions %>%
  group_by(user_id) %>%
  summarise(cohort = min(month))
users

# Count each user at most once per month, attach their cohort, and
# cross-tabulate cohort against transaction month.
cohort_table <- transactions %>%
  distinct(month, user_id) %>%
  left_join(users, by = "user_id") %>%
  xtabs(~ cohort + month, data = .)
cohort_table
#            month
# cohort     2017-01-01 2017-02-01 2017-03-01
# 2017-01-01          2          1          0
# 2017-02-01          0          1          1

库（tidyverse）
图书馆（lubridate）
交易额%
分组依据（用户id）%>%
顶部n（-1，月）%>%
#第一笔交易的日期
重命名（队列=月份）
使用者
交易%>%
分组依据（月，用户id）%>%
不同的（）%>%
左联合（用户，由='user\u id'）%>%
xtabs（~队列+月份，数据=）
#月
#队列2017-01-01 2017-02-01 2017-03-01
# 2017-01-01          2          1          0
# 2017-02-01          0          1          1
像 `library(data.table); setDT(data); dcast(data[, cohort := min(date), by = user_id], cohort ~ date)` 这样的写法怎么样？
但是，如果用户在同一个月内重复进行交易，这还能正常工作吗？（例如，如果 user_id 1 在一月份进行了两次交易，那么上面的代码会把一月份计为 6。）希望这说得通。
说得通。如果我理解正确，您可以用 unique 包装数据框。请参阅我的答案。
如果您想要一个 tibble 作为输出，可以使用：`transactions %>% group_by(month, user_id) %>% distinct() %>% left_join(users, by = 'user_id') %>% group_by(cohort, month) %>% count(cohort, month) %>% arrange(cohort, month) %>% pivot_wider(names_from = month, values_from = n)`
# Cohort table with the tidyverse: rows are cohorts (the date of a user's
# first transaction), columns are transaction months, and each cell counts
# the distinct users of that cohort who transacted in that month.
library(tidyverse)
library(lubridate)

transactions <- tibble(
  month = ymd(c("2017-01-01", "2017-01-01", "2017-02-01", "2017-02-01", "2017-03-01")),
  user_id = c(1, 2, 1, 3, 3)
)
#  Jan  1
#  Jan  2
#  Feb  1
#  Feb  3
#  Mar  3

# Mark the cohort of each user: the date of their first transaction.
# summarise(min()) yields exactly one row per user. top_n(-1, month) keeps
# ties, so a user with repeated transactions in their first month would
# appear several times here and inflate the counts after the join.
users <- transactions %>%
  group_by(user_id) %>%
  summarise(cohort = min(month))
users

# Count each user at most once per month, attach their cohort, and
# cross-tabulate cohort against transaction month.
cohort_table <- transactions %>%
  distinct(month, user_id) %>%
  left_join(users, by = "user_id") %>%
  xtabs(~ cohort + month, data = .)
cohort_table
#            month
# cohort     2017-01-01 2017-02-01 2017-03-01
# 2017-01-01          2          1          0
# 2017-02-01          0          1          1