每天观察的累积计数，含tally/dplyr，填充缺失值_R_Dplyr

每天观察的累积计数，含tally/dplyr，填充缺失值

每天观察的累积计数，含tally/dplyr，填充缺失值,r,dplyr,R,Dplyr,我有每个用户的观察列表；每个用户每天可能会观察到几次foo。对于每个不同的日子，我想要一个foo值的累积计数。到目前为止，我得到的是： library(tidyverse) library(lubridate) df = tribble( ~user_id, ~foo, ~bar, ~created_at, 1, "a", "b", "2018-07-30", 1, "a", "c", "2018-07-31", 1, "a", "c", "2018-07-31", 1,

我有每个用户的观察列表；每个用户每天可能会观察到几次

foo

。对于每个不同的日子，我想要一个

foo

值的累积计数。到目前为止，我得到的是：

library(tidyverse)
library(lubridate)

# Example observations: one row per sighting of a `foo` value by a user.
# `created_at` is entered as an ISO date string and then parsed into a
# date-time with lubridate::as_datetime().
df <- tribble(
  ~user_id, ~foo, ~bar, ~created_at,
  1, "a", "b", "2018-07-30",
  1, "a", "c", "2018-07-31",
  1, "a", "c", "2018-07-31",
  1, "b", "a", "2018-08-01",
  1, "b", "c", "2018-08-02",
  1, "b", "a", "2018-08-03",
  1, "a", "a", "2018-08-03",
  2, "b", "b", "2018-07-30",
  2, "b", "c", "2018-07-31",
  2, "a", "a", "2018-08-01",
  2, "a", "a", "2018-08-01",
  2, "a", "c", "2018-08-02",
  2, "a", "c", "2018-08-02",
  2, "a", "a", "2018-08-03"
) %>%
  # Plain mutate() instead of the superseded scoped verb mutate_at().
  mutate(created_at = as_datetime(created_at))

# Running number of sightings per (user_id, foo), with one row for each day
# on which that value was actually observed.
df %>%
  mutate(cutoff_date = date(created_at)) %>%
  group_by(user_id, foo, cutoff_date) %>%
  # tally() drops the last grouping level, so the data stay grouped by
  # (user_id, foo) and cumsum() below accumulates within those groups.
  tally() %>%
  mutate(foo_cnt = cumsum(n)) %>%
  select(user_id, foo, cutoff_date, foo_cnt) %>%
  arrange(user_id, cutoff_date, foo)

这给了我：

   user_id foo   cutoff_date foo_cnt
     <dbl> <chr> <date>        <int>
 1      1. a     2018-07-30        1
 2      1. a     2018-07-31        3
 3      1. b     2018-08-01        1
 4      1. b     2018-08-02        2
 5      1. a     2018-08-03        4
 6      1. b     2018-08-03        3
 7      2. b     2018-07-30        1
 8      2. b     2018-07-31        2
 9      2. a     2018-08-01        2
10      2. a     2018-08-02        4
11      2. a     2018-08-03        5

在第5行中，这是87.5%，因为在此之前，用户已经看到了

a

七次和

b

一次

我知道如何到达那里，但是我正在努力为数据中存在的日期包含

foo

的其他值，但是没有观察到

foo

。我已经研究了

complete()

，但是我不知道如何使用它来填充剩余的值

例如，当我在管道末尾添加以下任意一个调用时，都不会得到额外的行：

# Two variants tried as an extra pipeline step (one at a time; these are
# fragments, not standalone statements).
# 1) Expand cutoff_date only for (user_id, foo) pairs that occur in the data.
complete(nesting(user_id, foo), cutoff_date)
# 2) Expand to every combination of user_id, cutoff_date and foo.
complete(user_id, cutoff_date, foo)

我错过了什么

更新：我已经按照建议添加了

ungroup

，现在我也得到了每天的总计数。我已使用

fill

为

foo

的相同值填充以前的值：

# Running count per (user_id, foo) on every day present in the data, plus the
# per-user daily total of those running counts.
df %>%
  mutate(cutoff_date = date(created_at)) %>%
  group_by(user_id, foo, cutoff_date) %>%
  # tally() leaves the data grouped by (user_id, foo), so cumsum() runs
  # within each of those groups.
  tally() %>%
  mutate(foo_cnt = cumsum(n)) %>%
  # complete() has to see ungrouped data to add the missing days.
  ungroup() %>%
  select(user_id, foo, cutoff_date, foo_cnt) %>%
  complete(nesting(user_id, foo), cutoff_date) %>%
  arrange(user_id, cutoff_date, foo) %>%
  group_by(user_id, foo) %>%
  # Carry the last known count forward; days before the first sighting have
  # nothing to carry and therefore stay NA.
  fill(foo_cnt, .direction = "down") %>%
  # group_by() replaces the previous grouping, no ungroup() needed in between.
  group_by(user_id, cutoff_date) %>%
  mutate(foo_cnt_total = sum(foo_cnt, na.rm = TRUE))

   user_id foo   cutoff_date foo_cnt foo_cnt_total
     <dbl> <chr> <date>        <int>         <int>
 1      1. a     2018-07-30        1             1
 2      1. a     2018-07-31        3             3
 3      1. a     2018-08-01        3             4
 4      1. a     2018-08-02        3             5
 5      1. a     2018-08-03        4             7
 6      1. b     2018-07-30       NA             1
 7      1. b     2018-07-31       NA             3
 8      1. b     2018-08-01        1             4
 9      1. b     2018-08-02        2             5
10      1. b     2018-08-03        3             7

df%>%
突变（截止日期=创建日期%>%date）%%
分组依据（用户id、foo、截止日期）%>%
计数%>%
突变（foo_cnt=cumsum（n））%>%
选择（-n）%>%
解组（）%>%
完成（嵌套（用户id，foo），截止日期）%>%
安排（用户id、截止日期、foo）%>%
分组依据（用户id，foo）%>%
填充（食物含量）%>%
解组（）%>%
分组依据（用户id、截止日期）%>%
突变（foo_cnt_total=sum（foo_cnt，na.rm=TRUE））
用户id foo截止日期foo cnt foo cnt总计
1      1. a 2018-07-30 1
2      1. a 2018-07-31 3
3      1. a 2018-08-01 3 4
4      1. a 2018-08-02 3 5
5      1. a 2018-08-03 4 7
6      1. b 2018-07-30不适用1
7      1. b 2018-07-31北美3
8      1. b 2018-08-01 14
9      1. b 2018-08-02 2 5
10      1. b 2018-08-03 3 7

但是，

的值不应以

NA

开头。这里需要什么？

您可以指定

complete

-调用中使用的填充，并稍微重新排列不同步骤的顺序，以获得所需的输出：

# Cumulative count and cumulative share of each `foo` value per user and day.
# The result stays grouped by (user_id, cutoff_date).
df %>%
  mutate(cutoff_date = created_at %>% date()) %>%
  # Daily counts per (user_id, foo); count() returns ungrouped data.
  count(user_id, foo, cutoff_date) %>%
  # Add the days on which a (user_id, foo) pair was not seen, with count 0,
  # so the running sum below simply carries the previous value forward.
  complete(nesting(user_id, foo), cutoff_date, fill = list(n = 0)) %>%
  arrange(user_id, foo, cutoff_date) %>%
  group_by(user_id, foo) %>%
  mutate(foo_cnt = cumsum(n)) %>%
  # The daily count is no longer needed once it has been accumulated.
  select(-n) %>%
  group_by(user_id, cutoff_date) %>%
  mutate(foo_cnt_total = sum(foo_cnt)) %>%
  mutate(foo_pct = 100 * foo_cnt / foo_cnt_total)

# A tibble: 20 x 6
# Groups:   user_id, cutoff_date [10]
#    user_id foo   cutoff_date foo_cnt foo_cnt_total foo_pct
#      <dbl> <chr> <date>        <dbl>         <dbl>   <dbl>
#  1       1 a     2018-07-30        1             1   100  
#  2       1 a     2018-07-31        3             3   100  
#  3       1 a     2018-08-01        3             4    75  
#  4       1 a     2018-08-02        3             5    60  
#  5       1 a     2018-08-03        4             7    57.1
#  6       1 b     2018-07-30        0             1     0  
#  7       1 b     2018-07-31        0             3     0  
#  8       1 b     2018-08-01        1             4    25  
#  9       1 b     2018-08-02        2             5    40  
# 10       1 b     2018-08-03        3             7    42.9

df%>%
突变（截止日期=日期（创建日期））%>%
计数（用户id、foo、截止日期）%>%
完成（嵌套（用户id，foo），截止日期，填充=列表（n=0））%>%
安排（用户id、foo、截止日期）%>%
分组依据（用户id，foo）%>%
突变（foo_cnt=cumsum（n））%>%
分组依据（用户id、截止日期）%>%
突变（foo_cnt_total=sum（foo_cnt），
foo_pct=100*foo_cnt/foo_cnt_total）%>%
选择（-n）
#一个tibble:20x6
#组：用户id、截止日期[10]
#用户id foo截止日期foo cnt foo cnt foo总foo pct
#                            
#1A 2018-07-30 11100
#21A 2018-07-31 3300
#31A 2018-08-01 3475
#4 1 a 2018-08-02 3 5 60
#51A 2018-08-03 4757.1
#61B 2018-07-30 01 0
#71B 2018-07-31030
#81B 2018-08-01 14 25
#9 1 b 2018-08-02 2 5 40
#101B 2018-08-03 3742.9

结果:

# A tibble: 20 x 6
   user_id foo   cutoff_date foo_cnt foo_cnt_total foo_pct
     <dbl> <chr> <date>        <dbl>         <dbl>   <dbl>
 1       1 a     2018-07-30        1             1   100  
 2       1 b     2018-07-30        0             0     0  
 3       1 a     2018-07-31        3             4   100  
 4       1 b     2018-07-31        0             0     0  
 5       1 a     2018-08-01        0             4    80  
 6       1 b     2018-08-01        1             1    20  
 7       1 a     2018-08-02        0             4    57.1
 8       1 b     2018-08-02        2             3    42.9
 9       1 a     2018-08-03        4             8    57.1
10       1 b     2018-08-03        3             6    42.9
11       2 a     2018-07-30        0             0     0  
12       2 b     2018-07-30        1             1   100  
13       2 a     2018-07-31        0             0     0  
14       2 b     2018-07-31        2             3   100  
15       2 a     2018-08-01        2             2    40  
16       2 b     2018-08-01        0             3    60  
17       2 a     2018-08-02        4             6    66.7
18       2 b     2018-08-02        0             3    33.3
19       2 a     2018-08-03        5            11    78.6
20       2 b     2018-08-03        0             3    21.4

#一个tible:20x6
用户id foo截止日期foo cnt foo cnt foo总foo pct
1A 2018-07-30 11100
21B 2018-07-30 00
31A 2018-07-31 34100
41B 2018-07-31 0 0
51A 2018-08-01 04 80
61B 2018-08-01 1120
71A 2018-08-02 04 57.1
8 1 b 2018-08-02 2 3 42.9
9 1A 2018-08-03 4 8 57.1
101B 2018-08-03 3642.9
11 2 a 2018-07-30 0
12 2 b 2018-07-30 11 100
13 2 a 2018-07-31 0 0
14 2 b 2018-07-31 23 100
15 2 a 2018-08-01 22 40
162B 2018-08-01 03 60
172A 2018-08-02466.7
18 2 b 2018-08-02 0 3 33.3
192A 2018-08-03 51178.6
20 2 b 2018-08-03 0 3 21.4

在

complete()

之前使用

ungroup()

。

@kath谢谢，这帮了大忙！我添加了所需的

fill

，用于结转前几天的值，我几乎达到了我需要的位置。但是，在尚未看到

的前几天，我得到了

NA

值（参见更新的问题）。你知道吗？嗯，当我在数据上运行这个时，我得到了不同的输出–在你的代码中，

foo_cnt_total

最后只计算一次，而不是按日期分组（或累计）。您确定您的代码和输出匹配吗？谢谢！添加

mutate(foo_pct = 100 * foo_cnt / foo_cnt_total)

将给出百分比。此外，

complete

调用可以进行

nesting(user_id, foo)

——否则，如果我只想知道每个用户的

foo

实例，结果将是多余的。在我的数据集中这样做使我从3500万行减少到500000行。现在我明白了，为什么你在你的问题中有这个。。。因为在您的示例中，两个用户都有相同的foo，所以我不关心这一点……是的，我知道

# Example observations: one row per sighting of a `foo` value by a user.
# `created_at` is entered as an ISO date string and then parsed into a
# date-time with lubridate::as_datetime().
df <- tribble(
    ~user_id, ~foo, ~bar, ~created_at,
    1, "a", "b", "2018-07-30",
    1, "a", "c", "2018-07-31",
    1, "a", "c", "2018-07-31",
    1, "b", "a", "2018-08-01",
    1, "b", "c", "2018-08-02",
    1, "b", "a", "2018-08-03",
    1, "a", "a", "2018-08-03",
    2, "b", "b", "2018-07-30",
    2, "b", "c", "2018-07-31",
    2, "a", "a", "2018-08-01",
    2, "a", "a", "2018-08-01",
    2, "a", "c", "2018-08-02",
    2, "a", "c", "2018-08-02",
    2, "a", "a", "2018-08-03"
) %>%
    # Plain mutate() instead of the superseded scoped verb mutate_at().
    dplyr::mutate(created_at = as_datetime(created_at))

# Per-user, per-day share of each `foo` value, with one row for every
# combination of `foo` and `cutoff_date` seen for that user.
df %>%
    dplyr::mutate(cutoff_date = date(created_at)) %>%
    group_by(user_id, foo, cutoff_date) %>%
    # tally() drops the last grouping level, so cumsum() below runs within
    # each (user_id, foo) group.
    tally() %>%
    dplyr::mutate(foo_cnt = cumsum(n)) %>%
    select(-n) %>%
    # Complete within each user. `user_id` is the grouping column and must not
    # be named inside complete(): tidyr >= 1.2.0 raises an error when asked to
    # complete on a grouping column. Crossing `foo` with `cutoff_date` inside
    # each user group produces the same rows.
    group_by(user_id) %>%
    complete(foo, cutoff_date, fill = list(foo_cnt = 0)) %>%
    arrange(user_id, cutoff_date, foo) %>%
    group_by(user_id, foo) %>%
    # NOTE(review): foo_cnt is already a running count (and 0 on days without
    # a sighting), so this accumulates it a second time. Kept as is because
    # the printed result depends on it -- confirm this is intended.
    dplyr::mutate(foo_cnt_total = cumsum(foo_cnt)) %>%
    group_by(user_id, cutoff_date) %>%
    # Share of this value within the user's total for the day.
    dplyr::mutate(foo_pct = foo_cnt_total / sum(foo_cnt_total) * 100) %>%
    ungroup()

# A tibble: 20 x 6
   user_id foo   cutoff_date foo_cnt foo_cnt_total foo_pct
     <dbl> <chr> <date>        <dbl>         <dbl>   <dbl>
 1       1 a     2018-07-30        1             1   100  
 2       1 b     2018-07-30        0             0     0  
 3       1 a     2018-07-31        3             4   100  
 4       1 b     2018-07-31        0             0     0  
 5       1 a     2018-08-01        0             4    80  
 6       1 b     2018-08-01        1             1    20  
 7       1 a     2018-08-02        0             4    57.1
 8       1 b     2018-08-02        2             3    42.9
 9       1 a     2018-08-03        4             8    57.1
10       1 b     2018-08-03        3             6    42.9
11       2 a     2018-07-30        0             0     0  
12       2 b     2018-07-30        1             1   100  
13       2 a     2018-07-31        0             0     0  
14       2 b     2018-07-31        2             3   100  
15       2 a     2018-08-01        2             2    40  
16       2 b     2018-08-01        0             3    60  
17       2 a     2018-08-02        4             6    66.7
18       2 b     2018-08-02        0             3    33.3
19       2 a     2018-08-03        5            11    78.6
20       2 b     2018-08-03        0             3    21.4