R 将年龄组分为每月一次_R_Dataframe_Bucket_Difftime

R 将年龄组分为每月一次

r dataframe

R 将年龄组分为每月一次,r,dataframe,bucket,difftime,R,Dataframe,Bucket,Difftime,我正在努力找到以下问题的解决方案。我有一个带有id/dob的df和另一个monthbucket df，如下所示种子（33） df不是很复杂，但我加入了两个表（首先在df$id上扩展monthbucket），然后计算年龄（因为你有整个月，我只计算了difftime与出生月份的第一天和startmonth）。然后，对于每个月（桶），我计算不同年龄组的数量，最后将长格式转换为宽格式，以便更好地说明库（lubridate）图书馆（tidyverse）蒙特布克特%>% 分组依据所有%>% 展开（

我正在努力找到以下问题的解决方案。我有一个带有

id/dob的df和另一个monthbucket df，如下所示

set.seed(33)
df不是很复杂，但我加入了两个表（首先在df$id
上扩展monthbucket
），然后计算年龄（因为你有整个月，我只计算了difftime
与出生月份的第一天和startmonth
）。然后，对于每个月（桶），我计算不同年龄组的数量，最后将长格式转换为宽格式，以便更好地说明
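library(lubridate)
library(tidyverse)

monthbucket %>%
  group_by_all %>%
  expand(id = df$id) %>%
  left_join(., {df %>%
      mutate(birth_month = cut(dob, "month"))},
    by = "id") %>%
  mutate(age = time_length(difftime(startmonth, birth_month), "years")) %>%
  mutate(age_group = case_when(age < 19 ~ "<19",
                               age > 64 ~ ">64",
                               TRUE ~ "19-64")) %>%
  group_by(month) %>%
  count(age_group) %>%
  gather(variable, count, n) %>%
  unite(variable, age_group) %>%
  spread(variable, count)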
#> # A tibble: 13 x 4
#> # Groups:   month [13]
#>    month   `<19` `>64` `19-64`
#>           
#>  1 2010-01     3     2       5
#>  2 2010-02     3     2       5
#>  3 2010-03     3     2       5
#>  4 2010-04     3     2       5
#>  5 2010-05     3     2       5
#>  6 2010-06     3     2       5
#>  7 2010-07     3     2       5
#>  8 2010-08     3     2       5
#>  9 2010-09     3     2       5
#> 10 2010-10     3     2       5
#> 11 2010-11     3     2       5
#> 12 2010-12     3     2       5
#> 13 2011-01     3     2       5

由reprex包（v0.3.0）创建于2019-07-03

如果我对您的需求理解正确的话：
# Count, for every month bucket, how many people fall into each age group.
#
# Inputs (defined outside this snippet):
#   monthbucket$startmonth  start date of each month bucket
#   monthbucket$month       label used for the output columns
#   df$dob                  date of birth of each person
# NOTE(review): dividing by 365.25 assumes the date differences are expressed
# in days (true when both inputs are Date vectors) -- confirm for other
# date-time classes.
age_breaks <- c(0, 19, 64, Inf)
age_labels <- c("0-19", "19-64", "64+")

# outer() gives a months x persons matrix of date differences; after t() each
# row is a person and each column is a month bucket, with ages in years.
day_diffs <- outer(monthbucket$startmonth, df$dob, "-")
ages <- as.data.frame(t(unclass(day_diffs / 365.25)))

# Bin every column in place. cut() intervals are right-closed by default, so
# ages <= 0 fall outside all bins, become NA and are not counted by table().
ages[] <- lapply(ages, cut, breaks = age_breaks, labels = age_labels)

# One column of counts per month bucket. vapply() (instead of sapply()) pins
# the result to an integer matrix with exactly one row per age group.
ages <- vapply(
  ages,
  table,
  FUN.VALUE = structure(integer(length(age_labels)), names = age_labels)
)
colnames(ages) <- monthbucket$month
ages
#       2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19        2       2       2       2       2       2       2       2       2       2       2       2       2
# 19-64       7       7       7       7       7       7       7       7       7       7       7       7       7
# 64+         1       1       1       1       1       1       1       1       1       1       1       1       1
# 

这与@AkselA的答案有一些相似之处，因为它同样依赖于outer()、cut()和table()。

使用dplyr和tidyr：
library(dplyr)
library(tidyr)

# Pair every month bucket with every birth date, bin the resulting age
# (date difference / 365.25) into (0,19], (19,65], (65,Inf], count the pairs
# per bucket and bin, then put one column per age range.
# NOTE(review): assumes `month_bucket - birth_days` is measured in days -- confirm.
crossing(month_bucket, birth_days) %>%
  mutate(
    age_years = as.numeric(month_bucket - birth_days) / 365.25,
    age_range = cut(age_years, breaks = c(0, 19, 65, Inf))
  ) %>%
  count(month_bucket, age_range) %>%
  spread(key = age_range, value = n)

在base中采用了类似的方法，我对此并不完全满意
# Base R version: build every (month bucket, birth date) pairing, derive the
# age in years and its bin, then count pairings per bucket and bin and
# reshape the counts to one column per age group.
# NOTE(review): assumes the date difference is measured in days -- confirm.
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos[["age"]] <- with(
  all_combos,
  as.numeric(month_bucket - birth_days) / 365.25
)
all_combos[["cut_r"]] <- cut(all_combos[["age"]], breaks = c(0, 19, 65, Inf))

reshape(
  # length() of each (bucket, age_group) cell is the number of people in it
  data = with(
    all_combos,
    aggregate(
      month_bucket,
      by = list(bucket = month_bucket, age_group = cut_r),
      FUN = length
    )
  ),
  timevar = "age_group",
  idvar = "bucket",
  direction = "wide"
)

data.table
选项的所有组合，它是dcast（…）
而不是cast（…）。此外，我得到的基准测试结果与你的差别很大，中位数分别为：MM 59，Ansel 3.3，Cole_outer 2.8，Cole_dt 6.5，Cole_dplyr 5.9，Cole_reshape 5.3（单位：毫秒）。以上是用microbenchmark运行100次的结果。
library(dplyr)
library(tidyr)

# Cross all month buckets with all birth dates, label each pair with its age
# range (breaks at 0, 19, 65 years; right-closed intervals), tally pairs per
# bucket and range, and spread the tallies into one column per range.
# NOTE(review): assumes `month_bucket - birth_days` is measured in days -- confirm.
crossing(month_bucket, birth_days) %>%
  mutate(
    age_range = cut(
      as.numeric(month_bucket - birth_days) / 365.25,
      breaks = c(0, 19, 65, Inf)
    )
  ) %>%
  count(month_bucket, age_range) %>%
  spread(key = age_range, value = n)

# Every combination of month bucket and birth date, with the age in years
# (date difference / 365.25) and the age bin it falls into.
# NOTE(review): assumes the date difference is measured in days -- confirm.
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos[["age"]] <- with(
  all_combos,
  as.numeric(month_bucket - birth_days) / 365.25
)
all_combos[["cut_r"]] <- cut(all_combos[["age"]], breaks = c(0, 19, 65, Inf))

# Count rows per (bucket, age_group), then widen to one count column per
# age group with one row per bucket.
reshape(
  data = with(
    all_combos,
    aggregate(
      month_bucket,
      by = list(bucket = month_bucket, age_group = cut_r),
      FUN = length
    )
  ),
  timevar = "age_group",
  idvar = "bucket",
  direction = "wide"
)