R 如果满足多个条件,则具有重置选项的累积和
如果满足多个条件,我将尝试使用重置选项进行累计和。更具体地说,我想将变量R 如果满足多个条件,则具有重置选项的累积和,r,dplyr,R,Dplyr,如果满足多个条件,我将尝试使用重置选项进行累计和。更具体地说,我想将变量amount和count累加起来,按id分组,如果满足这两个条件,则重新设置/从0开始:amount>=10和count=3。我还想创建一个新列,如果满足这些条件,则包含1,否则包含0 数据样本: df <- data.frame( date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-01", &
amount
和count
累加起来,按id
分组,如果满足这两个条件,则重新设置/从0开始:amount
>=10和count
=3。我还想创建一个新列,如果满足这些条件,则包含1,否则包含0
数据样本:
df <- data.frame(
date = as.Date(c("2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01", "2020-01-01", "2020-02-01", "2020-03-01", "2020-04-01", "2020-05-01", "2020-06-01")),
id = c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "C", "C", "C", "C", "C", "C"),
amount = c(1, 9, 5, 5, 6, 2, 10, 4, 8, 10, 6, 5, 5, 1, 6, 5, 5, 5),
count = c(0, 2, 5, 4, 5, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 2, 1, 0)
)
或者,或者:
df %>% group_by(id) %>%
mutate(
amount_cumsum = purrr::accumulate(.x = amount, .f = ~ case_when(.x < 10 ~ .x + .y, TRUE ~ .y)),
count_cumsum = purrr::accumulate(.x = count, .f = ~ case_when(.x < 3 ~ .x + .y, TRUE ~ .y)),
condition_met = as.integer(amount_cumsum >= 10 & count_cumsum >= 3)
)
df%>%group\u by(id)%>%
变异(
金额_cumsum=purrr::累计(.x=amount,.f=~case_当(.x<10~.x+.y,TRUE~.y)),
count_cumsum=purrr::累计(.x=count,.f=~case_当(.x<3~.x+.y,TRUE~.y)),
条件满足=整数(金额总和>=10,计数总和>=3)
)
如果一个变量满足条件,上面的答案将重置累积和,但不考虑是否满足其他条件。我没有解决方案,但您可以从查看
mess::cumsumbining
函数开始,该函数或多或少就是您要找的。问题是只接受一个条件,我不知道如何将金额和计数条件汇总成一个条件
例如,如果您只查找count>=3
,则可以执行以下操作:
df %>%
group_by(id,group=cumsumbinning(count,3)) %>%
mutate(count_cumsum=cumsum(count))
# A tibble: 18 x 6
# Groups: id, group [10]
date id amount count group count_cumsum
<date> <fct> <dbl> <dbl> <int> <dbl>
1 2020-01-01 A 1 1 1 1
2 2020-02-01 A 9 3 2 3
3 2020-03-01 A 5 1 3 1
4 2020-04-01 A 5 1 3 2
5 2020-05-01 A 6 4 4 4
6 2020-06-01 A 2 1 5 1
7 2020-01-01 B 10 0 5 0
8 2020-02-01 B 4 0 5 0
9 2020-03-01 B 8 0 5 0
10 2020-04-01 B 10 0 5 0
11 2020-05-01 B 6 2 5 2
12 2020-06-01 B 5 1 6 1
13 2020-01-01 C 5 1 6 1
14 2020-02-01 C 1 1 6 2
15 2020-03-01 C 6 1 7 1
16 2020-04-01 C 5 2 7 3
17 2020-05-01 C 5 1 8 1
18 2020-06-01 C 5 0 8 1
df%>%
分组依据(id,group=Cumsumbining(计数,3))%>%
变异(count_cumsum=cumsum(count))
#A tibble:18x6
#组:id,组[10]
日期id金额计数组计数总和
1202-01-01A1
2 2020-02-01 A 9 3 2 3
3 2020-03-01 A 5 1 3 1
4 2020-04-01 A 5 1 3 2
52020-05-01A4
6 2020-06-01 A 2 1 5 1
7 2020-01-01 B 10 0 5 0
8 2020-02-01 B 4 0 5 0
9 2020-03-01 B 80 5 0
102020-04-01B1050
11 2020-05-01 B 6 2 5 2
12 2020-06-01 B 5 1 6 1
13 2020-01-01 C 5 1 6 1
14 2020-02-01 C1 1 6 2
15 2020-03-01 C 6 1 7 1
16 2020-04-01 C 5 2 7 3
172020-05-01C5181
182020-06-01C5081
事实上,你所要求的更为困难,因为你希望在达到极限后进行重置
我知道这只是局部的,但我希望它能帮助你 我终于明白了。帮我解决了这个问题
df <- df %>%
group_by(id) %>%
nest(data = c(amount, count)) %>%
mutate(
data_accumulate = purrr::accumulate(.x = data, .f = function(.x, .y) if (max(.x[1]) < 10 | max(.x[2]) < 3) .x + .y else .y)
) %>%
unnest(cols = c(data_accumulate)) %>%
rename(amount_cumsum = amount, count_cumsum = count) %>%
unnest(cols = c(data)) %>%
mutate(condition_met = case_when(
amount_cumsum >= 10 & count_cumsum >= 3 ~ 1,
TRUE ~ 0)
)
df%
分组依据(id)%>%
嵌套(数据=c(数量、计数))%>%
变异(
如果(max(.x[1])<10 | max(.x[2])<3.x+.y else.y),则data_acculate=purrr::acculate(.x=data,.f=function(.x,.y)
) %>%
unnest(cols=c(数据_累计))%>%
重命名(金额总和=金额,计数总和=计数)%>%
unnest(cols=c(数据))%>%
变异(条件满足=情况满足时)(
金额>=10,计数>=3~1,
对(0)
)
为base-R解决方案做出贡献:
df$amount_cumsum <- 0
df$count_cumsum <- 0
df$condition_met <- 0
reset = F
for (i in 1:nrow(df)) {
if (i == 1 | reset) {
df$amount_cumsum[i] = df$amount[i]
df$count_cumsum[i] = df$count[i]
reset = F
} else if (df$id[i] != df$id[i-1]) {
df$amount_cumsum[i] = df$amount[i]
df$count_cumsum[i] = df$count[i]
reset = F
} else {
df$amount_cumsum[i] = df$amount_cumsum[i-1] + df$amount[i]
df$count_cumsum[i] = df$count_cumsum[i-1] + df$count[i]
}
if (df$amount_cumsum[i] >= 10 & df$count_cumsum[i] >= 3) {
df$condition_met[i] = 1
reset = T
}
}
嗨,谢谢你的回答!我刚刚用作者删除的答案更新了我的问题。这个答案几乎解决了我的问题,与您的方法类似,但使用了
purr
包。是的,这个答案与我的问题相同,因为purr::accumulate
不能(或者我不知道如何)使用多种条件。感谢您的回复-事实上,您的base
解决方案比dplyr
解决方案快得多,但对于更大的数据集(+200万个观察值和+700.000个唯一组/id),不幸的是它不起作用:dplyr
解决方案花了13,55分钟进行计算,而base
溶液即使在1,81小时后也没有完成计算。我已将你的答案标记为正确,因为我已在较小的样本上进行了测试,结果有效。谢谢
df <- df %>%
group_by(id) %>%
nest(data = c(amount, count)) %>%
mutate(
data_accumulate = purrr::accumulate(.x = data, .f = function(.x, .y) if (max(.x[1]) < 10 | max(.x[2]) < 3) .x + .y else .y)
) %>%
unnest(cols = c(data_accumulate)) %>%
rename(amount_cumsum = amount, count_cumsum = count) %>%
unnest(cols = c(data)) %>%
mutate(condition_met = case_when(
amount_cumsum >= 10 & count_cumsum >= 3 ~ 1,
TRUE ~ 0)
)
df$amount_cumsum <- 0
df$count_cumsum <- 0
df$condition_met <- 0
reset = F
for (i in 1:nrow(df)) {
if (i == 1 | reset) {
df$amount_cumsum[i] = df$amount[i]
df$count_cumsum[i] = df$count[i]
reset = F
} else if (df$id[i] != df$id[i-1]) {
df$amount_cumsum[i] = df$amount[i]
df$count_cumsum[i] = df$count[i]
reset = F
} else {
df$amount_cumsum[i] = df$amount_cumsum[i-1] + df$amount[i]
df$count_cumsum[i] = df$count_cumsum[i-1] + df$count[i]
}
if (df$amount_cumsum[i] >= 10 & df$count_cumsum[i] >= 3) {
df$condition_met[i] = 1
reset = T
}
}
library(tidyverse)
dates = seq(as.Date("2019-01-01"), as.Date("2020-03-04"), by="days")
df <- data.frame(
date = c(sample(dates, 300), sample(dates, 400), sample(dates, 350)),
id = c(rep("A", 300), rep("B", 400), rep("C", 350)),
amount = floor(runif(1050, 0, 15)),
count = floor(runif(1050, 0, 5)),
stringsAsFactors = F
)
rbenchmark::benchmark(
"Tidy Solution" = {
df_tidy <- df %>%
group_by(id) %>%
nest(data = c(amount, count)) %>%
mutate(
data_accumulate = purrr::accumulate(.x = data, .f = function(.x, .y) if (max(.x[1]) < 10 | max(.x[2]) < 3) .x + .y else .y)
) %>%
unnest(cols = c(data_accumulate)) %>%
rename(amount_cumsum = amount, count_cumsum = count) %>%
unnest(cols = c(data)) %>%
mutate(condition_met = case_when(
amount_cumsum >= 10 & count_cumsum >= 3 ~ 1,
TRUE ~ 0)
)
},
"Base-R Solution" = {
df_base <- df
df_base$amount_cumsum <- 0
df_base$count_cumsum <- 0
df_base$condition_met <- 0
reset = F # to reset the counters
for (i in 1:nrow(df_base)) {
if (i == 1 | reset) {
df_base$amount_cumsum[i] = df_base$amount[i]
df_base$count_cumsum[i] = df_base$count[i]
reset = F
} else if (df_base$id[i] != df_base$id[i-1]) {
df_base$amount_cumsum[i] = df_base$amount[i]
df_base$count_cumsum[i] = df_base$count[i]
reset = F
} else {
df_base$amount_cumsum[i] = df_base$amount_cumsum[i-1] + df_base$amount[i]
df_base$count_cumsum[i] = df_base$count_cumsum[i-1] + df_base$count[i]
}
if (df_base$amount_cumsum[i] >= 10 & df_base$count_cumsum[i] >= 3) {
df_base$condition_met[i] = 1
reset = T
}
}
},
replications = 100)
gc()
test replications elapsed relative user.self sys.self user.child sys.child
Base-R Solution 100 3.89 1.000 3.69 0.0 NA NA
Tidy Solution 100 84.00 21.594 78.65 0.2 NA NA