R 基于条件的数据子集的累积和
我有我认为是一个简单的R任务,但我有麻烦。基本上,我需要根据另一列的标准进行值的累积和 这里有一个问题,它应该对前面的行进行累积求和,直到达到另一个条件为止。在我提供的示例中,它累加了duration列中的所有值,即condition列中的1和2。示例如下所示 非常感谢您的帮助R 基于条件的数据子集的累积和,r,R,我有我认为是一个简单的R任务,但我有麻烦。基本上,我需要根据另一列的标准进行值的累积和 这里有一个问题,它应该对前面的行进行累积求和,直到达到另一个条件为止。在我提供的示例中,它累加了duration列中的所有值,即condition列中的1和2。示例如下所示 非常感谢您的帮助 duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6) condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2) ac
duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6)
condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2)
accum_sum <- c(0,5,0,0,0,0,0,32,0,0,0,0,39,0,0,0,27)
df <- data.frame(duration,condition,accum_sum)
df
row duration condition accum_sum
1 2 0 0
2 3 1 5
3 2 0 0
4 4 0 0
5 5 0 0
6 10 0 0
7 2 0 0
8 9 2 32
9 7 0 0
10 5 0 0
11 8 0 0
12 9 0 0
13 10 1 39
14 12 0 0
15 4 0 0
16 5 0 0
17 6 2 27
duration使用dplyr
,我们可以在condition
上使用cumsum()。然后在这些子集中添加:
library(dplyr)
df %>%
mutate(condition_group = cumsum(lag(condition, default = 0) != 0) + 1) %>%
group_by(condition_group) %>%
mutate(accum_sum = ifelse(condition != 0,
sum(duration),
0))
输出:
# A tibble: 17 x 4
# Groups: condition_group [4]
duration condition accum_sum condition_group
<dbl> <dbl> <dbl> <dbl>
1 2 0 0 1
2 3 1 5 1
3 2 0 0 2
4 4 0 0 2
5 5 0 0 2
6 10 0 0 2
7 2 0 0 2
8 9 2 32 2
9 7 0 0 3
10 5 0 0 3
11 8 0 0 3
12 9 0 0 3
13 10 1 39 3
14 12 0 0 4
15 4 0 0 4
16 5 0 0 4
17 6 2 27 4
#一个tible:17 x 4
#组:条件组[4]
持续时间条件累计和条件组
1 2 0 0 1
2 3 1 5 1
3 2 0 0 2
4 4 0 0 2
5 5 0 0 2
6 10 0 0 2
7 2 0 0 2
8 9 2 32 2
9 7 0 0 3
10 5 0 0 3
11 8 0 0 3
12 9 0 0 3
13 10 1 39 3
14 12 0 0 4
15 4 0 0 4
16 5 0 0 4
17 6 2 27 4
使用数据表:
setDT(df)
df[, accum_sum := cumsum(duration), by = rev(cumsum(rev(condition)))]
df[condition == 0, accum_sum := 0]
# duration condition accum_sum
# 1: 2 0 0
# 2: 3 1 5
# 3: 2 0 0
# 4: 4 0 0
# 5: 5 0 0
# 6: 10 0 0
# 7: 2 0 0
# 8: 9 2 32
# 9: 7 0 0
#10: 5 0 0
#11: 8 0 0
#12: 9 0 0
#13: 10 1 39
#14: 12 0 0
#15: 4 0 0
#16: 5 0 0
#17: 6 2 27
我们通过使用rev(cumsum(rev(condition)))向后填充零来创建运行。
然后根据这个“填充”条件分组。希望这有帮助
#cumulative sum
df$cum_sum <- ave(df$duration, c(0, cumsum(df$condition[-nrow(df)])), FUN = cumsum)
#replace all zero condition row with zero value in cumulative sum column
df$cum_sum <- ifelse(df$condition == 0, 0, df$cum_sum)
样本数据:
df <- structure(list(duration = c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8,
9, 10, 12, 4, 5, 6), condition = c(0, 1, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 1, 0, 0, 0, 2), cum_sum = c(0, 5, 0, 0, 0, 0, 0, 32,
0, 0, 0, 0, 39, 0, 0, 0, 27)), .Names = c("duration", "condition",
"cum_sum"), row.names = c(NA, -17L), class = "data.frame")
df如果将条件移位1,只需使用tapply即可
duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6)
condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2)
accum_sum <- c(0,5,0,0,0,0,0,32,0,0,0,0,39,0,0,0,27)
df <- data.frame(duration,condition,accum_sum)
df$want <- unlist(tapply(df$duration,
INDEX = cumsum(c(df$condition[1], head(df$condition, -1))),
cumsum)) * ifelse(df$condition == 0, 0, 1)
df
duration双反转非常聪明。accum\u sum:=sum(duration)
将产生相同的输出。我知道。我还没有将循环利用与cumsum进行对比,但两者的速度应该差不多。
duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6)
condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2)
accum_sum <- c(0,5,0,0,0,0,0,32,0,0,0,0,39,0,0,0,27)
df <- data.frame(duration,condition,accum_sum)
df$want <- unlist(tapply(df$duration,
INDEX = cumsum(c(df$condition[1], head(df$condition, -1))),
cumsum)) * ifelse(df$condition == 0, 0, 1)
df