R 基于条件的数据子集的累积和

R 基于条件的数据子集的累积和,r,R,我有我认为是一个简单的R任务,但我有麻烦。基本上,我需要根据另一列的标准进行值的累积和 这里有一个问题,它应该对前面的行进行累积求和,直到达到另一个条件为止。在我提供的示例中,它累加了duration列中的所有值,即condition列中的1和2。示例如下所示 非常感谢您的帮助 duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6) condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2) ac

我有一个自认为很简单的 R 任务,但遇到了麻烦。基本上,我需要根据另一列的条件对某一列的值进行累积求和。

难点在于:应当对前面的行进行累积求和,直到遇到下一个非零条件为止。在我提供的示例中,duration 列的值被逐行累加,并在 condition 列为 1 或 2 的行上给出累计结果。示例如下所示

非常感谢您的帮助

# Example input: `duration` holds the values to accumulate, a non-zero
# `condition` flags the row that closes a run, and `accum_sum` is the
# expected result (the run total on flagged rows, 0 everywhere else).
duration <- c(
  2, 3, 2, 4, 5, 10, 2, 9, 7,
  5, 8, 9, 10, 12, 4, 5, 6
)
condition <- c(
  0, 1, 0, 0, 0, 0, 0, 2, 0,
  0, 0, 0, 1, 0, 0, 0, 2
)
accum_sum <- c(
  0, 5, 0, 0, 0, 0, 0, 32, 0,
  0, 0, 0, 39, 0, 0, 0, 27
)

# Assemble the three vectors into one data frame, one column per vector.
df <- data.frame(
  duration = duration,
  condition = condition,
  accum_sum = accum_sum
)
df
row    duration condition accum_sum
1         2         0         0
2         3         1         5
3         2         0         0
4         4         0         0
5         5         0         0
6        10         0         0
7         2         0         0
8         9         2        32
9         7         0         0
10        5         0         0
11        8         0         0
12        9         0         0
13       10         1        39
14       12         0         0
15        4         0         0
16        5         0         0
17        6         2        27

使用 dplyr,我们可以对 condition 使用 cumsum() 来划分子集,然后在这些子集内求和:

library(dplyr)

# Label each run of rows: a new run starts on the row right after a non-zero
# condition, so a marker row still belongs to the run it closes. Within each
# run, the marker row receives the run's total duration and every other row
# receives 0. The result is left grouped by `condition_group`.
df %>%
    mutate(
        condition_group = 1 + cumsum(lag(condition != 0, default = FALSE))
    ) %>%
    group_by(condition_group) %>%
    mutate(
        accum_sum = ifelse(condition == 0, 0, sum(duration))
    )
输出:

# A tibble: 17 x 4
# Groups:   condition_group [4]
   duration condition accum_sum condition_group
      <dbl>     <dbl>     <dbl>           <dbl>
 1        2         0         0               1
 2        3         1         5               1
 3        2         0         0               2
 4        4         0         0               2
 5        5         0         0               2
 6       10         0         0               2
 7        2         0         0               2
 8        9         2        32               2
 9        7         0         0               3
10        5         0         0               3
11        8         0         0               3
12        9         0         0               3
13       10         1        39               3
14       12         0         0               4
15        4         0         0               4
16        5         0         0               4
17        6         2        27               4
# A tibble: 17 x 4
# Groups:   condition_group [4]
   duration condition accum_sum condition_group
1        2         0         0               1
2        3         1         5               1
3        2         0         0               2
4        4         0         0               2
5        5         0         0               2
6       10         0         0               2
7        2         0         0               2
8        9         2        32               2
9        7         0         0               3
10        5         0         0               3
11        8         0         0               3
12        9         0         0               3
13       10         1        39               3
14       12         0         0               4
15        4         0         0               4
16        5         0         0               4
17        6         2        27               4
使用 data.table:

# setDT() and `:=` come from data.table; load the package so this snippet
# runs on its own instead of failing with "could not find function".
library(data.table)

# Convert df to a data.table in place (by reference, no copy is made).
setDT(df)

# rev(cumsum(rev(condition))) gives every zero row the same label as the
# next non-zero row below it, so each stretch of zeros is grouped with the
# marker row that closes it. cumsum(duration) is then a running total per run.
df[, accum_sum := cumsum(duration), by = rev(cumsum(rev(condition)))]

# Only marker rows keep their total; all other rows are reset to 0.
df[condition == 0, accum_sum := 0]
#    duration condition accum_sum
# 1:        2         0         0
# 2:        3         1         5
# 3:        2         0         0
# 4:        4         0         0
# 5:        5         0         0
# 6:       10         0         0
# 7:        2         0         0
# 8:        9         2        32
# 9:        7         0         0
#10:        5         0         0
#11:        8         0         0
#12:        9         0         0
#13:       10         1        39
#14:       12         0         0
#15:        4         0         0
#16:        5         0         0
#17:        6         2        27
我们使用 rev(cumsum(rev(condition))) 对零值进行向后填充,从而划分出各个连续段(run)。
然后按这个“填充后”的条件进行分组。

希望这有帮助

# Running total of duration inside each run. The grouping key is the
# cumulative condition shifted down by one row (seeded with 0), so a run
# ends on its marker row and the next run starts on the row after it.
df[["cum_sum"]] <- with(
  df,
  ave(duration, c(0, cumsum(head(condition, -1))), FUN = cumsum)
)

# Keep the total only on marker rows; rows whose condition is zero get 0.
df[["cum_sum"]] <- with(df, ifelse(condition == 0, 0, cum_sum))

样本数据:

# Sample data: the two input columns plus the expected `cum_sum` column,
# built as an ordinary 17-row data frame with default (automatic) row names.
df <- data.frame(
  duration = c(
    2, 3, 2, 4, 5, 10, 2, 9, 7,
    5, 8, 9, 10, 12, 4, 5, 6
  ),
  condition = c(
    0, 1, 0, 0, 0, 0, 0, 2, 0,
    0, 0, 0, 1, 0, 0, 0, 2
  ),
  cum_sum = c(
    0, 5, 0, 0, 0, 0, 0, 32, 0,
    0, 0, 0, 39, 0, 0, 0, 27
  )
)

如果将 condition 移位 1,只需使用 tapply 即可:

# Example input: `duration` is accumulated, a non-zero `condition` marks the
# row that closes a run, and `accum_sum` is the expected answer.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)

df <- data.frame(duration, condition, accum_sum)

# Shift the markers down by one row (seeding with FALSE) so each marker row
# stays in the run it closes; the running count of shifted markers labels the
# runs. Counting `condition != 0` rather than summing the raw codes keeps the
# labels distinct whatever the non-zero codes are.
# ave() writes each run's running total back in the original row order and
# returns an unnamed vector, unlike unlist(tapply(...)), which concatenates
# the groups in sorted-label order and leaves group names on the result.
# Multiplying by the marker mask zeroes every non-marker row.
df$want <- ave(
  df$duration,
  cumsum(c(FALSE, head(df$condition != 0, -1))),
  FUN = cumsum
) * (df$condition != 0)
df

双重反转(两次 rev)的做法非常聪明。
accum_sum := sum(duration)
将产生相同的输出。我知道。我还没有对 sum 的循环补齐(recycling)与 cumsum 做过基准对比,但两者的速度应该差不多。
# Example input: `duration` is accumulated, a non-zero `condition` marks the
# row that closes a run, and `accum_sum` is the expected answer.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)

df <- data.frame(duration, condition, accum_sum)

# Shift the markers down by one row (seeding with FALSE) so each marker row
# stays in the run it closes; the running count of shifted markers labels the
# runs. Counting `condition != 0` rather than summing the raw codes keeps the
# labels distinct whatever the non-zero codes are.
# ave() writes each run's running total back in the original row order and
# returns an unnamed vector, unlike unlist(tapply(...)), which concatenates
# the groups in sorted-label order and leaves group names on the result.
# Multiplying by the marker mask zeroes every non-marker row.
df$want <- ave(
  df$duration,
  cumsum(c(FALSE, head(df$condition != 0, -1))),
  FUN = cumsum
) * (df$condition != 0)
df