基于条件的数据子集的累积总和

时间:2018-07-25 06:53:20

标签: r

我有一个简单的R任务,但遇到了麻烦。基本上,我需要根据另一列的标准对值进行累加。

难点在于:它应该对前面的若干行进行累加,直到遇到下一个条件为止。在我提供的示例中,每当condition列的值为1或2时,就把上一个非零条件之后、直到当前行(含当前行)的duration值累加起来,结果写入accum_sum列;其余各行为0。示例如下所示。

非常感谢您的帮助

# Example data from the question: one duration per row, a condition code
# (non-zero on the rows where a total is expected), and the expected result.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)

# Bundle the three vectors into a data frame, one column per vector, and
# print it.
df <- data.frame(
  duration = duration,
  condition = condition,
  accum_sum = accum_sum
)
df
row    duration condition accum_sum
1         2         0         0
2         3         1         5
3         2         0         0
4         4         0         0
5         5         0         0
6        10         0         0
7         2         0         0
8         9         2        32
9         7         0         0
10        5         0         0
11        8         0         0
12        9         0         0
13       10         1        39
14       12         0         0
15        4         0         0
16        5         0         0
17        6         2        27

4 个答案:

答案 0 :(得分:4)

使用data.table:

# Attach data.table so that setDT() and the `:=` operator are available.
library(data.table)

# Convert df to a data.table in place.
setDT(df)
# Group key: the reversed cumulative sum of `condition`. With positive
# condition codes, every run of zero rows gets the same key as the non-zero
# row that ends the run. Within each group, store the running total of
# `duration` in `accum_sum`.
df[, accum_sum := cumsum(duration), by = rev(cumsum(rev(condition)))]
# Only the rows with a non-zero condition should report a total; reset the
# intermediate running totals on all other rows to 0.
df[condition == 0, accum_sum := 0]
#    duration condition accum_sum
# 1:        2         0         0
# 2:        3         1         5
# 3:        2         0         0
# 4:        4         0         0
# 5:        5         0         0
# 6:       10         0         0
# 7:        2         0         0
# 8:        9         2        32
# 9:        7         0         0
#10:        5         0         0
#11:        8         0         0
#12:        9         0         0
#13:       10         1        39
#14:       12         0         0
#15:        4         0         0
#16:        5         0         0
#17:        6         2        27

我们用rev(cumsum(rev(condition)))把零值“向后填充”:每个零值行都会得到与其后第一个非零条件行相同的分组值,从而构造出连续的分段(run),然后按这个“填充后”的条件进行分组。

答案 1 :(得分:2)

希望这会有所帮助!

# Running total of duration within each run of rows, reported only on the
# rows whose condition is non-zero; every other row gets 0.
df$cum_sum <- with(df, {
  # Lag the cumulative condition total by one row so that a non-zero
  # condition row stays in the same group as the zero rows preceding it.
  run_id <- c(0, cumsum(condition[-length(condition)]))
  ifelse(condition == 0, 0, ave(duration, run_id, FUN = cumsum))
})

给出

   duration condition cum_sum
1         2         0       0
2         3         1       5
3         2         0       0
4         4         0       0
5         5         0       0
6        10         0       0
7         2         0       0
8         9         2      32
9         7         0       0
10        5         0       0
11        8         0       0
12        9         0       0
13       10         1      39
14       12         0       0
15        4         0       0
16        5         0       0
17        6         2      27


示例数据

# Sample data: 17 rows with the input columns `duration` and `condition`
# plus the expected `cum_sum` result.
df <- data.frame(
  duration = c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6),
  condition = c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2),
  cum_sum = c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)
)

答案 2 :(得分:1)

使用dplyr,我们可以对condition使用cumsum()来记录到目前为止已经出现了多少个非零条件,然后在由此得到的各个子集内对duration求和:

library(dplyr)

# Number the runs: a new run starts on the row after each non-zero condition
# (lag() shifts `condition` down one row and pads the first row with 0).
# Within each run, rows with a non-zero condition receive the run's total
# duration and all other rows receive 0. The result is printed, not assigned,
# and is still grouped by `condition_group`.
df %>%
  mutate(
    condition_group = 1 + cumsum(lag(condition, default = 0) != 0)
  ) %>%
  group_by(condition_group) %>%
  mutate(accum_sum = ifelse(condition == 0, 0, sum(duration)))

输出:

# A tibble: 17 x 4
# Groups:   condition_group [4]
   duration condition accum_sum condition_group
      <dbl>     <dbl>     <dbl>           <dbl>
 1        2         0         0               1
 2        3         1         5               1
 3        2         0         0               2
 4        4         0         0               2
 5        5         0         0               2
 6       10         0         0               2
 7        2         0         0               2
 8        9         2        32               2
 9        7         0         0               3
10        5         0         0               3
11        8         0         0               3
12        9         0         0               3
13       10         1        39               3
14       12         0         0               4
15        4         0         0               4
16        5         0         0               4
17        6         2        27               4

答案 3 :(得分:0)

如果将条件列向后移动一行(即滞后1行),则只需使用tapply即可。

# Example data from the question; `accum_sum` holds the expected result.
duration <- c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8, 9, 10, 12, 4, 5, 6)
condition <- c(0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 2)
accum_sum <- c(0, 5, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 39, 0, 0, 0, 27)

df <- data.frame(
  duration = duration,
  condition = condition,
  accum_sum = accum_sum
)

# Compute `want`: the running total of duration within each run, kept only on
# rows whose condition is non-zero.
df$want <- with(df, {
  # Shift the condition column down by one row (the first value is repeated)
  # so that its cumulative sum changes on the row *after* each non-zero
  # condition; that cumulative sum is the grouping index.
  shifted <- c(condition[1], condition[-length(condition)])
  running <- unlist(tapply(duration, INDEX = cumsum(shifted), FUN = cumsum))
  # Multiply by a 0/1 mask to zero out rows where condition is 0.
  running * ifelse(condition == 0, 0, 1)
})
df