如何在R中按组执行填充操作

如何在R中按组执行填充操作,r,dplyr,tidyr,R,Dplyr,Tidyr,因此,我尝试使用逻辑将列trip添加到数据帧中。逻辑是,如果变量值为1,则表示跳闸已开始,当变量值为0时,跳闸已结束。由于车辆未运行,介于1和0之间的任何其他变量值无效。在我的实际数据中,有超过百万行包含十几个device\u id 样本数据: dput(structure(list(SampleTime = c("2020-12-03 09:05:50", "2020-12-03 09:05:56", "2020-12-03 09:06:01

因此,我尝试使用逻辑将列
trip
添加到数据框中。逻辑是:当 Variable 为 I 且 Value 为 1 时,表示行程(trip)已开始;当 Variable 为 I 且 Value 为 0 时,表示行程已结束。由于车辆未运行,介于1和0之间的任何其他变量值无效。在我的实际数据中,有超过一百万行数据,包含十几个
device_id。

样本数据:

# Sample data for the question: one row per sensor reading.
#   SampleTime - timestamp of the reading, stored as a character string
#   Value      - numeric reading
#   Variable   - code of the variable being reported ("A", "B", "C", "I")
#   device_id  - device that produced the reading ("AB" or "BC")
# The structure is bound to `df2`, the name the code below reads from, so the
# example can be run as-is.
df2 <- structure(list(SampleTime = c("2020-12-03 09:05:50", "2020-12-03 09:05:56",
"2020-12-03 09:06:01", "2020-12-03 09:06:07", "2020-12-03 09:06:13",
"2020-12-03 09:06:18", "2020-12-03 09:06:19", "2020-12-03 09:06:24",
"2020-12-03 09:06:30", "2020-12-03 09:06:36", "2020-12-03 09:06:42",
"2020-12-03 09:06:47", "2020-12-03 09:06:53", "2020-12-03 09:06:59",
"2020-12-03 09:07:04", "2020-12-03 09:07:10", "2020-12-03 09:07:16",
"2020-12-03 09:07:22", "2020-12-03 09:07:27", "2020-12-03 09:07:33",
"2020-12-03 09:07:39", "2020-12-03 09:07:45", "2020-12-03 09:07:50",
"2020-12-03 09:07:56", "2020-12-03 09:08:02", "2020-12-03 09:08:07",
"2020-12-03 09:08:13", "2020-12-03 09:08:19", "2020-12-03 09:08:25",
"2020-12-03 09:08:30", "2020-12-03 09:08:36", "2020-12-03 09:08:42",
"2020-12-03 09:08:48", "2020-12-03 09:08:53", "2020-12-03 09:08:59",
"2020-12-03 09:09:05", "2020-12-03 09:09:10", "2020-12-03 09:09:16",
"2020-12-03 09:09:22", "2020-12-03 09:09:28"), Value = c(30,
35, 1, 40, 1, 7.5, 45, 1500, 30, 1000, 9.1, 10.6, 20, 1000, 0,
35, 1130, 1, 1580, 0, 45, 1, 45, 60, 30, 1000, 25, 0, 12.1, 50,
11, 11.5, 0, 12, 1, 30, 1600, 25, 2100, 12.1), Variable = c("A",
"A", "I", "A", "I", "C", "A", "B", "A", "B", "C", "C", "A", "B",
"I", "A", "B", "I", "B", "I", "A", "I", "A", "A", "A", "B", "A",
"I", "C", "A", "C", "C", "I", "C", "I", "A", "B", "A", "B", "C"
), device_id = c("BC", "BC", "BC", "BC", "AB", "BC", "BC", "BC",
"AB", "AB", "AB", "BC", "BC", "BC", "BC", "AB", "AB", "BC", "BC",
"AB", "AB", "AB", "AB", "BC", "BC", "BC", "BC", "BC", "BC", "BC",
"AB", "AB", "AB", "AB", "BC", "BC", "AB", "AB", "AB", "BC")), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame"))
我所尝试的:

library(dplyr)
library(tidyr)
# Attempt from the question: label the "I" rows, join the labels back onto the
# full data, then carry each label down to the following rows.
# NOTE: fill() runs after ungroup() on rows ordered by SampleTime only, so a
# label from one device_id is carried into the rows of the other device.
df2 %>%
  filter(Variable == "I") %>%
  group_by(device_id) %>%
  mutate(
    trip = case_when(
      Value == 1 ~ "ON",
      TRUE ~ "OFF"
    )
  ) %>%
  ungroup() %>%
  # No `by` is given, so the join uses every column the two tables share.
  right_join(df2) %>%
  arrange(SampleTime) %>%
  fill(trip, .direction = "down")
Joining, by = c("SampleTime", "Value", "Variable", "device_id")
# A tibble: 40 x 5
   SampleTime           Value Variable device_id trip 
   <chr>                <dbl> <chr>    <chr>     <chr>
 1 2020-12-03 09:05:50   30   A        BC        NA   
 2 2020-12-03 09:05:56   35   A        BC        NA   
 3 2020-12-03 09:06:01    1   I        BC        ON   
 4 2020-12-03 09:06:07   40   A        BC        ON   
 5 2020-12-03 09:06:13    1   I        AB        ON   
 6 2020-12-03 09:06:18    7.5 C        BC        ON   
 7 2020-12-03 09:06:19   45   A        BC        ON   
 8 2020-12-03 09:06:24 1500   B        BC        ON   
 9 2020-12-03 09:06:30   30   A        AB        ON   
10 2020-12-03 09:06:36 1000   B        AB        ON   
11 2020-12-03 09:06:42    9.1 C        AB        ON   
12 2020-12-03 09:06:47   10.6 C        BC        ON   
13 2020-12-03 09:06:53   20   A        BC        ON   
14 2020-12-03 09:06:59 1000   B        BC        ON   
15 2020-12-03 09:07:04    0   I        BC        OFF  
16 2020-12-03 09:07:10   35   A        AB        OFF  
17 2020-12-03 09:07:16 1130   B        AB        OFF  
18 2020-12-03 09:07:22    1   I        BC        ON   
19 2020-12-03 09:07:27 1580   B        BC        ON   
20 2020-12-03 09:07:33    0   I        AB        OFF  
21 2020-12-03 09:07:39   45   A        AB        OFF  
22 2020-12-03 09:07:45    1   I        AB        ON   
23 2020-12-03 09:07:50   45   A        AB        ON   
24 2020-12-03 09:07:56   60   A        BC        ON   
25 2020-12-03 09:08:02   30   A        BC        ON   
26 2020-12-03 09:08:07 1000   B        BC        ON   
27 2020-12-03 09:08:13   25   A        BC        ON   
28 2020-12-03 09:08:19    0   I        BC        OFF  
29 2020-12-03 09:08:25   12.1 C        BC        OFF  
30 2020-12-03 09:08:30   50   A        BC        OFF  
31 2020-12-03 09:08:36   11   C        AB        OFF  
32 2020-12-03 09:08:42   11.5 C        AB        OFF  
33 2020-12-03 09:08:48    0   I        AB        OFF  
34 2020-12-03 09:08:53   12   C        AB        OFF  
35 2020-12-03 09:08:59    1   I        BC        ON   
36 2020-12-03 09:09:05   30   A        BC        ON   
37 2020-12-03 09:09:10 1600   B        AB        ON   
38 2020-12-03 09:09:16   25   A        AB        ON   
39 2020-12-03 09:09:22 2100   B        AB        ON   
40 2020-12-03 09:09:28   12.1 C        BC        ON  
库(dplyr)
图书馆(tidyr)
df2%%>%筛选器(变量=='I')%%>%分组依据(设备id)%%>%
mutate(trip=case_,当(Value==1~'ON',TRUE~'OFF'))%%>%ungroup()%%>%
右键连接(df2)%%>%排列(采样时间)%%>%填充(行程,.方向='down')
连接,通过=c(“采样时间”、“值”、“变量”、“设备id”)
#一个tibble:40x5
采样时间值可变装置\u id跳闸
1 2020-12-03 09:05:50公元前30年北美
2020-12-03 09:05:56公元前35年北美
3 2020-12-03 09:06:01公元前1世纪
4 2020-12-03 09:06:07公元前40年
5 2020-12-03 09:06:13 1我在
6 2020-12-03 09:06:18公元前7.5度
7 2020-12-03 09:06:19公元前45年
8 2020-12-03 09:06:24公元前1500年
9 2020-12-03 09:06:30北纬30度
10 2020-12-03 09:06:36公元前1000年
11 2020-12-03 09:06:42 9.1摄氏度
12 2020-12-03 09:06:47公元前10.6年
13 2020-12-03 09:06:53公元前20年
14 2020-12-03 09:06:59公元前1000年
15 2020-12-03 09:07:04公元前0度关闭
16 2020-12-03 09:07:10 35 A AB关闭
17 2020-12-03 09:07:16 1130 B AB关闭
18 2020-12-03 09:07:22公元前1世纪
19 2020-12-03 09:07:27公元前1580年
20 2020-12-03 09:07:33我要离开
21 2020-12-03 09:07:39 45 A AB关闭
22 2020-12-03 09:07:45 1我在
23 2020-12-03 09:07:50北纬45度
24 2020-12-03 09:07:56公元前60年
25 2020-12-03 09:08:02公元前30年
26 2020-12-03 09:08:07公元前1000年
27 2020-12-03 09:08:13公元前25年
28 2020-12-03 09:08:19西元前0分
29 2020-12-03 09:08:25公元前12.1度关闭
30 2020-12-03 09:08:30北纬50度
31 2020-12-03 09:08:36 11 C AB关闭
32 2020-12-03 09:08:42 11.5摄氏度关闭
33 2020-12-03 09:08:48我要离开
34 2020-12-03 09:08:53 12 C AB关闭
35 2020-12-03 09:08:59公元前1世纪
36 2020-12-03 09:09:05公元前30年
37 2020-12-03 09:09:10公元前1600年
38 2020-12-03 09:09:16 25 A AB接通
39 2020-12-03 09:09:22 2100年B月B日
40 2020-12-03 09:09:28公元前12.1年

如上所示,在第16行和第17行,device_id 为 AB 的 trip 值应为 ON,因为从第5行开始的 AB 行程仍在进行中。AB 在第31行和第32行的情况也是如此。请帮我找到正确的代码。

虽然我不完全理解您的描述,但我的建议如下:

library(tidyverse)

# Label every row with the trip state of its own device.
# An ignition event is a row with Variable == "I" and Value 0 or 1. Each event
# opens a new segment that lasts until the same device's next event.
# `df2` is the sample data frame used in the question.
res <- df2 %>%
  group_by(device_id) %>%
  arrange(device_id, SampleTime) %>%
  # Per-device running count of ignition events = segment id within the device.
  mutate(trip_pre = cumsum(Variable == "I" & Value %in% c(0, 1))) %>%
  # Segment ids restart for every device, so device_id has to stay in the
  # grouping: grouping by trip_pre alone replaces the device grouping and
  # merges same-numbered segments of different devices.
  group_by(device_id, trip_pre) %>%
  # The first row of a segment is the event that opened it. Rows before a
  # device's first event (trip_pre == 0) have no opening event and get "OFF".
  mutate(trip = if_else(first(Variable == "I" & Value == 1), "ON", "OFF")) %>%
  ungroup()
库(tidyverse)
res%
分组依据(设备id)%>%
排列(设备id,采样时间)%>%
突变(trip_pre=cumsum(值%c(0,1))中的%)%>%
组员(出行前)%>%
突变(跳闸=如果其他(第一个(值==1),“开”,“关”))%>%
解组()

虽然我不完全理解您的描述,但我的建议如下:

library(tidyverse)

# Assign a trip state ("ON"/"OFF") to every row, separately for each device.
# Rows with Variable == "I" and Value 0 or 1 are ignition events; the rows
# from one event up to the device's next event form one segment.
# `df2` is the sample data frame used in the question.
res <- df2 %>%
  group_by(device_id) %>%
  arrange(device_id, SampleTime) %>%
  # Number the segments of each device by counting its ignition events so far.
  mutate(trip_pre = cumsum(Variable == "I" & Value %in% c(0, 1))) %>%
  # Keep device_id in the grouping: the segment numbers are only unique within
  # a device, and grouping by trip_pre alone would drop the device grouping.
  group_by(device_id, trip_pre) %>%
  # A segment is "ON" when the event that opens it is a start (Value == 1);
  # the rows before a device's first event fall through to "OFF".
  mutate(trip = if_else(first(Variable == "I" & Value == 1), "ON", "OFF")) %>%
  ungroup()
库(tidyverse)
res%
分组依据(设备id)%>%
排列(设备id,采样时间)%>%
突变(trip_pre=cumsum(值%c(0,1))中的%)%>%
组员(出行前)%>%
突变(跳闸=如果其他(第一个(值==1),“开”,“关”))%>%
解组()
这个策略可行吗?

library(tidyverse)
# Label each row with its device's trip state by filling ignition events down.
# Only rows with Variable == "I" are events: Value 1 starts a trip, Value 0
# ends it. Unlike a running sum of +1/-1, this does not drift when a start or
# stop event is repeated, when a device's log begins with a stop event, or
# when some other variable happens to report 0 or 1.
# `df2` is the sample data frame used in the question.
df2 %>%
  # Remember the incoming row order so it can be restored at the end.
  mutate(rowid = row_number()) %>%
  group_by(device_id) %>%
  arrange(device_id, SampleTime) %>%
  mutate(
    trip = case_when(
      Variable == "I" & Value == 1 ~ "ON",
      Variable == "I" & Value == 0 ~ "OFF",
      TRUE ~ NA_character_
    )
  ) %>%
  # The data is still grouped by device_id, so fill() carries each label down
  # within a device only.
  fill(trip, .direction = "down") %>%
  # Rows before a device's first event have nothing to inherit: treat as "OFF".
  mutate(trip = coalesce(trip, "OFF")) %>%
  ungroup() %>%
  arrange(rowid) %>%
  select(-rowid)

# A tibble: 40 x 5
   SampleTime           Value Variable device_id trip 
   <chr>                <dbl> <chr>    <chr>     <chr>
 1 2020-12-03 09:05:50   30   A        BC        OFF  
 2 2020-12-03 09:05:56   35   A        BC        OFF  
 3 2020-12-03 09:06:01    1   I        BC        ON   
 4 2020-12-03 09:06:07   40   A        BC        ON   
 5 2020-12-03 09:06:13    1   I        AB        ON   
 6 2020-12-03 09:06:18    7.5 C        BC        ON   
 7 2020-12-03 09:06:19   45   A        BC        ON   
 8 2020-12-03 09:06:24 1500   B        BC        ON   
 9 2020-12-03 09:06:30   30   A        AB        ON   
10 2020-12-03 09:06:36 1000   B        AB        ON   
# ... with 30 more rows
库(tidyverse)
df%>%变异(rowid=row_number())%>%
分组依据(设备id)%%>%排列(设备id,采样时间)%%>%
mutate(dummy=ifelse(Value==1,1,ifelse(Value==0,-1,0)),
dummy=总和(dummy),
跳闸=ifelse(虚拟==1,“开”,“关”))%>%
解组()%%>%arrange(rowid)%%>%select(-rowid,-dummy)
#一个tibble:40x5
采样时间值可变装置\u id跳闸
112020-12-03 09:05:50公元前30:30关闭
2 2020-12-03 09:05:56公元前35年A点
3 2020-12-03 09:06:01公元前1世纪
4 2020-12-03 09:06:07公元前40年
5 2020-12-03 09:06:13 1我在
6 2020-12-03 09:06:18公元前7.5度
7 2020-12-03 09:06:19公元前45年
8 2020-12-03 09:06:24公元前1500年
9 2020-12-03 09:06:30北纬30度
10 2020-12-03 09:06:36公元前1000年
# ... 还有30行
这个策略可行吗?

library(tidyverse)
# Derive a per-device trip state for every row from the ignition events.
# An event is a row with Variable == "I" (Value 1 = trip starts, Value 0 =
# trip ends); every other row inherits the state set by the device's latest
# event. Labelling events and filling them down stays correct even if events
# do not strictly alternate, which a +1/-1 running sum does not.
# `df2` is the sample data frame used in the question.
df2 %>%
  # Keep the original row position to undo the per-device sort afterwards.
  mutate(rowid = row_number()) %>%
  group_by(device_id) %>%
  arrange(device_id, SampleTime) %>%
  mutate(
    trip = case_when(
      Variable == "I" & Value == 1 ~ "ON",
      Variable == "I" & Value == 0 ~ "OFF",
      TRUE ~ NA_character_
    )
  ) %>%
  # Grouped by device_id, so labels are never carried across devices.
  fill(trip, .direction = "down") %>%
  # Nothing to fill from before a device's first event: default to "OFF".
  mutate(trip = coalesce(trip, "OFF")) %>%
  ungroup() %>%
  arrange(rowid) %>%
  select(-rowid)

# A tibble: 40 x 5
   SampleTime           Value Variable device_id trip 
   <chr>                <dbl> <chr>    <chr>     <chr>
 1 2020-12-03 09:05:50   30   A        BC        OFF  
 2 2020-12-03 09:05:56   35   A        BC        OFF  
 3 2020-12-03 09:06:01    1   I        BC        ON   
 4 2020-12-03 09:06:07   40   A        BC        ON   
 5 2020-12-03 09:06:13    1   I        AB        ON   
 6 2020-12-03 09:06:18    7.5 C        BC        ON   
 7 2020-12-03 09:06:19   45   A        BC        ON   
 8 2020-12-03 09:06:24 1500   B        BC        ON   
 9 2020-12-03 09:06:30   30   A        AB        ON   
10 2020-12-03 09:06:36 1000   B        AB        ON   
# ... with 30 more rows
库(tidyverse)
df%>%变异(rowid=row_number())%>%
分组依据(设备id)%%>%排列(设备id,采样时间)%%>%
mutate(dummy=ifelse(Value==1,1,ifelse(Value==0,-1,0)),
dummy=总和(dummy),
跳闸=ifelse(虚拟==1,“开”,“关”))%>%
解组()%%>%arrange(rowid)%%>%select(-rowid,-dummy)
#一个tibble:40x5
采样时间值可变装置\u id跳闸
112020-12-03 09:05:50公元前30:30关闭
2 2020-12-03 09:05:56公元前35年A点
3 2020-12-03 09:06:01公元前1世纪
4 2020-12-03 09:06:07公元前40年
5 2020-12-03 09:06:13 1我在
6 2020-12-03 09:06:18公元前7.5度