R data.table:在带有分组依据和条件的移动窗口上应用函数
我有一个 data.table `DT`,其中包含时间序列(date)、标志(flag)以及若干分组列(类别、品牌、尺寸等)。我希望按类别分组、按日期排序,在大小为 4(可配置)的移动窗口上应用 min 函数(可配置):如果某个窗口内的所有标志均为 FALSE,则找出该窗口内的最小值并将其标记为 TRUE。如何用 data.table 的方式实现,而不使用大量 for 循环?(标签:r, dataframe, data.table)原始表格如下:
# Example input: one row per (date, category) with a numeric `value` and a
# logical `flag`. Dates are written as dd.mm.yyyy.
zz <- "index date value category flag
1 01.01.2018 20 green FALSE
2 01.01.2018 8 RED FALSE
3 02.01.2018 21 green FALSE
4 02.01.2018 5 RED FALSE
5 03.01.2018 19 green FALSE
6 03.01.2018 5 RED TRUE
7 04.01.2018 17 green FALSE
8 04.01.2018 7 RED FALSE
9 05.01.2018 19 green FALSE
10 05.01.2018 8 RED FALSE
11 06.01.2018 18 green FALSE
12 06.01.2018 8 RED FALSE
13 07.01.2018 17 green FALSE
14 07.01.2018 8 RED FALSE
15 08.01.2018 16 green TRUE
16 08.01.2018 4 RED TRUE
17 09.01.2018 15 green TRUE
18 09.01.2018 4 RED FALSE
19 10.01.2018 14 green TRUE
20 10.01.2018 6 RED FALSE
21 11.01.2018 13 green TRUE
22 11.01.2018 8 RED FALSE
23 12.01.2018 14 green FALSE
24 12.01.2018 9 RED FALSE
25 13.01.2018 13 green TRUE
26 13.01.2018 5 RED TRUE
27 14.01.2018 14 green FALSE
28 14.01.2018 6 RED FALSE
29 15.01.2018 12 green TRUE
30 15.01.2018 4 RED FALSE
31 16.01.2018 14 green FALSE
32 16.01.2018 4 RED TRUE
33 17.01.2018 13 green TRUE
34 17.01.2018 2 RED TRUE"

# Parse the whitespace-separated text into a data.frame. `stringsAsFactors` is
# set explicitly so that `date` and `category` are character columns on every
# R version (before R 4.0 the default would turn them into factors).
Data <- read.table(text = zz, header = TRUE, stringsAsFactors = FALSE)
我认为不需要遍历所有行;实际上,只需遍历窗口大小的序列(即偏移量 1 到 windowSize)就足够了
library(data.table)

# Rebuild the example table from the `zz` text block and convert it to a
# data.table by reference.
Data <- read.table(text = zz, header = TRUE, stringsAsFactors = FALSE)
setDT(Data)

windowSize <- 4           # number of consecutive rows that form one window
dateFormat <- "%d.%m.%Y"  # layout of the `date` strings; must match the data

# One data.table per category, so that a window never mixes categories.
Data.list <- split(Data, by = "category")

# For every category and every start offset k in 1..windowSize, rows k..n are
# cut into consecutive blocks of `windowSize` rows (the last block may be
# shorter). The first block, in date order, whose `flag` values are all FALSE
# gets the first row holding its minimum `value` marked with flag2 = TRUE.
# Rows that are never marked keep flag2 = NA.
Data.list <- lapply(Data.list, function(dt) {
  # Sort chronologically, once. The dates are parsed first because ordering
  # the raw "dd.mm.yyyy" strings is only chronological within a single month.
  chronological <- order(as.Date(as.character(dt$date), format = dateFormat))
  dt <- dt[chronological]

  n <- nrow(dt)
  row_id <- seq_len(n)

  # Create the result and helper columns up front so that every category
  # returns the same set of columns, even if no window qualifies.
  dt[, `:=`(flag2 = NA, window = NA_integer_, check = NA)]

  # Offsets beyond the number of rows have nothing to cover, so stop at n.
  for (k in seq_len(min(windowSize, n))) {
    # Rows before the offset belong to no window (NA); the rest are numbered
    # 1, 2, ... in blocks of `windowSize` rows.
    covered <- row_id >= k
    window_id <- rep(NA_integer_, n)
    window_id[covered] <-
      as.integer((row_id[covered] - k) %/% windowSize) + 1L
    dt[, window := window_id]

    # A window qualifies only if none of its rows is already flagged.
    dt[, check := !any(flag), by = window]

    # Row numbers of the minimum value inside each qualifying window.
    hits <- dt[, .I[value == min(value)], by = .(check, window)][
      !is.na(window) & check
    ]
    if (nrow(hits) > 0L) {
      first_hit <- hits$V1[1L]
      dt[first_hit, flag2 := TRUE]
    }
  }

  dt
})

# Put the categories back together, drop the helper columns and restore the
# chronological order (stable, so categories keep their order within a date).
Data <- rbindlist(Data.list)
Data[, c("window", "check") := NULL]
final_order <- order(as.Date(as.character(Data$date), format = dateFormat))
Data <- Data[final_order]
您可能希望以表格形式提供数据。您能提供一些可复现的数据吗?例如使用 dput()。——@RomanLuštrik:我已经做了修改,使数据更容易复制粘贴。
library(data.table)

# Rebuild the example table from the `zz` text block and convert it to a
# data.table by reference.
Data <- read.table(text = zz, header = TRUE, stringsAsFactors = FALSE)
setDT(Data)

windowSize <- 4           # number of consecutive rows that form one window
dateFormat <- "%d.%m.%Y"  # layout of the `date` strings; must match the data

# One data.table per category, so that a window never mixes categories.
Data.list <- split(Data, by = "category")

# For every category and every start offset k in 1..windowSize, rows k..n are
# cut into consecutive blocks of `windowSize` rows (the last block may be
# shorter). The first block, in date order, whose `flag` values are all FALSE
# gets the first row holding its minimum `value` marked with flag2 = TRUE.
# Rows that are never marked keep flag2 = NA.
Data.list <- lapply(Data.list, function(dt) {
  # Sort chronologically, once. The dates are parsed first because ordering
  # the raw "dd.mm.yyyy" strings is only chronological within a single month.
  chronological <- order(as.Date(as.character(dt$date), format = dateFormat))
  dt <- dt[chronological]

  n <- nrow(dt)
  row_id <- seq_len(n)

  # Create the result and helper columns up front so that every category
  # returns the same set of columns, even if no window qualifies.
  dt[, `:=`(flag2 = NA, window = NA_integer_, check = NA)]

  # Offsets beyond the number of rows have nothing to cover, so stop at n.
  for (k in seq_len(min(windowSize, n))) {
    # Rows before the offset belong to no window (NA); the rest are numbered
    # 1, 2, ... in blocks of `windowSize` rows.
    covered <- row_id >= k
    window_id <- rep(NA_integer_, n)
    window_id[covered] <-
      as.integer((row_id[covered] - k) %/% windowSize) + 1L
    dt[, window := window_id]

    # A window qualifies only if none of its rows is already flagged.
    dt[, check := !any(flag), by = window]

    # Row numbers of the minimum value inside each qualifying window.
    hits <- dt[, .I[value == min(value)], by = .(check, window)][
      !is.na(window) & check
    ]
    if (nrow(hits) > 0L) {
      first_hit <- hits$V1[1L]
      dt[first_hit, flag2 := TRUE]
    }
  }

  dt
})

# Put the categories back together, drop the helper columns and restore the
# chronological order (stable, so categories keep their order within a date).
Data <- rbindlist(Data.list)
Data[, c("window", "check") := NULL]
final_order <- order(as.Date(as.character(Data$date), format = dateFormat))
Data <- Data[final_order]
# index date value category flag flag2
#1: 1 01.01.2018 20 green FALSE NA
#2: 2 01.01.2018 8 RED FALSE NA
#3: 3 02.01.2018 21 green FALSE NA
#4: 4 02.01.2018 5 RED FALSE NA
#5: 5 03.01.2018 19 green FALSE NA
#6: 6 03.01.2018 5 RED TRUE NA
#7: 7 04.01.2018 17 green FALSE TRUE
#8: 8 04.01.2018 7 RED FALSE TRUE
#9: 9 05.01.2018 19 green FALSE NA
#10: 10 05.01.2018 8 RED FALSE NA
#11: 11 06.01.2018 18 green FALSE NA
#12: 12 06.01.2018 8 RED FALSE NA
#13: 13 07.01.2018 17 green FALSE NA
#14: 14 07.01.2018 8 RED FALSE NA
#15: 15 08.01.2018 16 green TRUE NA
#16: 16 08.01.2018 4 RED TRUE NA
#17: 17 09.01.2018 15 green TRUE NA
#18: 18 09.01.2018 4 RED FALSE TRUE
#19: 19 10.01.2018 14 green TRUE NA
#20: 20 10.01.2018 6 RED FALSE NA
#21: 21 11.01.2018 13 green TRUE NA
#22: 22 11.01.2018 8 RED FALSE NA
#23: 23 12.01.2018 14 green FALSE NA
#24: 24 12.01.2018 9 RED FALSE NA
#25: 25 13.01.2018 13 green TRUE NA
#26: 26 13.01.2018 5 RED TRUE NA
#27: 27 14.01.2018 14 green FALSE NA
#28: 28 14.01.2018 6 RED FALSE NA
#29: 29 15.01.2018 12 green TRUE NA
#30: 30 15.01.2018 4 RED FALSE NA
#31: 31 16.01.2018 14 green FALSE NA
#32: 32 16.01.2018 4 RED TRUE NA
#33: 33 17.01.2018 13 green TRUE NA
#34: 34 17.01.2018 2 RED TRUE NA
# index date value category flag flag2