如何在R中查找和标记唯一序列的起点/终点_R_Sequences

如何在R中查找和标记唯一序列的起点/终点

如何在R中查找和标记唯一序列的起点/终点,r,sequences,R,Sequences,我在时间向量旁边有一个1和0的序列。我想找到所有1序列的开始和结束时间点，并给每个序列一个唯一的ID。这里是一些示例数据和我迄今为止的尝试创建虚拟数据这是我的努力我相信有更好的方法…这里有一种方法可以找到上述向量的开始和停止位置： # get positions of the 1s onePos <- which(x == 1) # get the ending positions stopPos <- onePos[c(which(diff(onePos) != 1), le

我在时间向量旁边有一个1和0的序列。我想找到所有1序列的开始和结束时间点，并给每个序列一个唯一的ID。这里是一些示例数据和我迄今为止的尝试

创建虚拟数据。这是我目前的尝试。

我相信有更好的方法…

这里有一种方法可以找到上述向量的开始和停止位置：

# Locate every run of consecutive 1s in `x` by position.
# Indices of all elements equal to 1 (which() skips NA entries).
onePos <- which(x == 1)
# TRUE wherever two neighbouring 1-positions are not adjacent, i.e. a gap
# separates one run from the next.
runBreak <- diff(onePos) != 1
# A run ends just before each gap, and at the very last 1.
stopPos <- onePos[c(runBreak, TRUE)]
# A run starts at the very first 1, and just after each gap.
# Logical masks keep both results empty when `x` contains no 1s; the
# integer form onePos[c(1, ...)] would yield a stray NA start in that case.
startPos <- onePos[c(TRUE, runBreak)]

最后，要添加一个id：

# Build the result table: one row per run of 1s, numbered in order of
# appearance, with the time stamps at which the run switches on and off.
df <- data.frame(
  id    = seq_along(startPos),
  on_t  = t[startPos],
  off_t = t[stopPos]
)

以下是一种查找上述向量的开始和停止位置的方法：

# Locate every run of consecutive 1s in `x` by position.
# Indices of all elements equal to 1 (which() skips NA entries).
onePos <- which(x == 1)
# TRUE wherever two neighbouring 1-positions are not adjacent, i.e. a gap
# separates one run from the next.
runBreak <- diff(onePos) != 1
# A run ends just before each gap, and at the very last 1.
stopPos <- onePos[c(runBreak, TRUE)]
# A run starts at the very first 1, and just after each gap.
# Logical masks keep both results empty when `x` contains no 1s; the
# integer form onePos[c(1, ...)] would yield a stray NA start in that case.
startPos <- onePos[c(TRUE, runBreak)]

最后，要添加一个id：

# Build the result table: one row per run of 1s, numbered in order of
# appearance, with the time stamps at which the run switches on and off.
df <- data.frame(
  id    = seq_along(startPos),
  on_t  = t[startPos],
  off_t = t[stopPos]
)

你也可以这样做

# Example input: a 0/1 indicator vector and its matching time stamps.
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
  1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0
)
t <- 10:34
xy <- data.frame(x = x, t = t)

# Give every run of identical values its own letter label (a, b, c, ...),
# repeated once per element of that run.
mr <- rle(xy$x)$lengths
xy$group <- rep(letters[seq_along(mr)], times = mr)

# Keep only the rows belonging to runs of 1s.
onesies <- subset(xy, x == 1)

# For each remaining run, record its first and last time stamp together
# with the run's label.
out <- by(
  onesies,
  INDICES = onesies$group,
  FUN = function(run) {
    data.frame(
      on_t = head(run$t, 1),
      off_t = tail(run$t, 1),
      ID = unique(run$group)
    )
  }
)

# Stack the per-run one-row data frames into a single table.
do.call(rbind, out)

  on_t off_t ID
b   14    17  b
d   21    26  d
f   30    33  f

你也可以这样做

# Example input: a 0/1 indicator vector and its matching time stamps.
x <- c(
  0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1,
  1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0
)
t <- 10:34
xy <- data.frame(x = x, t = t)

# Give every run of identical values its own letter label (a, b, c, ...),
# repeated once per element of that run.
mr <- rle(xy$x)$lengths
xy$group <- rep(letters[seq_along(mr)], times = mr)

# Keep only the rows belonging to runs of 1s.
onesies <- subset(xy, x == 1)

# For each remaining run, record its first and last time stamp together
# with the run's label.
out <- by(
  onesies,
  INDICES = onesies$group,
  FUN = function(run) {
    data.frame(
      on_t = head(run$t, 1),
      off_t = tail(run$t, 1),
      ID = unique(run$group)
    )
  }
)

# Stack the per-run one-row data frames into a single table.
do.call(rbind, out)

  on_t off_t ID
b   14    17  b
d   21    26  d
f   30    33  f

另一种做法是把两个向量放入 data.table，然后依次执行常见的分组（group by）、筛选（filter）和修改列（mutate）操作：

library(data.table)
# One row per observation: `seq` is the 0/1 indicator, `time` its time stamp.
dt <- data.table(seq = x, time = t)
# The chain below is laid out so that every line break falls inside an open
# bracket; starting a line with a bare `[` would be a parse error.
dt[
  ,
  # Per run: first and last time stamp, plus the value the run consists of.
  .(on_t = min(time), off_t = max(time), lab = unique(seq)),
  # rleid() gives each run of identical values its own group id.
  by = .(ID = rleid(seq))
][
  # Keep only the runs made of 1s.
  lab == 1
][
  # Drop the helper label and renumber the remaining runs 1, 2, 3, ...
  , `:=`(lab = NULL, ID = seq_along(ID))
][]  # trailing [] prints the result after the by-reference update

#    ID on_t off_t
# 1:  1   14    17
# 2:  2   21    26
# 3:  3   30    33

沿用提问者（OP）的思路，下面这种方法可能更好：

# Pad x with a 0 on both sides so that runs touching either end of the
# vector still produce a rising (+1) and a falling (-1) edge.
d <- diff(c(0, x, 0))
# d has length(x) + 1 elements: d[i] == 1 means a run of 1s starts at x[i],
# and d[i + 1] == -1 means a run ends at x[i]. Trim each mask to length(x)
# so it lines up with t exactly instead of relying on an over-long index.
is_start <- d[-length(d)] == 1
is_end <- d[-1] == -1
results <- data.frame(on_t = t[is_start], off_t = t[is_end])
# Number the runs. seq_len() yields an empty ID column when there are no
# runs, whereas 1:nrow(results) would be c(1, 0) and make the assignment fail.
results$ID <- seq_len(nrow(results))

results
#   on_t off_t ID
# 1   14    17  1
# 2   21    26  2
# 3   30    33  3

另一种做法是把两个向量放入 data.table，然后依次执行常见的分组（group by）、筛选（filter）和修改列（mutate）操作：

library(data.table)
# One row per observation: `seq` is the 0/1 indicator, `time` its time stamp.
dt <- data.table(seq = x, time = t)
# The chain below is laid out so that every line break falls inside an open
# bracket; starting a line with a bare `[` would be a parse error.
dt[
  ,
  # Per run: first and last time stamp, plus the value the run consists of.
  .(on_t = min(time), off_t = max(time), lab = unique(seq)),
  # rleid() gives each run of identical values its own group id.
  by = .(ID = rleid(seq))
][
  # Keep only the runs made of 1s.
  lab == 1
][
  # Drop the helper label and renumber the remaining runs 1, 2, 3, ...
  , `:=`(lab = NULL, ID = seq_along(ID))
][]  # trailing [] prints the result after the by-reference update

#    ID on_t off_t
# 1:  1   14    17
# 2:  2   21    26
# 3:  3   30    33

沿用提问者（OP）的思路，下面这种方法可能更好：

# Pad x with a 0 on both sides so that runs touching either end of the
# vector still produce a rising (+1) and a falling (-1) edge.
d <- diff(c(0, x, 0))
# d has length(x) + 1 elements: d[i] == 1 means a run of 1s starts at x[i],
# and d[i + 1] == -1 means a run ends at x[i]. Trim each mask to length(x)
# so it lines up with t exactly instead of relying on an over-long index.
is_start <- d[-length(d)] == 1
is_end <- d[-1] == -1
results <- data.frame(on_t = t[is_start], off_t = t[is_end])
# Number the runs. seq_len() yields an empty ID column when there are no
# runs, whereas 1:nrow(results) would be c(1, 0) and make the assignment fail.
results$ID <- seq_len(nrow(results))

results
#   on_t off_t ID
# 1   14    17  1
# 2   21    26  2
# 3   30    33  3

第二种方法符合我的需要，非常简洁，谢谢！