根据R中的行号重新构造数据

根据R中的行号重新构造数据,r,dplyr,data.table,tidyverse,tidyr,R,Dplyr,Data.table,Tidyverse,Tidyr,我无法按需要重新构造数据。 我的df看起来像这样: id <- (1:20) author <- c("A","A","A","A","A","B","B","B","A","A","A","B","B","B&quo

我无法按需要重新构造数据。 我的df看起来像这样:

# Example data: 20 sequential ids, each tagged with an author. The authors
# come in runs: A x5, B x3, A x3, B x7, A x2.
id <- seq_len(20L)
author <- rep(c("A", "B", "A", "B", "A"), times = c(5L, 3L, 3L, 7L, 2L))
df <- data.frame(id = id, author = author)

> print(df)

   id author
1   1      A
2   2      A
3   3      A
4   4      A
5   5      A
6   6      B
7   7      B
8   8      B
9   9      A
10 10      A
11 11      A
12 12      B
13 13      B
14 14      B
15 15      B
16 16      B
17 17      B
18 18      B
19 19      A
20 20      A

# Desired layout: one column per author, holding the first and last id of
# each consecutive run of that author; the shorter column is padded with NA.
A <- c(1, 5, 9, 11, 19, 20)
B <- c(6, 8, 12, 18, rep(NA, 2L))
df.desired <- data.frame(A = A, B = B)
print(df.desired)
   A  B
1  1  6
2  5  8
3  9 12
4 11 18
5 19 NA
6 20 NA 

我们可以使用 data.table 中的 rleid 为连续相同的 author 创建分组,选择每个组中的第一行和最后一行,然后将数据转换为宽格式:

library(dplyr)

# Keep the first and last row of every run of consecutive identical authors,
# then spread the ids into one column per author.
df %>%
  # rleid() gives each run of identical consecutive authors its own id.
  group_by(grp = data.table::rleid(author)) %>%
  # First and last row of the run (a one-row run is returned twice).
  slice(c(1L, n())) %>%
  # Regroup by author and number that author's rows; this index lines up the
  # rows of the wide result.
  group_by(author) %>%
  mutate(grp = seq_len(n())) %>%
  tidyr::pivot_wider(values_from = id, names_from = author) %>%
  # The row index is only a pivoting aid; drop it from the output.
  select(-grp)

# A tibble: 6 x 2
#      A     B
#  <int> <int>
#1     1     6
#2     5     8
#3     9    12
#4    11    18
#5    19    NA
#6    20    NA
library(dplyr)

df %>%
  group_by(grp = data.table::rleid(author)) %>%
  slice(1L, n()) %>%
  group_by(author) %>%
  mutate(grp = row_number()) %>%
  tidyr::pivot_wider(names_from = author, values_from = id) %>%
  select(-grp)

# A tibble: 6 x 2
#      A     B
#  <int> <int>
#1     1     6
#2     5     8
#3     9    12
#4    11    18
#5    19    NA
#6    20    NA

对于评论中的更新请求,我们可以执行以下操作:

# One output row per run of consecutive identical authors: column `1` holds
# the first id of the run and column `2` the last id.
df %>%
  # rleid() gives each run of identical consecutive authors its own id.
  group_by(grp = data.table::rleid(author)) %>%
  # First and last row of the run (a one-row run is returned twice).
  slice(1L, n()) %>%
  # Replace author with the position inside the run (1 = first, 2 = last);
  # these values become the column names of the wide result.
  mutate(author = row_number()) %>%
  # names_from must point at the column created above; there is no `row`
  # column in the data.
  tidyr::pivot_wider(names_from = author, values_from = id) %>%
  ungroup() %>%
  select(-grp)

# A tibble: 5 x 2
#    `1`   `2`
#  <int> <int>
#1     1     5
#2     6     8
#3     9    11
#4    12    18
#5    19    20
df %>%
  group_by(grp = data.table::rleid(author)) %>%
  slice(1L, n()) %>%
  mutate(author = row_number()) %>%
  tidyr::pivot_wider(names_from = author, values_from = id) %>%
  ungroup() %>%
  select(-grp)

# A tibble: 5 x 2
#    `1`   `2`
#  <int> <int>
#1     1     5
#2     6     8
#3     9    11
#4    12    18
#5    19    20

这是一个基本的R选项

# Base R: collect the smallest and largest id of every run of consecutive
# identical authors and lay them out as one column per author.
# as.character() keeps rle() working when `author` is stored as a factor.
z <- rle(as.character(df$author))
# 0-based run index for every row; seq_len() stays correct for a zero-row df
# where 1:nrow(df) would yield c(1, 0).
run_id <- findInterval(seq_len(nrow(df)), cumsum(z$lengths), left.open = TRUE)
lst <- split(df, run_id)
# range() returns c(min, max) of the ids in a run.
# NOTE(review): this equals first/last only while id increases within a run.
u <- lapply(lst, function(v) range(v$id))
# Positions of the runs that belong to each author.
idx <- split(seq_along(z$values), z$values)
x <- lapply(idx, function(v) unlist(u[v], use.names = FALSE))
# Pad the shorter vectors with NA so all columns have equal length.
df.desired <- as.data.frame(lapply(x, `length<-`, max(lengths(x))))

使用 data.table 的方案:

library(data.table)
# Convert df to a data.table by reference and tag every row with the id of
# its run of consecutive identical authors.
setDT(df)
df[, ri := rleid(author)]
# First and last id of each (author, run) pair; the unnamed result column is
# called V1.
run_ends <- df[, id[c(1L, .N)], by = .(author, ri)]
# Wide format: one column per author, rows indexed by position within author.
dcast(run_ends, rowid(author) ~ author, value.var = "V1")
输出:

   author  A  B
1:      1  1  6
2:      2  5  8
3:      3  9 12
4:      4 11 18
5:      5 19 NA
6:      6 20 NA

如果某个作者的连续片段可能只有一行,则需要把 c(1L, .N) 改为 unique(c(1L, .N)),否则该行会被重复取出两次。

太好了,谢谢!我可以再追问一个问题吗?如果我想让每一行对应一个作者的连续序列,该怎么办?因此,在本例中 X1……

@AntVal 可以在我的代码中对 u 使用 as.data.frame(do.call(rbind, u)) 来生成 df。

@AntVal,这只是对原始答案的一个小改动。请参阅更新后的答案。相关:
   author  A  B
1:      1  1  6
2:      2  5  8
3:      3  9 12
4:      4 11 18
5:      5 19 NA
6:      6 20 NA