R-嵌套列表到TIBLE_R_Dplyr_Magrittr

R-嵌套列表到TIBLE

R-嵌套列表到TIBLE,r,dplyr,magrittr,R,Dplyr,Magrittr,我有一个嵌套列表，如下所示： > ex <- list(list(c("This", "is", "an", "example", "."), c("I", "really", "hate", "examples", ".")), list(c("How", "do", "you", "feel", "about", "examples", "?"))) > ex [[1]] [[1]][[1]] [1] "This" "is" "an" "examp

我有一个嵌套列表，如下所示：

> ex <- list(list(c("This", "is", "an", "example", "."), c("I", "really", "hate", "examples", ".")), list(c("How", "do", "you", "feel", "about", "examples", "?")))
> ex
[[1]]
[[1]][[1]]
[1] "This"    "is"      "an"      "example" "."      

[[1]][[2]]
[1] "I"        "really"   "hate"     "examples" "."       


[[2]]
[[2]][[1]]
[1] "How"      "do"       "you"      "feel"     "about"    "examples" "?"

我想将其转换为tibble，如下所示：

> tibble(d_id = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2)),
+        s_id = as.integer(c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1)),
+        t_id = as.integer(c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7)),
+        token = c("This", "is", "an", "example", ".", "I", "really",
+                  "hate", "examples", ".", "How", "do", "you", "feel", "about", "examples", "?"))
# A tibble: 17 x 4
    d_id  s_id  t_id token   
   <int> <int> <int> <chr>   
 1     1     1     1 This    
 2     1     1     2 is      
 3     1     1     3 an      
 4     1     1     4 example 
 5     1     1     5 .       
 6     1     2     1 I       
 7     1     2     2 really  
 8     1     2     3 hate    
 9     1     2     4 examples
10     1     2     5 .       
11     2     1     1 How     
12     2     1     2 do      
13     2     1     3 you     
14     2     1     4 feel    
15     2     1     5 about   
16     2     1     6 examples
17     2     1     7 ?

对我来说，最有效的方法是什么？最好使用tidyverse功能？

我们可以

ex %>% 
   set_names(seq_along(ex)) %>% 
   map( ~ set_names(.x, seq_along(.x)) %>% 
                 stack) %>% 
   bind_rows(.id = 'd_id') %>%
   group_by(d_id, s_id = ind) %>% 
   mutate(t_id = row_number()) %>% 
   select(d_id, s_id, t_id, token = values)
# A tibble: 17 x 4
# Groups: d_id, s_id [3]
#   d_id  s_id   t_id token   
#   <chr> <chr> <int> <chr>   
# 1 1     1         1 This    
# 2 1     1         2 is      
# 3 1     1         3 an      
# 4 1     1         4 example 
# 5 1     1         5 .       
# 6 1     2         1 I       
# 7 1     2         2 really  
# 8 1     2         3 hate    
# 9 1     2         4 examples
#10 1     2         5 .       
#11 2     1         1 How     
#12 2     1         2 do      
#13 2     1         3 you     
#14 2     1         4 feel    
#15 2     1         5 about   
#16 2     1         6 examples
#17 2     1         7 ?

是时候让一些序列工作了，这应该是非常有效的：

d_id <- rep(seq_along(ex), lengths(ex))
s_id <- sequence(lengths(ex))
t_id <- lengths(unlist(ex, rec=FALSE))

data.frame(
  d_id  = rep(d_id, t_id),
  s_id  = rep(s_id, t_id),
  t_id  = sequence(t_id),
  token = unlist(ex)
)

#   d_id s_id t_id    token
#1     1    1    1     This
#2     1    1    2       is
#3     1    1    3       an
#4     1    1    4  example
#5     1    1    5        .
#6     1    2    1        I
#7     1    2    2   really
#8     1    2    3     hate
#9     1    2    4 examples
#10    1    2    5        .
#11    2    1    1      How
#12    2    1    2       do
#13    2    1    3      you
#14    2    1    4     feel
#15    2    1    5    about
#16    2    1    6 examples
#17    2    1    7        ?

这将在大约2秒钟内为您的ex列表的500K样本运行。我认为这在效率方面很难被击败。

您可以使用Reformae2包装中的melt：

library(data.table)
setDT(melt(ex))[, .(d_id = L1, s_id = L2, t_id = rowid(L1, L2), token = value)]

    d_id s_id t_id    token
 1:    1    1    1     This
 2:    1    1    2       is
 3:    1    1    3       an
 4:    1    1    4  example
 5:    1    1    5        .
 6:    1    2    1        I
 7:    1    2    2   really
 8:    1    2    3     hate
 9:    1    2    4 examples
10:    1    2    5        .
11:    2    1    1      How
12:    2    1    2       do
13:    2    1    3      you
14:    2    1    4     feel
15:    2    1    5    about
16:    2    1    6 examples
17:    2    1    7        ?

我在这里用data.table显示它，因为我知道如何从那里一步完成列选择和重命名，尽管使用dplyr应该不会有问题。melt.list函数来自Reformate2。

另一种tidyverse解决方案：

library(tidyverse)
ex %>%
  modify_depth(-1,~tibble(token=.x) %>% rowid_to_column("t_id")) %>%
  map(~map_dfr(.x,identity,.id = "s_id")) %>%
  map_dfr(identity,.id = "d_id")

# # A tibble: 17 x 4
#     d_id  s_id  t_id    token
#    <chr> <chr> <int>    <chr>
#  1     1     1     1     This
#  2     1     1     2       is
#  3     1     1     3       an
#  4     1     1     4  example
#  5     1     1     5        .
#  6     1     2     1        I
#  7     1     2     2   really
#  8     1     2     3     hate
#  9     1     2     4 examples
# 10     1     2     5        .
# 11     2     1     1      How
# 12     2     1     2       do
# 13     2     1     3      you
# 14     2     1     4     feel
# 15     2     1     5    about
# 16     2     1     6 examples
# 17     2     1     7        ?

不，请关闭，但如果您运行代码，您将看到这并不完全正确。@ChristopherCostello不确定哪里不正确。由于我喜欢使用整洁的函数，所以我得到了与您预期的运动相同的输出。当我运行它时会收到警告，但它不会返回我想要的数据类型。@ChristopherCostello当然，这是个问题，只是因为我们使用了堆栈。它可以通过添加%>%mutate_allfunsype.convertas.character.，as.is=trues来改变。这肯定会很有效。我最喜欢的答案。谢谢大家!@Moody你是不是只是用data.table来试的？它应该会起作用。data.table:：melt在获取非data.table时，会将其分派给Reformate2 melt.*函数。另外，fwiw，我故意不把代码标记放在包名上，因为它们只是名称而不是代码…很抱歉，我刚才看到你提到了包，并认为你忘了包括我回滚的库callI，我不是故意搞砸你的风格；没问题…：