R将列表转换为数据帧或表_R_List_Dataframe_Data.table

R将列表转换为数据帧或表

r list dataframe

R将列表转换为数据帧或表,r,list,dataframe,data.table,R,List,Dataframe,Data.table,我有“列表1”，“列表1”中的每个项目都相当于一个人4年的年级、班级和分数。因此，“列表1”有5个学生，每个学生有12条记录（三个变量中的每一个都有4条记录，即年级、班级和分数）。我希望将'list1'转换为'data1'，这是一个长数据文件，其中'ID'等于'list1'中的列表项编号。时间等于记录的时间（每个学生有4个时间测量值），成绩等于列表1中所有元素的前4个数据点，下一个4，最后4分示例输出显示将“list1”转换为所需的输出“data1” 这个数据集非常庞大，因此我希望有一种有效

我有一个列表 `list1`，其中每个元素对应一名学生 4 年的年级（Grade）、班级（Class）和分数（Score）。`list1` 共有 5 名学生，每名学生有 12 条记录（Grade、Class、Score 三个变量各 4 条）。我希望将 `list1` 转换为 `data1`：这是一个长格式的数据框，其中 `ID` 等于该元素在 `list1` 中的位置编号，`Time` 等于记录的时间点（每名学生有 4 个时间点），`Grade` 等于每个元素的前 4 个数据点，`Class` 等于接下来的 4 个，`Score` 等于最后 4 个。

示例输出显示将“list1”转换为所需的输出“data1”

这个数据集非常庞大，因此我希望有一种有效的方法来进行转换。

一个基于 purrr 和 dplyr 的解决方案可以是：

# Sample input: one numeric vector per student. Within each vector,
# positions 1-4 hold Grade, 5-8 hold Class and 9-12 hold Score
# (one value per time point).
list1 <- list(
  c(4, 5, 6, 7,  1, 1, 1, 1,  3, 1, 3, 3),
  c(3, 4, 5, 6,  2, 2, 2, 2,  1, 4, 2, 1),
  c(1, 2, 3, 4,  1, 1, 1, 1,  3, 2, 1, 1),
  c(5, 6, 7, 8,  1, 1, 1, 1,  4, 4, 4, 3),
  c(2, 3, 4, 5,  2, 2, 2, 2,  2, 1, 2, 1)
)

# Desired long-format result: one row per student (ID) and time point (Time),
# with that student's Grade, Class and Score for the time point.
data1 <- data.frame(
  ID    = rep(c(1, 2, 3, 4, 5), each = 4),
  Time  = rep(c(1, 2, 3, 4), times = 5),
  Grade = c(4, 5, 6, 7, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8, 2, 3, 4, 5),
  Class = rep(c(1, 2, 1, 1, 2), each = 4),
  Score = c(3, 1, 3, 3, 1, 4, 2, 1, 3, 2, 1, 1, 4, 4, 4, 3, 2, 1, 2, 1)
)

map_dfr(.x = list1,
        ~ as.data.frame(matrix(.x, 4, 3)) %>%
          setNames(c("Grade", "Class", "Score")), .id = "ID") %>%
  group_by(ID) %>%
  mutate(Time = 1:n())
   ID    Grade Class Score  Time
1 1         4     1     3     1
2 1         5     1     1     2
3 1         6     1     3     3
4 1         7     1     3     4
5 2         3     2     1     1
6 2         4     2     4     2
7 2         5     2     2     3
8 2         6     2     1     4
9 3         1     1     3     1
10 3         2     1     2     2
11 3         3     1     1     3
12 3         4     1     1     4
13 4         5     1     4     1
14 4         6     1     4     2
15 4         7     1     4     3
16 4         8     1     3     4
17 5         2     2     2     1
18 5         3     2     1     2
19 5         4     2     2     3
20 5         5     2     1     4

使用 base R，我们可以遍历 list1 的索引，并为每个元素创建一个数据帧：
# Reshape every student's 12-value vector into a 4 x 3 data frame
# (matrix() fills column-wise, so values 1-4 become Grade, 5-8 Class and
# 9-12 Score), row-bind the pieces, and store each element's position in
# list1 in the `ID` column via `.id`.
# Requires purrr and dplyr (which provides %>%) to be attached.
map_dfr(.x = list1,
        ~ as.data.frame(matrix(.x, 4, 3)) %>%
          setNames(c("Grade", "Class", "Score")), .id = "ID") %>%
  group_by(ID) %>%
  # row_number() numbers the rows within each ID; unlike 1:n() it cannot
  # produce the sequence c(1, 0) for an empty group.
  mutate(Time = row_number()) %>%
  # Drop the grouping so that later verbs do not silently operate per ID.
  ungroup()

   ID    Grade Class Score  Time
   <chr> <dbl> <dbl> <dbl> <int>
 1 1         4     1     3     1
 2 1         5     1     1     2
 3 1         6     1     3     3
 4 1         7     1     3     4
 5 2         3     2     1     1
 6 2         4     2     4     2
 7 2         5     2     2     3
 8 2         6     2     1     4
 9 3         1     1     3     1
10 3         2     1     2     2
11 3         3     1     1     3
12 3         4     1     1     4
13 4         5     1     4     1
14 4         6     1     4     2
15 4         7     1     4     3
16 4         8     1     3     4
17 5         2     2     2     1
18 5         3     2     1     2
19 5         4     2     2     3
20 5         5     2     1     4

使用1000万个数据点
数据：
# Base R approach: build one 4-row data frame per student, pairing each
# element of list1 with its position, then stack all of them with rbind().
do.call(
  rbind,
  Map(
    function(id, rec) {
      data.frame(
        ID = id,
        Time = 1:4,
        Grade = rec[1:4],
        Class = rec[5:8],
        Score = rec[9:12]
      )
    },
    seq_along(list1),
    list1
  )
)

#   ID Time Grade Class Score
#1   1    1     4     1     3
#2   1    2     5     1     1
#3   1    3     6     1     3
#4   1    4     7     1     3
#5   2    1     3     2     1
#6   2    2     4     2     4
#7   2    3     5     2     2
#8   2    4     6     2     1
#9   3    1     1     1     3
#10  3    2     2     1     2
#11  3    3     3     1     1
#12  3    4     4     1     1
#13  4    1     5     1     4
#14  4    2     6     1     4
#15  4    3     7     1     4
#16  4    4     8     1     3
#17  5    1     2     2     2
#18  5    2     3     2     1
#19  5    3     4     2     2
#20  5    4     5     2     1

比较：使用数据表
# Sample input: one numeric vector per student
# (positions 1-4, 5-8 and 9-12 are the three measured variables).
list1 <- list(
  c(4, 5, 6, 7,  1, 1, 1, 1,  3, 1, 3, 3),
  c(3, 4, 5, 6,  2, 2, 2, 2,  1, 4, 2, 1),
  c(1, 2, 3, 4,  1, 1, 1, 1,  3, 2, 1, 1),
  c(5, 6, 7, 8,  1, 1, 1, 1,  4, 4, 4, 3),
  c(2, 3, 4, 5,  2, 2, 2, 2,  2, 1, 2, 1)
)

# Benchmark input: look `list1` up by name 100000 times and flatten the
# result by one level, giving a list with 100000 copies of list1's elements.
big_list <- unlist(
  mget(rep("list1", times = 100000)),
  recursive = FALSE
)

@MichaelChirico
# Benchmark: convert every element of big_list into a 4-row matrix and
# stack them. Note that big_list is overwritten element by element.
system.time({
  # Label each of the 12 positions with the variable it belongs to.
  col_levels <- rep(c('Grade', 'Class', 'Score'), each = 4)

  for (idx in seq_along(big_list)) {
    # split() groups the values by label; cbind() turns the groups into columns.
    pieces <- split(big_list[[idx]], col_levels)
    measures <- do.call('cbind', pieces)
    big_list[[idx]] <- cbind(ID = idx, Time = 1:4, measures)
  }

  # Stack the per-student matrices into one matrix.
  final_df <- do.call('rbind', big_list)
})

# user  system elapsed 
# 82.86    0.31   83.78

输出
library('data.table')
# Benchmark of the data.table approach on big_list (data.table must be attached).
system.time({
  # Skeleton with one row per (ID, Time) pair; 4 = number of Times per ID and
  # column (assuming the table is rectangular). seq_along() is used instead
  # of 1:length(), which would yield the IDs 1 and 0 for an empty list.
  out <- CJ(ID = seq_along(big_list), Time = 1:4)
  # Relies on ID being the integer position in big_list, so that
  # .BY$ID = 1 selects big_list[[1]], i.e. the data for ID 1.
  # matrix(..., ncol = 3L) fills column-wise, so the three new columns are
  # values 1-4, 5-8 and 9-12 of the student's vector.
  out[ , by = ID, c('Grade', 'Class', 'Score') := {
    as.data.table(matrix(big_list[[ .BY$ID ]], ncol = 3L))
  }]
})

# user  system elapsed 
# 76.22    0.25   76.80

我不确定它是否高效，但它很简洁：
# Dimensions (rows, columns) of the stacked result.
dim(final_df)
# [1] 2000000      5

# First six rows of the stacked result. In the printed output below the
# measure columns appear in the order Class, Grade, Score.
head(final_df)
#      ID Time Class Grade Score
# [1,]  1    1     1     4     3
# [2,]  1    2     1     5     1
# [3,]  1    3     1     6     3
# [4,]  1    4     1     7     3
# [5,]  2    1     2     3     1
# [6,]  2    2     2     4     4

as.data.table 可能效率不高，但这段代码比其他写法更易读：
# Skeleton with one row per (ID, Time) pair; 4 = number of Times per ID and
# column (assuming the table is rectangular). seq_along() is used instead of
# 1:length(), which would yield the IDs 1 and 0 for an empty list.
# Requires data.table to be attached.
out <- CJ(ID = seq_along(list1), Time = 1:4)
# Relies on ID being the integer position in list1, so that .BY$ID = 1
# selects list1[[1]], i.e. the data for ID 1.
# matrix(..., ncol = 3L) fills column-wise, so the three new columns are
# values 1-4, 5-8 and 9-12 of the student's vector.
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  as.data.table(matrix(list1[[ .BY$ID ]], ncol = 3L))
}]

这是另一个非常快速的基本解决方案。它不那么优雅，但我们的想法是通过用循环填充矩阵来最小化内存使用
# Skeleton with one row per (ID, Time) pair. seq_along() is used instead of
# 1:length(), which would yield the IDs 1 and 0 for an empty list.
# Requires data.table to be attached.
out <- CJ(ID = seq_along(list1), Time = 1:4)
# For each ID, return a list of three length-4 vectors that := assigns to
# Grade, Class and Score in that order.
out[ , by = ID, c('Grade', 'Class', 'Score') := {
  student_data = list1[[.BY$ID]]
  # the j-th variable occupies positions 4 * (j - 1) + 1:4 of the vector
  lapply(1:3, function(j) student_data[4L*(j-1) + 1:4])
}]

这里有一种更快的方法，通过按一定顺序选择未列出的元素，取消列出并生成矩阵：
# Fill a pre-allocated numeric matrix one student at a time.
# Each element of list1 is expected to hold three equally long runs of
# values (Grade, Class, Score). The number of rows per student is taken from
# the element's own length, both for the pre-allocation and for the Time
# column, instead of being hard-coded to 4.
mat <- matrix(
  0,
  nrow = sum(lengths(list1)) %/% 3L,
  ncol = 5L,
  dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score"))
)

rw <- 0L  # number of rows already filled

for (i in seq_along(list1)) {
  l <- list1[[i]]
  # Fail early with a clear message if the three runs cannot be equal length.
  stopifnot(length(l) %% 3L == 0L)
  new_rw <- length(l) %/% 3L
  inds <- seq_len(new_rw) + rw

  mat[inds, 1L] <- i
  # Time counts the records of this student: 1, 2, ..., new_rw.
  mat[inds, 2L] <- seq_len(new_rw)
  # matrix() fills column-wise, so the three runs land in Grade, Class, Score.
  mat[inds, 3:5] <- matrix(l, ncol = 3L)

  rw <- rw + new_rw
}

最后，如果您仍然需要更快的速度，可以使用 Rcpp：
# Flatten list1 and pick its values in column order for a 3-column matrix
# (each element of list1 contributes 12 consecutive values to `flat`).
n <- length(list1)
flat <- unlist(list1, use.names = FALSE)
# Positions of every student's first four values within `flat`.
first_block <- rep(1:4, n) + 12 * rep(0:(n - 1L), each = 4)
# Shifting by 0, 4 and 8 reaches the first, second and third run of four.
shift <- rep(c(0, 4, 8), each = n * 4L)
matrix(flat[rep(first_block, 3) + shift], ncol = 3)

Rcpp::cppFunction(
  " NumericMatrix rcpp_combo(List x) {
  NumericMatrix out(x.size() * 4, 5);
  int init = 0;
  for (int i = 0; i

使用@Sathish的基准测试，这些方法在0.05到2秒之间
# Compile a C++ helper with Rcpp. rcpp_combo() takes a list of numeric
# vectors and returns a numeric matrix with 4 rows per list element and
# 5 columns: the 1-based element index, the step 1-4, and the values at
# offsets j, 4 + j and 8 + j (j = 0..3) of that element's vector.
# The C++ code sets no column names on the returned matrix.
Rcpp::cppFunction(
  " NumericMatrix rcpp_combo(List x) {
  NumericMatrix out(x.size() * 4, 5);
  int init = 0;

  for (int i = 0; i < x.size(); i++) {
    NumericVector tmp = x(i);
    int ID = i + 1;
    for (int j = 0; j < 4; j++) {
      int ind = j + init;

      out(ind, 0) = ID;
      out(ind, 1) = j + 1;
      out(ind, 2) = tmp(j);
      out(ind, 3) = tmp(4 + j);
      out(ind, 4) = tmp(8 + j);
    }
    init += 4;
  }
  return(out);
}"
)
# Run the compiled function on the sample list.
rcpp_combo(list1)    

这在我的一小部分数据上非常有效。但是您知道有没有 data.table 的解决方案吗？因为我估计按这种方式处理全部数据可能需要几天时间（我的列表中有 100 多万个元素）。请查看 @MichaelChirico 的回答；如果有帮助，请参考其中的 data.table 解决方案。
# Compile a C++ helper with Rcpp. rcpp_combo() takes a list of numeric
# vectors and returns a numeric matrix with 4 rows per list element and
# 5 columns: the 1-based element index, the step 1-4, and the values at
# offsets j, 4 + j and 8 + j (j = 0..3) of that element's vector.
# The C++ code sets no column names on the returned matrix.
Rcpp::cppFunction(
  " NumericMatrix rcpp_combo(List x) {
  NumericMatrix out(x.size() * 4, 5);
  int init = 0;

  for (int i = 0; i < x.size(); i++) {
    NumericVector tmp = x(i);
    int ID = i + 1;
    for (int j = 0; j < 4; j++) {
      int ind = j + init;

      out(ind, 0) = ID;
      out(ind, 1) = j + 1;
      out(ind, 2) = tmp(j);
      out(ind, 3) = tmp(4 + j);
      out(ind, 4) = tmp(8 + j);
    }
    init += 4;
  }
  return(out);
}"
)
# Run the compiled function on the sample list.
rcpp_combo(list1)    

# Benchmark input: look `list1` up by name 100000 times and flatten the
# result by one level, giving a list with 100000 copies of list1's elements.
big_list <- unlist(
  mget(rep("list1", times = 100000)),
  recursive = FALSE
)

# Time the compiled converter on the large list.
system.time(rcpp_combo(big_list))
##   user  system elapsed 
##   0.07    0.00    0.06 

# Benchmark of the flatten-and-index approach on big_list.
system.time({
  n <- length(big_list)
  flat <- unlist(big_list, use.names = FALSE)
  # Positions of every element's first four values within `flat`
  # (each element contributes 12 consecutive values).
  first_block <- rep(1:4, n) + 12 * rep(0:(n - 1L), each = 4)
  # Shifting by 0, 4 and 8 reaches the first, second and third run of four.
  shift <- rep(c(0, 4, 8), each = n * 4L)
  mat2 <- matrix(flat[rep(first_block, 3) + shift], ncol = 3)
})
##   user  system elapsed 
##   0.20    0.02    0.22 

# Benchmark input: look `list1` up by name 100000 times and flatten the
# result by one level, giving a list with 100000 copies of list1's elements.
big_list <- unlist(
  mget(rep("list1", times = 100000)),
  recursive = FALSE
)
# Benchmark of the pre-allocated matrix fill on big_list.
system.time({
  mat <- matrix(
    0,
    nrow = length(big_list) * 4L,
    ncol = 5L,
    dimnames = list(NULL, c("ID", "Time", "Grade", "Class", "Score"))
  )
  filled <- 0L     # rows written so far
  time_idx <- 1:4
  for (id in seq_along(big_list)) {
    rec <- big_list[[id]]
    n_new <- length(rec) / 3
    rows <- seq_len(n_new) + filled
    mat[rows, 1L] <- id
    mat[rows, 2L] <- time_idx
    # matrix() fills column-wise: the three runs of values become columns 3-5.
    mat[rows, 3:5] <- matrix(rec, ncol = 3L)
    filled <- n_new + filled
  }
})
##   user  system elapsed 
##   2.08    0.03    2.21