转换数据帧列表:不是简单的rbind,第二行到新列
转换数据帧列表:不是简单的rbind,第二行到新列 我有一张单子转换数据帧列表:不是简单的rbind,第二行到新列,r,dplyr,bind,rows,R,Dplyr,Bind,Rows,转换数据帧列表:不是简单的rbind,第二行到新列 我有一张单子 employeesList = list(data.frame(first = ("Al"), second = "Jones"), data.frame(first = c("Al", "Barb"), second = c("Jones", "Smith")), data.frame(first = c("Al", "Barb", "Carol"), s
# Example input: a list of data frames that share the columns `first` and
# `second` but differ in their number of rows (1, 2, 3 and 1).
employeesList <- list(
  data.frame(first = "Al", second = "Jones"),
  data.frame(first = c("Al", "Barb"), second = c("Jones", "Smith")),
  data.frame(
    first = c("Al", "Barb", "Carol"),
    second = c("Jones", "Smith", "Adams")
  ),
  data.frame(first = "Al", second = "Jones")
)
我期待着生产这个
employeesDF = data.frame(first = c("Al", "Al", "Al", "Al"), second = c("Jones", "Jones", "Jones", "Jones"),
first2 = c(NA, "Barb", "Barb", NA), second2 = c(NA, "Smith", "Smith", NA),
first3 = c(NA, NA, "Carol", NA), second3 = c(NA, NA, "Adams", NA))
我希望每个数据框都是结果数据框中的一行。
请注意,转换时的第一个数据帧将有两列,转换时的第二个df将有四列,转换时的第三个df将产生6列,转换时的第四个df将产生2列,依此类推。我意识到必须有一个NA值填充
我做了一些研究,如果目标只是rbind,问题就解决了。我看不到解决我问题的办法。
看完这篇文章后,
我从下面的代码开始:
res1 = cbind(t(employeesList[[1]][1]), t(employeesList[[1]][2]))
res2 = cbind(t(employeesList[[2]][1]), t(employeesList[[2]][2]))
res3 = cbind(t(employeesList[[3]][1]), t(employeesList[[3]][2]))
res4 = cbind(t(employeesList[[4]][1]), t(employeesList[[4]][2]))
然后
但是我可能有大量的数据帧--res1,…,resn。列数事先未指定,但可能少于10列。我的过程没有命名列,我认为这是绑定行所需要的。我们可以使用
lapply
将列表转换为单行数据帧,然后使用bind_rows
将多个数据帧绑定在一起
library(dplyr)
bind_rows(lapply(employeesList, function(x) rbind.data.frame(c(t(x)))))
# X.Al. X.Jones. X.Barb. X.Smith. X.Carol. X.Adams.
#1 Al Jones <NA> <NA> <NA> <NA>
#2 Al Jones Barb Smith <NA> <NA>
#3 Al Jones Barb Smith Carol Adams
#4 Al Jones <NA> <NA> <NA> <NA>
library(dplyr)
bind_rows(lapply(employeesList, function(x) rbind.data.frame(c(t(x)))))
#X.Al。琼斯先生。倒钩。史密斯先生。卡罗尔。亚当斯。
#1艾尔·琼斯
#2艾尔·琼斯·巴伯·史密斯
#3艾尔·琼斯·巴伯·史密斯·卡罗尔·亚当斯
#4艾尔·琼斯
根据我们的偏好,我们可以稍后使用
setNames
重命名列。可能是使用data.table
library('data.table')
rbindlist( l = lapply( employeesList, function(x) {
dcast( data = melt( setDT( x ), measure.vars = c( 'first', 'second'))[, V1 := seq_along(value), by = variable][],
formula = " . ~ variable + V1")[, -1]
}),
fill = TRUE,
use.names = TRUE )
# first_1 second_1 first_2 second_2 first_3 second_3
# 1: Al Jones NA NA NA NA
# 2: Al Jones Barb Smith NA NA
# 3: Al Jones Barb Smith Carol Adams
# 4: Al Jones NA NA NA NA
这里有一个使用
gather/spread 的方案
library(tidyverse)
employeesList %>%
map_df(~ .x %>%
mutate_all(as.character) %>% # convert columns to character class
mutate(n = row_number(), n = replace(n, n==1, "")),
.id = 'grp') %>%
group_by(grp) %>%
gather(key, val, first:second) %>% # gather to long format
arrange(grp, n) %>%
unite(keyn, key, n, sep="") %>% # unite columns to create new column
ungroup %>%
mutate(keyn = factor(keyn, levels = unique(keyn))) %>% # for column order
spread(keyn, val) %>% # spread to wide format
select(-grp)
# A tibble: 4 x 6
# first second first2 second2 first3 second3
# <chr> <chr> <chr> <chr> <chr> <chr>
#1 Al Jones NA NA NA NA
#2 Al Jones Barb Smith NA NA
#3 Al Jones Barb Smith Carol Adams
#4 Al Jones NA NA NA NA
库(tidyverse)
员工名单%>%
map_df(~.x%>%
mutate_all(as.character)%>%#将列转换为字符类
mutate(n=row_number(),n=replace(n,n==1,“”),
.id='grp')%>%
分组依据(grp)%>%
聚集(键、值、第一个:第二个)%>%#聚集为长格式
排列(grp,n)%>%
联合(键,键,n,sep=“”)%>%#联合列以创建新列
解组%>%
对列顺序进行变异(keyn=因子(keyn,levels=唯一(keyn))%>%#
排列(键,值)%>%#排列为宽格式
选择(-grp)
#一个tibble:4x6
#第一秒第一2秒第二2秒第一3秒3
#
#1艾尔·琼斯·纳纳
#2艾尔·琼斯·巴伯·史密斯·纳纳
#3艾尔·琼斯·巴伯·史密斯·卡罗尔·亚当斯
#4艾尔·琼斯·纳纳
我在提交示例数据时犯了一个错误。在两个不同的方面,它不够笼统。
列名的变化可能不一致,数据的变化可能比指示的要大得多。
然后我在r-help问了我的问题。在那里,这个问题得到了多方面的回答。
以下是其他人提出的解决方案以及我的时间研究
# input data (list of data frames and data frames may have multiple rows)
employees4List = list(data.frame(first1 = "Al", second1 =
"Jones"),
data.frame(first2 = c("Al2", "Barb"),
second2 = c("Jones", "Smith")),
data.frame(first3 = c("Al3", "Barbara",
"Carol"),
second3 = c("Jones", "Smith",
"Adams")),
data.frame(first4 = ("Al"), second4 =
"Jones2"))
employees4List
# intermediate step (list of data frames with each just one row)
df1 = data.frame(First1 = "Al", Second1 = "Jones",
First2 = NA, Second2 = NA,
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
df2 = data.frame(First1 = "Al2", Second1 = "Jones",
First2 = "Barb", Second2 = "Smith",
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
df3 = data.frame(First1 = "Al3", Second1 = "Jones",
First2 = "Barbara", Second2 = "Smith",
First3 = "Carol", Second3 = "Adams",
First4 = NA, Second4 = NA)
df4 = data.frame(First1 = "Al", Second1 = "Jones2",
First2 = NA, Second2 = NA,
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
listFinal = list(df1, df2, df3, df4)
listFinal
# Expected final step, except that all columns should be character
# Just one data frame
dplyr::bind_rows(listFinal)
sapply(dplyr::bind_rows(listFinal), class)
# Solution 1 using base R by Sarah Goslee
# Flatten every data frame in `x` row by row into one vector, pad all
# vectors with NA to the length of the longest one, and stack them so that
# each input data frame becomes one row of the result.
#
# x: a list of data frames. Each one goes through as.matrix(), so columns
#    of mixed types are coerced to a common type.
# Returns a data frame whose columns are named first1, last1, first2, ...
dfbycol <- function(x) {
  # t() makes as.vector() read the values row by row
  x <- lapply(x, function(y) as.vector(t(as.matrix(y))))
  # The target length is identical for every element, so compute it once;
  # recomputing it inside the lapply() callback made this step quadratic
  # in length(x).
  max_len <- max(0L, lengths(x))
  x <- lapply(x, function(y) {
    length(y) <- max_len  # extending a vector pads it with NA
    y
  })
  x <- do.call(rbind, x)
  x <- data.frame(x, stringsAsFactors = FALSE)
  colnames(x) <- paste0(c("first", "last"), rep(seq(1, ncol(x) / 2), each = 2))
  x
}
dfbycol(listFinal)
##########
# Solution 2 by Jeff Newmiller (Base R)
# Turn a two-column data frame into a single-row data frame that holds its
# values row by row, padded with NA up to `m` (First, Second) pairs.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# m:  number of (First, Second) pairs the result should have, e.g. the
#     largest nrow() across the list being processed.
# Returns a one-row data frame named First1, Second1, ..., First<m>,
# Second<m>.
myrename2 <- function(DF, m) {
  # if a pair of columns is not present, raise an error
  stopifnot(2 == length(DF))
  n <- nrow(DF)
  # t() converts to a matrix (nrow = 2); reading it column-major yields the
  # values of DF row by row
  values <- as.vector(t(DF))
  if (n < m) {
    # Extend the vector itself so the padding NAs share the type of the
    # data; assigning NA to new data frame columns would add logical
    # columns instead.
    length(values) <- 2 * m
  }
  result <- as.data.frame(matrix(values, nrow = 1), stringsAsFactors = FALSE)
  setNames(
    result,
    sprintf("%s%d", c("First", "Second"), rep(seq.int(m), each = 2))
  )
}
m <- max( unlist( lapply( employees4List, nrow ) ) )
listFinal2 <- lapply( employees4List, myrename2, m = m )
listFinal2
result2 <- do.call( rbind, listFinal2 )
result2
##########
# Solution 3 by Jeff Newmiller (uses dplyr)
# Turn a two-column data frame into a single-row data frame holding its
# values row by row, without padding.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# Returns a one-row data frame named First1, Second1, ..., First<n>,
# Second<n>, where n is nrow(DF).
myrename3 <- function(DF) {
  stopifnot(2 == length(DF))
  # t() yields a 2-row matrix; reshaping it to one row keeps its
  # column-major order, i.e. the values of DF row by row
  one_row <- matrix(t(DF), nrow = 1)
  wide <- as.data.frame(one_row, stringsAsFactors = FALSE)
  pair_ids <- rep(seq.int(nrow(DF)), each = 2)
  setNames(wide, sprintf("%s%d", c("First", "Second"), pair_ids))
}
listFinal3 <- lapply( employees4List, myrename3 )
listFinal3
result3 <- dplyr::bind_rows( listFinal3 )
result3
# Solution 4 by Jeff Newmiller (uses dplyr and tidyr)
library(dplyr)
library(tidyr)
# Turn a two-column data frame into a single-row data frame by reshaping
# it to long format and back to wide format with dplyr/tidyr.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# Returns a one-row data frame named First1, Second1, ..., First<m>,
# Second<m>, where m is nrow(DF).
myrename4 <- function(DF) {
  stopifnot(2 == length(DF))
  names(DF) <- c("a", "b")
  m <- nrow(DF)
  # Row labels are taken from LETTERS, which has 26 entries.
  long <- DF %>%
    mutate_all(as.character) %>%
    mutate(rw = LETTERS[seq.int(n())]) %>%
    gather(col, val, -rw)
  # Combine row label and column label into one key ("Aa", "Ab", "Ba", ...)
  # and spread that key back out into columns.
  wide <- long %>%
    tidyr::unite("labels", rw, col, sep = "") %>%
    spread(labels, val)
  setNames(
    wide,
    sprintf("%s%d", c("First", "Second"), rep(seq.int(m), each = 2))
  )
}
# Apply Solution 4 to every data frame, then stack the single-row results.
# The original called myrename3 here, so Solution 4 was never exercised.
listFinal4 <- lapply(employees4List, myrename4)
listFinal4
result4 <- dplyr::bind_rows(listFinal4)
result4
#####
# Timing
# Create a large dataset
firsts = c("Al", "Barb", "Carol")
seconds = c("Washington", "Adams", "Jefferson" )
numReplications = 10000
set.seed(2018)
# Create data frames
sim_list1 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 1, replace = TRUE),
second = base::sample(x = seconds, size = 1, replace = TRUE))},
simplify = F)
sim_list2 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 2, replace = TRUE),
second = base::sample(x = seconds, size = 2, replace = TRUE))},
simplify = F)
sim_list3 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 3, replace = TRUE),
second = base::sample(x = seconds, size = 3, replace = TRUE))},
simplify = F)
# Create list
employeesList = c(sim_list1, sim_list2, sim_list3)
# Method 1
system.time(res1 <- dfbycol(employeesList))
# > system.time(dfbycol(employeesList))
# user system elapsed
# 757.87 0.18 758.62
# res1
rm(res1)
#####
# Method 2
system.time(m <- max( unlist( lapply( employeesList, nrow ) ) ))
# user system elapsed
# 0.22 0.00 0.22
system.time(listFinal2 <- lapply( employeesList, myrename2, m = m ) )
listFinal2
# user system elapsed
# 16.16 0.01 16.18
system.time(result2 <- do.call( rbind, listFinal2 ) )
# result2
# user system elapsed
# 3.96 0.00 3.96
rm(listFinal2)
rm(result2)
#####
# Method 3
system.time(listFinal3 <- lapply( employeesList, myrename3))
# user system elapsed
# 7.33 0.00 7.33
listFinal3
system.time(result3 <- dplyr::bind_rows( listFinal3 ))
# user system elapsed
# 0.17 0.00 0.17
rm(listFinal3)
rm(result3)
#####
# Method 4
system.time(listFinal4 <- lapply( employeesList, myrename4) )
# user system elapsed
# 400.05 0.04 400.24
listFinal4
system.time(result4 <- dplyr::bind_rows( listFinal4 ) )
# user system elapsed
# 0.17 0.00 0.17
# result4
#输入数据(数据帧列表和数据帧可能有多行)
employees4List=list(data.frame(first1=“Al”,second1=
“琼斯”),
数据帧(first2=c(“Al2”,“倒钩”),
second2=c(“琼斯”、“史密斯”),
data.frame(first3=c(“Al3”,“Barbara”,
“卡罗尔”),
second3=c(“琼斯”、“史密斯”,
"亚当斯"),,
data.frame(first4=(“Al”),second4=
“琼斯2号”)
雇员4名单
#中间步骤(每个数据帧只有一行的数据帧列表)
df1=data.frame(First1=“Al”,Second1=“Jones”,
第一个2=NA,第二个2=NA,
第一个3=NA,第二个3=NA,
第一个4=NA,第二个4=NA)
df2=data.frame(First1=“Al2”,Second1=“Jones”,
First2=“倒钩”,Second2=“史密斯”,
第一个3=NA,第二个3=NA,
第一个4=NA,第二个4=NA)
df3=data.frame(First1=“Al3”,Second1=“Jones”,
First2=“芭芭拉”,Second2=“史密斯”,
First3=“卡罗尔”,Second3=“亚当斯”,
第一个4=NA,第二个4=NA)
df4=data.frame(First1=“Al”,Second1=“Jones2”,
第一个2=NA,第二个2=NA,
第一个3=NA,第二个3=NA,
第一个4=NA,第二个4=NA)
listFinal=列表(df1、df2、df3、df4)
决赛
#预期为最后一步,但所有列都应为字符
#只有一个数据帧
dplyr::绑定_行(listFinal)
sapply(dplyr::bind_行(listFinal),类)
#解决方案1使用Sarah Goslee的base R
我决定对这三种方法进行时间测试。这种方法显然是赢家。使用此链接,我创建了数据帧列表。数据帧有一行、两行或三行。这三种类型各有10000个数据帧。方法1 dplyr#用户系统运行时间#44.94 0.00 45.03方法2数据表#用户系统运行时间#225.36 4.68 229.61方法3 tidyverse#用户系统运行时间#513.72 0.23 519.52
# input data (list of data frames and data frames may have multiple rows)
employees4List = list(data.frame(first1 = "Al", second1 =
"Jones"),
data.frame(first2 = c("Al2", "Barb"),
second2 = c("Jones", "Smith")),
data.frame(first3 = c("Al3", "Barbara",
"Carol"),
second3 = c("Jones", "Smith",
"Adams")),
data.frame(first4 = ("Al"), second4 =
"Jones2"))
employees4List
# intermediate step (list of data frames with each just one row)
df1 = data.frame(First1 = "Al", Second1 = "Jones",
First2 = NA, Second2 = NA,
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
df2 = data.frame(First1 = "Al2", Second1 = "Jones",
First2 = "Barb", Second2 = "Smith",
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
df3 = data.frame(First1 = "Al3", Second1 = "Jones",
First2 = "Barbara", Second2 = "Smith",
First3 = "Carol", Second3 = "Adams",
First4 = NA, Second4 = NA)
df4 = data.frame(First1 = "Al", Second1 = "Jones2",
First2 = NA, Second2 = NA,
First3 = NA, Second3 = NA,
First4 = NA, Second4 = NA)
listFinal = list(df1, df2, df3, df4)
listFinal
# Expected final step, except that all columns should be character
# Just one data frame
dplyr::bind_rows(listFinal)
sapply(dplyr::bind_rows(listFinal), class)
# Solution 1 using base R by Sarah Goslee
# Flatten every data frame in `x` row by row into one vector, pad all
# vectors with NA to the length of the longest one, and stack them so that
# each input data frame becomes one row of the result.
#
# x: a list of data frames. Each one goes through as.matrix(), so columns
#    of mixed types are coerced to a common type.
# Returns a data frame whose columns are named first1, last1, first2, ...
dfbycol <- function(x) {
  # t() makes as.vector() read the values row by row
  x <- lapply(x, function(y) as.vector(t(as.matrix(y))))
  # The target length is identical for every element, so compute it once;
  # recomputing it inside the lapply() callback made this step quadratic
  # in length(x).
  max_len <- max(0L, lengths(x))
  x <- lapply(x, function(y) {
    length(y) <- max_len  # extending a vector pads it with NA
    y
  })
  x <- do.call(rbind, x)
  x <- data.frame(x, stringsAsFactors = FALSE)
  colnames(x) <- paste0(c("first", "last"), rep(seq(1, ncol(x) / 2), each = 2))
  x
}
dfbycol(listFinal)
##########
# Solution 2 by Jeff Newmiller (Base R)
# Turn a two-column data frame into a single-row data frame that holds its
# values row by row, padded with NA up to `m` (First, Second) pairs.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# m:  number of (First, Second) pairs the result should have, e.g. the
#     largest nrow() across the list being processed.
# Returns a one-row data frame named First1, Second1, ..., First<m>,
# Second<m>.
myrename2 <- function(DF, m) {
  # if a pair of columns is not present, raise an error
  stopifnot(2 == length(DF))
  n <- nrow(DF)
  # t() converts to a matrix (nrow = 2); reading it column-major yields the
  # values of DF row by row
  values <- as.vector(t(DF))
  if (n < m) {
    # Extend the vector itself so the padding NAs share the type of the
    # data; assigning NA to new data frame columns would add logical
    # columns instead.
    length(values) <- 2 * m
  }
  result <- as.data.frame(matrix(values, nrow = 1), stringsAsFactors = FALSE)
  setNames(
    result,
    sprintf("%s%d", c("First", "Second"), rep(seq.int(m), each = 2))
  )
}
m <- max( unlist( lapply( employees4List, nrow ) ) )
listFinal2 <- lapply( employees4List, myrename2, m = m )
listFinal2
result2 <- do.call( rbind, listFinal2 )
result2
##########
# Solution 3 by Jeff Newmiller (uses dplyr)
# Turn a two-column data frame into a single-row data frame holding its
# values row by row, without padding.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# Returns a one-row data frame named First1, Second1, ..., First<n>,
# Second<n>, where n is nrow(DF).
myrename3 <- function(DF) {
  stopifnot(2 == length(DF))
  # t() yields a 2-row matrix; reshaping it to one row keeps its
  # column-major order, i.e. the values of DF row by row
  one_row <- matrix(t(DF), nrow = 1)
  wide <- as.data.frame(one_row, stringsAsFactors = FALSE)
  pair_ids <- rep(seq.int(nrow(DF)), each = 2)
  setNames(wide, sprintf("%s%d", c("First", "Second"), pair_ids))
}
listFinal3 <- lapply( employees4List, myrename3 )
listFinal3
result3 <- dplyr::bind_rows( listFinal3 )
result3
# Solution 4 by Jeff Newmiller (uses dplyr and tidyr)
library(dplyr)
library(tidyr)
# Turn a two-column data frame into a single-row data frame by reshaping
# it to long format and back to wide format with dplyr/tidyr.
#
# DF: a data frame with exactly two columns; anything else raises an error.
# Returns a one-row data frame named First1, Second1, ..., First<m>,
# Second<m>, where m is nrow(DF).
myrename4 <- function(DF) {
  stopifnot(2 == length(DF))
  names(DF) <- c("a", "b")
  m <- nrow(DF)
  # Row labels are taken from LETTERS, which has 26 entries.
  long <- DF %>%
    mutate_all(as.character) %>%
    mutate(rw = LETTERS[seq.int(n())]) %>%
    gather(col, val, -rw)
  # Combine row label and column label into one key ("Aa", "Ab", "Ba", ...)
  # and spread that key back out into columns.
  wide <- long %>%
    tidyr::unite("labels", rw, col, sep = "") %>%
    spread(labels, val)
  setNames(
    wide,
    sprintf("%s%d", c("First", "Second"), rep(seq.int(m), each = 2))
  )
}
# Apply Solution 4 to every data frame, then stack the single-row results.
# The original called myrename3 here, so Solution 4 was never exercised.
listFinal4 <- lapply(employees4List, myrename4)
listFinal4
result4 <- dplyr::bind_rows(listFinal4)
result4
#####
# Timing
# Create a large dataset
firsts = c("Al", "Barb", "Carol")
seconds = c("Washington", "Adams", "Jefferson" )
numReplications = 10000
set.seed(2018)
# Create data frames
sim_list1 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 1, replace = TRUE),
second = base::sample(x = seconds, size = 1, replace = TRUE))},
simplify = F)
sim_list2 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 2, replace = TRUE),
second = base::sample(x = seconds, size = 2, replace = TRUE))},
simplify = F)
sim_list3 = replicate(n = numReplications,
expr = {data.frame(first = base::sample(x = firsts, size = 3, replace = TRUE),
second = base::sample(x = seconds, size = 3, replace = TRUE))},
simplify = F)
# Create list
employeesList = c(sim_list1, sim_list2, sim_list3)
# Method 1
system.time(res1 <- dfbycol(employeesList))
# > system.time(dfbycol(employeesList))
# user system elapsed
# 757.87 0.18 758.62
# res1
rm(res1)
#####
# Method 2
system.time(m <- max( unlist( lapply( employeesList, nrow ) ) ))
# user system elapsed
# 0.22 0.00 0.22
system.time(listFinal2 <- lapply( employeesList, myrename2, m = m ) )
listFinal2
# user system elapsed
# 16.16 0.01 16.18
system.time(result2 <- do.call( rbind, listFinal2 ) )
# result2
# user system elapsed
# 3.96 0.00 3.96
rm(listFinal2)
rm(result2)
#####
# Method 3
system.time(listFinal3 <- lapply( employeesList, myrename3))
# user system elapsed
# 7.33 0.00 7.33
listFinal3
system.time(result3 <- dplyr::bind_rows( listFinal3 ))
# user system elapsed
# 0.17 0.00 0.17
rm(listFinal3)
rm(result3)
#####
# Method 4
system.time(listFinal4 <- lapply( employeesList, myrename4) )
# user system elapsed
# 400.05 0.04 400.24
listFinal4
system.time(result4 <- dplyr::bind_rows( listFinal4 ) )
# user system elapsed
# 0.17 0.00 0.17
# result4