R 如何有效地将嵌套列表和数据帧展平为单个数据帧?
我有一些数据的格式化方式很难使用,所以我正在尝试将其展平。这个 期望R 如何有效地将嵌套列表和数据帧展平为单个数据帧?,r,R,我有一些数据的格式化方式很难使用,所以我正在尝试将其展平。这个 期望 > expectedOutcome Events.CateringOptions.Agreed Events.CateringOptions.Tnc.Identity Events.CateringOptions.Tnc.Schema Events.CateringOptions.Tnc.ElementId 1 NA
> expectedOutcome
Events.CateringOptions.Agreed Events.CateringOptions.Tnc.Identity Events.CateringOptions.Tnc.Schema Events.CateringOptions.Tnc.ElementId
1 NA SpicyOWing TRUE 105031
2 NA BaconEggs TRUE 105032
Events.CateringOptions.Tnc.ElementType Events.CateringOptions.Tnc.ElementVersion Events.CateringOptions.Address Events.Action Events.Volume Host.Identity
1 NA NA New York 1 1000 John
2 NA NA Seattle 1 2000 John
Host.Schema Host.ElementId Host.ElementType Host.ElementVersion Sender.Identity Sender.Schema Sender.ElementId Sender.ElementType Sender.ElementVersion
1 NA 101505 NA NA Jane NA 101005 NA NA
2 NA 101505 NA NA Jane NA 101005 NA NA
CompletedDate
1 /Date(1490112000000)/
2 /Date(1490112000000)/
检查功能
check<-function(li){
areDF<-sapply(1:length(li), function(i) class(li[[i]]) == "data.frame")
areList<-sapply(1:length(li), function(i) class(li[[i]]) == "list")
tmp1 <- NULL
tmp2 <- NULL
if(any(areDF)){
for(j in which(areDF)){
columns <- jsonlite::flatten(li[[j]])
li[[j]] <- check(columns)
}
tmp1<-plyr::rbind.fill(li[areDF])
#return(tmp1)
}
if(any(areList)){
for(j in which(areList)){
li[[j]]<-check(li[[j]])
}
tmp2<-do.call(cbind,li)
#return(tmp2)
}
if(!is.null(tmp1) & !is.null(tmp2)){
return (cbind(tmp1,tmp2))
}
else if(!is.null(tmp1)){
return (tmp1)
}
else if(!is.null(tmp2)){
return (tmp2)
}
return(li)
}
我差不多有了,但是嵌套的数据帧被复制了。而且,我的代码需要相当长的时间。有人知道我该怎么把它弄平吗
编辑:
我最后在中添加了我的解决方案,这是我的看法,在
purrr
的帮助下这个想法与您的想法类似,只是语法不同:
flant()
最嵌套的数据帧,然后rbind()
它们。如果我正确理解了您的代码,我的代码在最后会略有不同,因为我将尝试获得一个更“
jsonlite::flatte
-友好”的结构,以便再次将其应用于最终结果:
library(jsonlite)
library(purrr)
res <-
sampleData %>%
modify_if(
is.list,
.f = ~ modify_if(
.x,
.p = function(x) all(sapply(x, is.data.frame)),
.f = ~ do.call("rbind", lapply(.x, jsonlite::flatten))
)
) %>%
as.data.frame() %>%
jsonlite::flatten()
str(res)
# 'data.frame': 2 obs. of 20 variables:
# $ Events.Action : num 1 1
# $ Events.Volume : num 1000 2000
# $ Host.Identity : chr "John" "John"
# $ Host.Schema : logi NA NA
# $ Host.ElementId : chr "101505" "101505"
# $ Host.ElementType : logi NA NA
# $ Host.ElementVersion : logi NA NA
# $ Sender.Identity : chr "Jane" "Jane"
# $ Sender.Schema : logi NA NA
# $ Sender.ElementId : chr "101005" "101005"
# $ Sender.ElementType : logi NA NA
# $ Sender.ElementVersion : logi NA NA
# $ CompletedDate : chr "/Date(1490112000000)/" "/Date(1490112000000)/"
# $ Events.CateringOptions.Agreed : logi TRUE TRUE
# $ Events.CateringOptions.Address : chr "New York" "Seattle"
# $ Events.CateringOptions.Tnc.Identity : chr "SpicyOWing" "BaconEggs"
# $ Events.CateringOptions.Tnc.Schema : logi NA NA
# $ Events.CateringOptions.Tnc.ElementId : chr "105031" "105032"
# $ Events.CateringOptions.Tnc.ElementType : logi NA NA
# $ Events.CateringOptions.Tnc.ElementVersion: logi NA NA
不确定这是否过度简化了您的问题,但对于您共享的示例,它似乎有效。基本上,如果在执行
data.frame(您的_列表)
时列还不是向量,则它取消列出数据并生成一个矩阵
FLAT <- function(inlist) {
A <- data.frame(inlist)
out <- lapply(A, function(y) {
if (is.list(y)) {
y <- unlist(y)
m <- matrix(y, nrow(A), byrow = TRUE, dimnames = list(NULL, unique(names(y))))
y <- data.frame(m, stringsAsFactors = FALSE)
y[] <- lapply(y, type.convert)
}
y
})
do.call(cbind, out)
}
FLAT(sampleData)
首先询问数据是如何进入这个表单的,这也许不是不合理的——Json来自一个传递给opencpu的api,opencpu对其调用fromJSON
。如果我可以修改OpenCPU的默认值fromJSON
来添加argsflatte=TRUE,simplifivector=FALSE
,这会更容易些,但是我无法控制。是的,我的错,我现在已经纠正了。谢谢你的努力,我不知道Purr!让我在我的真实世界数据集上尝试一下,看看我可以在哪里使用它。
library(jsonlite)
library(purrr)
res <-
sampleData %>%
modify_if(
is.list,
.f = ~ modify_if(
.x,
.p = function(x) all(sapply(x, is.data.frame)),
.f = ~ do.call("rbind", lapply(.x, jsonlite::flatten))
)
) %>%
as.data.frame() %>%
jsonlite::flatten()
str(res)
# 'data.frame': 2 obs. of 20 variables:
# $ Events.Action : num 1 1
# $ Events.Volume : num 1000 2000
# $ Host.Identity : chr "John" "John"
# $ Host.Schema : logi NA NA
# $ Host.ElementId : chr "101505" "101505"
# $ Host.ElementType : logi NA NA
# $ Host.ElementVersion : logi NA NA
# $ Sender.Identity : chr "Jane" "Jane"
# $ Sender.Schema : logi NA NA
# $ Sender.ElementId : chr "101005" "101005"
# $ Sender.ElementType : logi NA NA
# $ Sender.ElementVersion : logi NA NA
# $ CompletedDate : chr "/Date(1490112000000)/" "/Date(1490112000000)/"
# $ Events.CateringOptions.Agreed : logi TRUE TRUE
# $ Events.CateringOptions.Address : chr "New York" "Seattle"
# $ Events.CateringOptions.Tnc.Identity : chr "SpicyOWing" "BaconEggs"
# $ Events.CateringOptions.Tnc.Schema : logi NA NA
# $ Events.CateringOptions.Tnc.ElementId : chr "105031" "105032"
# $ Events.CateringOptions.Tnc.ElementType : logi NA NA
# $ Events.CateringOptions.Tnc.ElementVersion: logi NA NA
all.equal(expectedOutcome[sort(names(expectedOutcome))], res[sort(names(res))])
# [1] "Component “Events.CateringOptions.Agreed”: 'is.NA' value mismatch: 0 in current 2 in target"
FLAT <- function(inlist) {
A <- data.frame(inlist)
out <- lapply(A, function(y) {
if (is.list(y)) {
y <- unlist(y)
m <- matrix(y, nrow(A), byrow = TRUE, dimnames = list(NULL, unique(names(y))))
y <- data.frame(m, stringsAsFactors = FALSE)
y[] <- lapply(y, type.convert)
}
y
})
do.call(cbind, out)
}
FLAT(sampleData)
str(FLAT(sampleData))
## 'data.frame': 2 obs. of 20 variables:
## $ Events.CateringOptions.Agreed : logi TRUE TRUE
## $ Events.CateringOptions.Tnc.Identity : Factor w/ 2 levels "BaconEggs","SpicyOWing": 2 1
## $ Events.CateringOptions.Tnc.Schema : logi NA NA
## $ Events.CateringOptions.Tnc.ElementId : int 105031 105032
## $ Events.CateringOptions.Tnc.ElementType : logi NA NA
## $ Events.CateringOptions.Tnc.ElementVersion: logi NA NA
## $ Events.CateringOptions.Address : Factor w/ 2 levels "New York","Seattle": 1 2
## $ Events.Action : num 1 1
## $ Events.Volume : num 1000 2000
## $ Host.Identity : Factor w/ 1 level "John": 1 1
## $ Host.Schema : logi NA NA
## $ Host.ElementId : Factor w/ 1 level "101505": 1 1
## $ Host.ElementType : logi NA NA
## $ Host.ElementVersion : logi NA NA
## $ Sender.Identity : Factor w/ 1 level "Jane": 1 1
## $ Sender.Schema : logi NA NA
## $ Sender.ElementId : Factor w/ 1 level "101005": 1 1
## $ Sender.ElementType : logi NA NA
## $ Sender.ElementVersion : logi NA NA
## $ CompletedDate : Factor w/ 1 level "/Date(1490112000000)/": 1 1