data.frames列表中特定data.frame列上的高效函数
我有一个data.frames列表中特定data.frame列上的高效函数,r,list,dataframe,R,List,Dataframe,我有一个data.frames的列表。比如说 set.seed(1) my_list <- list() ids = c("a","b","c","d","e") for(i in 1:5){ my_list[[i]] <- data.frame(id = ids, p = rnorm(length(ids)), m = rnorm(length(ids)), hp = runif(length(ids)), hm = runif(length(ids)), d = rnorm(
我有一个由 data.frame 组成的列表。比如说:
# Reproducible example data: a list of five data frames that share the same
# ids and hold freshly drawn random columns.
set.seed(1)
ids <- c("a", "b", "c", "d", "e")
my_list <- vector("list", 5L)
for (i in seq_along(my_list)) {
  # Columns are listed in their original order so the random draws happen in
  # the same sequence and the seeded values stay the same.
  my_list[[i]] <- data.frame(
    id = ids,
    p = rnorm(length(ids)),
    m = rnorm(length(ids)),
    hp = runif(length(ids)),
    hm = runif(length(ids)),
    d = rnorm(length(ids)),
    a = rnorm(length(ids))
  )
}
使用 plyr 处理 my_list:
library(plyr)
# Stack every data frame in my_list into one long table.
df <- do.call(rbind, my_list)
# Column-wise variance, restricted to the four columns of interest.
var_by_column <- colwise(var, c("p", "m", "d", "a"))
# Apply it separately to the rows of each id.
out <- ddply(df, .(id), var_by_column)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或者使用 lapply 和 apply:
# Stack every data frame in my_list into one long table.
df <- do.call(rbind, my_list)
value_cols <- c("p", "m", "d", "a")
# For each id, compute one named vector holding the variance of every
# selected column.
var_by_id <- lapply(
  split(df[value_cols], df$id),
  function(group) vapply(group, var, numeric(1))
)
# Bind the per-id vectors into a matrix whose row names are the ids.
df1 <- do.call(rbind, var_by_id)
# Convert to a data frame and expose the row names as an explicit id column.
out <- transform(df1, id = row.names(df1))
#> out
# p m d a id
#a 0.2371569 1.7810729 0.08264279 0.5074250 a
#b 0.1091675 0.2107997 1.15051229 1.1578691 b
#c 0.5385789 0.7650123 0.44215343 0.3137903 c
#d 1.0174542 0.7818498 0.06414317 0.6079849 d
#e 0.7343667 1.2870542 1.41615858 0.7362462 e
或使用doBy
library(doBy)
# Stack every data frame in my_list into one long table.
df <- do.call(rbind, my_list)
# Variance of p, m, d and a within each id; keep.names = TRUE is passed so
# the result columns keep the input column names (see the printed output).
out <- summaryBy(
  p + m + d + a ~ id,
  data = df,
  FUN = var,
  keep.names = TRUE
)
#> out
# id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
或者使用sqldf
library(sqldf)
# Stack every data frame in my_list into one long table; the query below
# refers to it by the name `df`.
df <- do.call(rbind, my_list)
# Query text kept exactly as written, including the embedded line break.
variance_query <- "select id, variance(p), variance(m),
variance(d), variance(a) from df group by id"
out <- sqldf(variance_query)
#> out
# id variance(p) variance(m) variance(d) variance(a)
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
更新:按 @hadley 的建议改用 bind_rows()(比 do.call(rbind, …) 更高效)
# Restored from a machine-translated (non-parseable) version of the snippet.
library(dplyr)
# Combine the list of data frames and keep only id plus the four columns
# whose variance is wanted.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]
# Variance of every remaining column within each id.
dat %>%
  group_by(id) %>%
  summarise(across(everything(), var))
#id p m d a
#1 a 0.2371569 1.7810729 0.08264279 0.5074250
#2 b 0.1091675 0.2107997 1.15051229 1.1578691
#3 c 0.5385789 0.7650123 0.44215343 0.3137903
#4 d 1.0174542 0.7818498 0.06414317 0.6079849
#5 e 0.7343667 1.2870542 1.41615858 0.7362462
这里是一个基本的R方法
# Base R approach: stack the list into one data frame, then use the formula
# interface of aggregate() to compute the variance of p, m, d and a per id.
dat <- do.call(rbind, my_list)
aggregate(cbind(p, m, d, a) ~ id, data = dat, FUN = var)
bind_rows() 会更高效。您还可以把各步骤链接起来,以避免中间赋值。(+1)
library(dplyr)
# Combine the list of data frames (my_list, not dat itself) and keep only id
# plus the four columns whose variance is wanted.
dat <- bind_rows(my_list)[, c("id", "p", "m", "d", "a")]
# Variance of every remaining column within each id; across() replaces the
# deprecated summarise_each(funs(...)) form.
dat %>%
  group_by(id) %>%
  summarise(across(everything(), var))
# id p m d a
# 1 a 0.2371569 1.7810729 0.08264279 0.5074250
# 2 b 0.1091675 0.2107997 1.15051229 1.1578691
# 3 c 0.5385789 0.7650123 0.44215343 0.3137903
# 4 d 1.0174542 0.7818498 0.06414317 0.6079849
# 5 e 0.7343667 1.2870542 1.41615858 0.7362462
# Stack the list row-wise into a single data frame.
dat <- do.call(rbind, my_list)
# Left side: the columns to summarise; right side: the grouping column.
variance_formula <- cbind(p, m, d, a) ~ id
aggregate(variance_formula, data = dat, FUN = var)
id p m d a
1 a 0.2371569 1.7810729 0.08264279 0.5074250
2 b 0.1091675 0.2107997 1.15051229 1.1578691
3 c 0.5385789 0.7650123 0.44215343 0.3137903
4 d 1.0174542 0.7818498 0.06414317 0.6079849
5 e 0.7343667 1.2870542 1.41615858 0.7362462
library(data.table)
# Columns whose per-id variance is computed.
value_cols <- c("p", "m", "d", "a")
# Stack the list into one data.table.
stacked <- rbindlist(my_list)
# .SD holds the .SDcols columns of the current id group; take the variance of
# each one.
stacked[, lapply(.SD, var), by = id, .SDcols = value_cols]
# id p m d a
# 1: a 0.2371569 1.7810729 0.08264279 0.5074250
# 2: b 0.1091675 0.2107997 1.15051229 1.1578691
# 3: c 0.5385789 0.7650123 0.44215343 0.3137903
# 4: d 1.0174542 0.7818498 0.06414317 0.6079849
# 5: e 0.7343667 1.2870542 1.41615858 0.7362462