R 将函数应用于由所有可能的分类变量组合子集的数据帧
带有分类变量catA、catB和catC的示例数据帧。Obs是一些观察值R 将函数应用于由所有可能的分类变量组合子集的数据帧,r,R,带有分类变量catA、catB和catC的示例数据帧。Obs是一些观察值 catA <- rep(factor(c("a","b","c")), length.out=100) catB <- rep(factor(1:4), length.out=100) catC <- rep(factor(c("d","e","f")), length.out=100) obs <- runif(100,0,100) dat <- data.frame(catA, catB,
catA <- rep(factor(c("a","b","c")), length.out=100)
catB <- rep(factor(1:4), length.out=100)
catC <- rep(factor(c("d","e","f")), length.out=100)
obs <- runif(100,0,100)
dat <- data.frame(catA, catB, catC, obs)
等等等等…
ans这不是最干净的解决方案,但我认为它接近你想要的
ans <- with(dat, tapply(obs, list(catA, catB, catC), mean))
ans <- data.frame(expand.grid(dimnames(ans)), results=c(ans))
names(ans)[1:3] <- names(dat)[1:3]
str(ans)
# 'data.frame': 36 obs. of 4 variables:
# $ catA : Factor w/ 3 levels "a","b","c": 1 2 3 1 2 3 1 2 3 1 ...
# $ catB : Factor w/ 4 levels "1","2","3","4": 1 1 1 2 2 2 3 3 3 4 ...
# $ catC : Factor w/ 3 levels "d","e","f": 1 1 1 1 1 1 1 1 1 1 ...
# $ results: num 69.7 NA NA 55.3 NA ...
getAllSubs <- function(df, lookup, fun) {
out <- lapply(1:nrow(lookup), function(i) {
df_new <- df
if(length(na.omit(unlist(lookup[i,]))) > 0) {
for(j in colnames(lookup)[which(!is.na(unlist(lookup[i,])))]) {
df_new <- df_new[df_new[,j] == lookup[i,j],]
}
}
fun(df_new)
})
if(mean(sapply(out, length) ==1) == 1) {
out <- unlist(out)
} else {
out <- do.call("rbind", out)
}
final <- cbind(lookup, out)
final[is.na(final)] <- NA
final
}
getAllSubs另一种方法,一个函数用于获取所有变量组合,另一个函数用于在所有子集上应用函数。组合功能从另一个帖子中被盗
## return all combinations of vector up to maximum length n
multicombn <- function(dat, n) {
unlist(lapply(1:n, function(x) combn(dat, x, simplify=F)), recursive=F)
}
这个答案和shwaund答案的一个区别是,它不会返回空的行
子集,因此结果列中没有NAs
allsubs(dat, c("catA","catB","catc"), func, out.name="mean")
> head(allsubs(dat, vars, func, out.name = "mean"),20)
catA catB catC mean
1 a 1 d 56.65909
2 a 2 d 54.98116
3 a 3 d 37.52655
4 a 4 d 58.29034
5 b 1 e 52.88945
6 b 2 e 50.43122
7 b 3 e 52.57115
8 b 4 e 59.45348
9 c 1 f 52.41637
10 c 2 f 34.58122
11 c 3 f 46.80256
12 c 4 f 51.58668
13 <NA> 1 d 56.65909
14 <NA> 1 e 52.88945
15 <NA> 1 f 52.41637
16 <NA> 2 d 54.98116
17 <NA> 2 e 50.43122
18 <NA> 2 f 34.58122
19 <NA> 3 d 37.52655
20 <NA> 3 e 52.57115
allsubs(dat,c(“catA”、“catB”、“catc”)、func、out.name=“mean”)
>头部(所有接头(dat、vars、func、out.name=“平均值”),20)
catA catB catC平均值
1A 1D 56.65909
2A 2D 54.98116
3 a 3 d 37.52655
4 a 4 d 58.29034
5 b 1 e 52.88945
6 b 2 e 50.43122
7 b 3 e 52.57115
8 b 4 e 59.45348
9 c 1 f 52.41637
10 c 2 f 34.58122
11 c 3 f 46.80256
12 c 4 f 51.58668
13 1 d 56.65909
14 1 e 52.88945
15 1楼52.41637
16 2 d 54.98116
172E 50.43122
18 2 f 34.58122
19 3 d 37.52655
20 3 e 52.57115
仅使用矢量化函数和基R
# Find all possible subsets of your data
combVars <- c("catA", "catB", "catC")
subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
subsets <- do.call(c, subsets)
# Calculate means by each subset
meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
# Pull them all into one dataframe
Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)
#查找所有可能的数据子集
康瓦尔回答得很好。我已将其改编为数据表。
dat_out <- getAllSubs(dat, allsubs, function(x) mean(x$obs, na.rm = TRUE))
head(dat_out,20)
catA catB catC out
1 <NA> <NA> <NA> 47.25446
2 a <NA> <NA> 51.54226
3 b <NA> <NA> 46.45352
4 c <NA> <NA> 43.63767
5 <NA> 1 <NA> 47.23872
6 a 1 <NA> 66.59281
7 b 1 <NA> 32.03513
8 c 1 <NA> 40.66896
9 <NA> 2 <NA> 45.16588
10 a 2 <NA> 50.59323
11 b 2 <NA> 51.02013
12 c 2 <NA> 33.15251
13 <NA> 3 <NA> 51.67809
14 a 3 <NA> 48.13645
15 b 3 <NA> 57.92084
16 c 3 <NA> 49.27710
17 <NA> 4 <NA> 44.93515
18 a 4 <NA> 40.36266
19 b 4 <NA> 44.26717
20 c 4 <NA> 50.74718
## return all combinations of vector up to maximum length n
multicombn <- function(dat, n) {
unlist(lapply(1:n, function(x) combn(dat, x, simplify=F)), recursive=F)
}
func=function(x) mean(x$obs, na.rm=TRUE)
library(plyr)
allsubs <- function(indat, vars, func=NULL, out.name=NULL) {
results <- data.frame()
nvars <- rev(multicombn(vars,length(vars)))
for(i in 1:length(nvars)) {
results <-
rbind.fill(results, ddply(indat, unlist(nvars[i]), func))
}
if(!missing(out.name)) names(results)[length(vars)+1] <- out.name
results
}
allsubs(dat, c("catA","catB","catc"), func, out.name="mean")
> head(allsubs(dat, vars, func, out.name = "mean"),20)
catA catB catC mean
1 a 1 d 56.65909
2 a 2 d 54.98116
3 a 3 d 37.52655
4 a 4 d 58.29034
5 b 1 e 52.88945
6 b 2 e 50.43122
7 b 3 e 52.57115
8 b 4 e 59.45348
9 c 1 f 52.41637
10 c 2 f 34.58122
11 c 3 f 46.80256
12 c 4 f 51.58668
13 <NA> 1 d 56.65909
14 <NA> 1 e 52.88945
15 <NA> 1 f 52.41637
16 <NA> 2 d 54.98116
17 <NA> 2 e 50.43122
18 <NA> 2 f 34.58122
19 <NA> 3 d 37.52655
20 <NA> 3 e 52.57115
# Find all possible subsets of your data
combVars <- c("catA", "catB", "catC")
subsets <- lapply(0:length(combVars), combn, x = combVars, simplify = FALSE)
subsets <- do.call(c, subsets)
# Calculate means by each subset
meanValues <- lapply(subsets, function(x) aggregate(dat[["obs"]], by = dat[x], FUN = mean))
# Pull them all into one dataframe
Reduce(function(x,y) merge(x,y,all=TRUE), meanValues)