Python 计算dataframe中特定列(SUM、AVG、STDEV)的所有嵌套级别聚合
我有一张如下所示的表格(简化): 我想得到一个包含每个列a级别的所有嵌套组合的表,并计算(比如)子组内的平均值:例如,choose-any-2表看起来像(10个唯一的级别组合): 选择-any-4看起来像(5个独特的级别组合): (优先顺序)R,SQL(postgres,ANSI),Python。; 我在R中的当前解决方案(如下)不能很好地扩展Python 计算dataframe中特定列(SUM、AVG、STDEV)的所有嵌套级别聚合,python,r,aggregate-functions,Python,R,Aggregate Functions,我有一张如下所示的表格(简化): 我想得到一个包含每个列a级别的所有嵌套组合的表,并计算(比如)子组内的平均值:例如,choose-any-2表看起来像(10个唯一的级别组合): 选择-any-4看起来像(5个独特的级别组合): (优先顺序)R,SQL(postgres,ANSI),Python。; 我在R中的当前解决方案(如下)不能很好地扩展col\u A的级别数: require(tidyverse) df <- tibble(col_A=c("A", "B","C", "D", "E
col\u A
的级别数:
require(tidyverse)
df <- tibble(col_A=c("A", "B","C", "D", "E"), col_B=c(37,28,10,11,99), col_C=c(2,7,5,5,4))
nested_subgroup_agg <- function(choice = 2, mydf = NULL) {
library(tidyverse)
dfx <-
combn(c("A", "B", "C", "D", "E"), choice) %>%
t() %>%
as_tibble()
try(if (choice <= 1) {
stop("Can't Choose less than 2 levels at a time")
}
else{
if (choice == 2) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]])
})
}
else{
if (choice == 3) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]] + mydf$col_B[mydf$col_A == dfx$V3[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]] + mydf$col_C[mydf$col_A == dfx$V3[i]])
})
}
else{
if (choice == 4) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]] + mydf$col_B[mydf$col_A == dfx$V3[i]] + mydf$col_B[mydf$col_A == dfx$V4[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]] + mydf$col_C[mydf$col_A == dfx$V3[i]] + mydf$col_C[mydf$col_A == dfx$V4[i]])
})
}
}
}
})
dfx$val <- val
dfx
}
## Example
df <-
tibble(
col_A = c("A", "B", "C", "D", "E"),
col_B = c(37, 28, 10, 11, 99),
col_C = c(2, 7, 5, 5, 4)
)
nested_subgroup_agg(choice = 4, mydf = df)
require(tidyverse)
df一个想法是使用combn
获得所有行的组合(考虑到每行有1个字母),然后简单地每2行聚合一次,即
#get a df with all combination of rows
new_d <- dd[c(combn(nrow(dd), 2)),]
#Aggregate
#You can use `aggregate` or `lapply(split())`
lapply(split(new_d, rep(seq((nrow(new_d)) / 2), each = 2)), function(i)sum(i$col_C))
使用data.table的选项:
nested_subgroup_agg <- function(choice=2, mydf) {
ans <- setDT(mydf)[.(g=rep(seq(choose(.N, choice)), each=choice), col_A=c(combn(col_A, choice))), on=.(col_A)][,
.(toString(col_A), sum(col_B) / sum(col_C)), g]
setnames(ans, names(ans)[-1L], c(paste0("Grp_", choice), "val"))[]
}
nested_subgroup_agg(3, DT)
数据:
库(data.table)
我删除了SQL标记,因为您的问题是关于R中的数据帧。
require(tidyverse)
df <- tibble(col_A=c("A", "B","C", "D", "E"), col_B=c(37,28,10,11,99), col_C=c(2,7,5,5,4))
nested_subgroup_agg <- function(choice = 2, mydf = NULL) {
library(tidyverse)
dfx <-
combn(c("A", "B", "C", "D", "E"), choice) %>%
t() %>%
as_tibble()
try(if (choice <= 1) {
stop("Can't Choose less than 2 levels at a time")
}
else{
if (choice == 2) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]])
})
}
else{
if (choice == 3) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]] + mydf$col_B[mydf$col_A == dfx$V3[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]] + mydf$col_C[mydf$col_A == dfx$V3[i]])
})
}
else{
if (choice == 4) {
val <- map_dbl(1:nrow(dfx), function(i) {
(mydf$col_B[mydf$col_A == dfx$V1[i]] + mydf$col_B[mydf$col_A == dfx$V2[i]] + mydf$col_B[mydf$col_A == dfx$V3[i]] + mydf$col_B[mydf$col_A == dfx$V4[i]]) /
(mydf$col_C[mydf$col_A == dfx$V1[i]] + mydf$col_C[mydf$col_A == dfx$V2[i]] + mydf$col_C[mydf$col_A == dfx$V3[i]] + mydf$col_C[mydf$col_A == dfx$V4[i]])
})
}
}
}
})
dfx$val <- val
dfx
}
## Example
df <-
tibble(
col_A = c("A", "B", "C", "D", "E"),
col_B = c(37, 28, 10, 11, 99),
col_C = c(2, 7, 5, 5, 4)
)
nested_subgroup_agg(choice = 4, mydf = df)
#get a df with all combination of rows
new_d <- dd[c(combn(nrow(dd), 2)),]
#Aggregate
#You can use `aggregate` or `lapply(split())`
lapply(split(new_d, rep(seq((nrow(new_d)) / 2), each = 2)), function(i)sum(i$col_C))
dput(dd)
structure(list(col_A = structure(1:5, .Label = c("A", "B", "C",
"D", "E"), class = "factor"), col_B = c(37L, 28L, 10L, 11L, 99L
), col_C = c(2L, 7L, 5L, 5L, 4L)), class = "data.frame", row.names = c(NA,
-5L))
nested_subgroup_agg <- function(choice=2, mydf) {
ans <- setDT(mydf)[.(g=rep(seq(choose(.N, choice)), each=choice), col_A=c(combn(col_A, choice))), on=.(col_A)][,
.(toString(col_A), sum(col_B) / sum(col_C)), g]
setnames(ans, names(ans)[-1L], c(paste0("Grp_", choice), "val"))[]
}
nested_subgroup_agg(3, DT)
g Grp_3 val
1: 1 A, B, C 5.357143
2: 2 A, B, D 5.428571
3: 3 A, B, E 12.615385
4: 4 A, C, D 4.833333
5: 5 A, C, E 13.272727
6: 6 A, D, E 13.363636
7: 7 B, C, D 2.882353
8: 8 B, C, E 8.562500
9: 9 B, D, E 8.625000
10: 10 C, D, E 8.571429
library(data.table)
DT <- fread("col_A col_B col_C
A 37 2
B 28 7
C 10 5
D 11 5
E 99 4")