如何使用dplyr从两个组成对计算列
标签:r、dplyr、purrr、pairwise
我有一个这种形状的数据集:
group a1 a2 ... a9 b1 b2 ... b7
1 1 0 ... 1 0 1 ... 1
1 1 1 ... 1 0 0 ... 1
1 0 0 ... 0 1 0 ... 1
1 1 1 ... 0 1 1 ... 0
2 1 0 ... 1 0 1 ... 1
2 1 1 ... 1 0 0 ... 1
2 0 0 ... 0 1 0 ... 1
2 1 1 ... 0 1 1 ... 0
...
我要做的是对所有列对应用一个两参数的summary函数,保持数据的分组性质
比如说
# Two-argument summary statistic for a pair of equal-length 0/1 columns:
# the mean of `a`, plus the mean of `b`, plus the proportion of positions
# where both `a` and `b` are non-zero (elementwise logical AND).
f <- function(a, b) {
  both_set <- a & b
  mean(a) + mean(b) + mean(both_set)
}
将返回类似这样的结果(我实际上不打算计算函数的值,我将只放“x”来指示stat将去哪里,但是对于每个组-a-b组合,它当然是不同的)
一位评论员要求提供一些样本数据。这是:
# Sample data in dput() form: a 37-row data.frame with an integer `group`
# column (values 1 to 10) and 0/1 integer indicator columns a1-a5 and b1-b3.
# The expression is not assigned here; bind it to a name (the answer below
# assumes `dat`) before using it.
structure(list(group = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 10L, 10L), a1 = c(0L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 0L), a2 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L), a3 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 0L), a4 = c(0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), a5 = c(1L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L
), b1 = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L), b2 = c(0L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L),
b3 = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-37L))
一个使用 tidyverse 的解决方案。我们可以根据列名的起始字母,分两次把列收集为长格式,然后再分组汇总。假设您的数据名为 dat,dat2 是最终输出。
library(tidyverse)

# Compute one summary statistic for every (group, a-column, b-column) triple.
# `dat` is expected to hold a `group` column plus indicator columns whose
# names start with "a" and "b".
#
# Reshaping to long format twice pairs every a* value with every b* value
# taken from the same original row, so value_a and value_b stay row-aligned
# inside each (group, column_a, column_b) cell.
dat2 <- dat %>%
  pivot_longer(
    starts_with("a"),
    names_to = "column_a",
    values_to = "value_a"
  ) %>%
  pivot_longer(
    starts_with("b"),
    names_to = "column_b",
    values_to = "value_b"
  ) %>%
  group_by(group, column_a, column_b) %>%
  # The third term is mean(a & b), the share of rows where both indicators
  # are set. mean(a + b) would only repeat mean(a) + mean(b).
  summarise(stat = mean(value_a) + mean(value_b) + mean(value_a & value_b)) %>%
  ungroup()
dat2
# # A tibble: 150 x 4
# group column_a column_b stat
# <int> <chr> <chr> <dbl>
# 1 1 a1 b1 2
# 2 1 a1 b2 1.25
# 3 1 a1 b3 1.25
# 4 1 a2 b1 1
# 5 1 a2 b2 0.5
# 6 1 a2 b3 0.5
# 7 1 a3 b1 2.5
# 8 1 a3 b2 1.5
# 9 1 a3 b3 1.5
# 10 1 a4 b1 1.25
# # ... with 140 more rows
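library(tidyverse)
dat2 <- dat %>%
gather(column_a, value_a, starts_with("a")) %>%
gather(column_b, value_b, starts_with("b")) %>%
group_by(group, column_a, column_b) %>%
summarise(stat = mean(value_a) + mean(value_b) + mean(value_a + value_b)) %>%
ungroup()
dat2
# # A tibble: 150 x 4
# group column_a column_b stat
# <int> <chr> <chr> <dbl>
# 1 1 a1 b1 3
# 2 1 a1 b2 2
# 3 1 a1 b3 2
# 4 1 a2 b1 2
# 5 1 a2 b2 1
# 6 1 a2 b3 1
# 7 1 a3 b1 3.5
# 8 1 a3 b2 2.5
# 9 1 a3 b3 2.5
# 10 1 a4 b1 2
# # ... with 140 more rows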
您可以添加可复现的示例数据吗?替您编造数据可没什么意思。
我以前从未像这样连续收集两次——非常好——谢谢!
library(tidyverse)

# Compute one summary statistic for every (group, a-column, b-column) triple.
# `dat` is expected to hold a `group` column plus indicator columns whose
# names start with "a" and "b".
#
# Reshaping to long format twice pairs every a* value with every b* value
# taken from the same original row, so value_a and value_b stay row-aligned
# inside each (group, column_a, column_b) cell.
dat2 <- dat %>%
  pivot_longer(
    starts_with("a"),
    names_to = "column_a",
    values_to = "value_a"
  ) %>%
  pivot_longer(
    starts_with("b"),
    names_to = "column_b",
    values_to = "value_b"
  ) %>%
  group_by(group, column_a, column_b) %>%
  # The third term is mean(a & b), the share of rows where both indicators
  # are set. mean(a + b) would only repeat mean(a) + mean(b).
  summarise(stat = mean(value_a) + mean(value_b) + mean(value_a & value_b)) %>%
  ungroup()
dat2
# # A tibble: 150 x 4
# group column_a column_b stat
# <int> <chr> <chr> <dbl>
# 1 1 a1 b1 2
# 2 1 a1 b2 1.25
# 3 1 a1 b3 1.25
# 4 1 a2 b1 1
# 5 1 a2 b2 0.5
# 6 1 a2 b3 0.5
# 7 1 a3 b1 2.5
# 8 1 a3 b2 1.5
# 9 1 a3 b3 1.5
# 10 1 a4 b1 1.25
# # ... with 140 more rows