用dplyr迭代t检验
我试图找到一种优雅的方法来进行t检验,比较6组数据的平均值,最好使用dplyr/tidyverse。我的数据大致如下:
df %>% group_by(grouping_variable)%>%
t.test(of each on each)
grouping_variable numerical_variable
A 5.6
A 2.3
A 4.8
B 7.3
B 6.9
B 5.8
C 1.4
C 6.4
我知道我可以做一些事情,比如:
# Manual approach: extract the rows of two groups and compare them.
# t.test() expects numeric vectors, so the numeric column is passed explicitly;
# handing it the whole data frames would also feed in the character group column.
# NOTE(review): the column name `numerical_variable` is taken from the data
# header and the nested-join answer -- confirm it matches the real data.
df_a <- df %>% filter(grouping_variable == 'A')
df_b <- df %>% filter(grouping_variable == 'B')
a_b <- t.test(df_a$numerical_variable, df_b$numerical_variable)$p.value

# All pairwise comparisons in a single call; `$p.value` holds the matrix of
# p-values for every pair of groups.
# NOTE(review): this call uses columns `num_var`/`group_var`, unlike the
# `grouping_variable` filter above -- confirm which names the data really has.
pairwise.ttest <- pairwise.t.test(x = df$num_var, g = df$group_var)
pairwise.ttest$p.value
也许是整洁的
我希望最终得到一个类似下面这样的p值表:
  A B C D E F
A .34 .4 .235
B .03 .34 .454
…
检查此解决方案:
library(tidyverse)
library(magrittr)
# Nest the observations once per group so that both sides of every pair can
# reuse the same nested table instead of recomputing it for each join.
nested <- df %>%
  group_by(grouping_variable) %>%
  nest() %>%
  ungroup()

# Build every ordered pair of distinct groups, attach each group's data, and
# run a two-sample t-test per pair. Columns are renamed by name (not by
# position) and the join keys are explicit, so the joins cannot silently match
# on the wrong column and do not print a "Joining, by = ..." message.
df %$%
  crossing(
    gr1 = unique(grouping_variable),
    gr2 = unique(grouping_variable)
  ) %>%
  filter(gr1 != gr2) %>%
  left_join(
    rename(nested, gr1 = grouping_variable, data1 = data),
    by = "gr1"
  ) %>%
  left_join(
    rename(nested, gr2 = grouping_variable, data2 = data),
    by = "gr2"
  ) %>%
  mutate(
    p_val = map2_dbl(
      data1, data2,
      ~ t.test(.x$numerical_variable, .y$numerical_variable)$p.value
    )
  )
首先,一些数据:
# Simulated example data: 8 groups (A-H) with 10 standard-normal draws each.
# tibble() is used because data_frame() is deprecated.
df <- tibble(
  Group = rep(LETTERS[1:8], each = 10),
  Value = rnorm(80)
)
这看起来像这样:
# A tibble: 56 x 6
group_a group_b mean_a mean_b diff pval
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A B -0.275 0.0851 -0.360 0.384
2 A C -0.275 -0.651 0.376 0.406
3 A D -0.275 -0.440 0.165 0.737
4 A E -0.275 0.336 -0.611 0.245
5 A F -0.275 -0.277 0.00233 0.996
6 A G -0.275 -0.115 -0.160 0.754
7 A H -0.275 -0.406 0.131 0.821
8 B A 0.0851 -0.275 0.360 0.384
9 B C 0.0851 -0.651 0.736 0.0748
10 B D 0.0851 -0.440 0.525 0.245
# ... with 46 more rows
您还可以分散结果以生成您要查找的表:
# Reshape the long pairwise results into a group_a x group_b table of p-values.
# pivot_wider() replaces the superseded spread(); arrange() and
# names_sort = TRUE reproduce spread()'s sorted row and column order.
t_tests_out %>%
  select(group_a, group_b, pval) %>%
  arrange(group_a) %>%
  pivot_wider(names_from = group_b, values_from = pval, names_sort = TRUE)
返回
# A tibble: 8 x 9
group_a A B C D E F G H
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A NA 0.384 0.406 0.737 0.245 0.996 0.754 0.821
2 B 0.384 NA 0.0748 0.245 0.595 0.439 0.668 0.371
3 C 0.406 0.0748 NA 0.659 0.0632 0.456 0.291 0.668
4 D 0.737 0.245 0.659 NA 0.163 0.762 0.547 0.955
5 E 0.245 0.595 0.0632 0.163 NA 0.280 0.425 0.243
6 F 0.996 0.439 0.456 0.762 0.280 NA 0.770 0.835
7 G 0.754 0.668 0.291 0.547 0.425 0.770 NA 0.640
8 H 0.821 0.371 0.668 0.955 0.243 0.835 0.640 NA
# A tibble: 8 x 9
group_a A B C D E F G H
1 A NA 0.384 0.406 0.737 0.245 0.996 0.754 0.821
2 B 0.384 NA 0.0748 0.245 0.595 0.439 0.668 0.371
3 C 0.406 0.0748 NA 0.659 0.0632 0.456 0.291 0.668
4 D 0.737 0.245 0.659 NA 0.163 0.762 0.547 0.955
5 E 0.245 0.595 0.0632 0.163 NA 0.280 0.425 0.243
6 F 0.996 0.439 0.456 0.762 0.280 NA 0.770 0.835
7 G 0.754 0.668 0.291 0.547 0.425 0.770 NA 0.640
8 H 0.821 0.371 0.668 0.955 0.243 0.835 0.640 NA
这可以使用purrr的 `cross_df` 和 `map2_dbl` 函数干净地完成。
样本数据:
# Sample data: 15 rows cycling through the groups A, B, C, each paired with a
# random standard-normal value.
df <- tibble(
  group_var = rep_len(c("A", "B", "C"), length.out = 15),
  num_var = rnorm(n = 15)
)
print(df)
# A tibble: 15 x 2
group_var num_var
<chr> <dbl>
1 A 1.66
2 B -0.694
3 C -0.680
4 A 1.96
5 B -0.380
6 C -0.941
7 A 1.02
8 B 0.0476
9 C 0.770
10 A 1.41
11 B 0.137
12 C -0.816
13 A -0.478
14 B 0.374
15 C -0.619
添加具有测试结果的列:
# Add a column holding the t-test p-value for every (var1, var2) pair.
# NOTE(review): assumes `test_results` already has one row per pair of group
# labels in columns `var1`/`var2` -- its construction is not shown here.
test_results <- test_results %>%
  mutate(
    ttest = map2_dbl(
      var1, var2,
      ~ t.test(
        df %>% filter(group_var == .x) %>% pull(num_var),
        df %>% filter(group_var == .y) %>% pull(num_var)
      )$p.value
    )
  )

# Reshape to a var1 x var2 table of p-values. pivot_wider() replaces the
# superseded spread(); names_sort = TRUE keeps the columns in sorted order.
test_results %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
var1 A B C
<chr> <dbl> <dbl> <dbl>
1 A 1 0.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
测试结果%
突变(ttest=map2_dbl(var1,var2,
~t.test(df%>%filter(group\u var==.x)%>%。$num\u var,
df%>%过滤器(组变量==.y)%>%。$num\u变量)$p.value))
测试结果%>%
排列(变量2,t测试)
var1 A B C
1A 10.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
如果将t.test包装在函数中,则更易于阅读:
#' P-value of a two-sample t-test between two groups
#'
#' @param v1,v2 Group labels to compare, matched against `data$group_var`.
#' @param data Data frame with a grouping column `group_var` and a numeric
#'   column `num_var`. Defaults to the `df` object found from the function's
#'   enclosing environment, so existing two-argument calls keep working.
#' @return The p-value (a single number) of `t.test()` applied to the `num_var`
#'   values of group `v1` versus those of group `v2`.
ttester <- function(v1, v2, data = df) {
  # which() drops rows where the comparison is NA, mirroring dplyr::filter().
  values_1 <- data[["num_var"]][which(data[["group_var"]] == v1)]
  values_2 <- data[["num_var"]][which(data[["group_var"]] == v2)]
  # Stored as `result` rather than `t` so base::t() is not shadowed.
  result <- t.test(values_1, values_2)
  result$p.value
}
# Build every ordered pair of group labels, run ttester() on each pair, and
# reshape the p-values into a var1 x var2 table.
# - The labels are read from the data instead of being hard-coded.
# - expand_grid() replaces the deprecated purrr::cross_df().
# - pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
#   the columns in sorted order.
group_levels <- sort(unique(as.character(df$group_var)))
expand_grid(var1 = group_levels, var2 = group_levels) %>%
  mutate(ttest = map2_dbl(var1, var2, ttester)) %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
var1 A B C
<chr> <dbl> <dbl> <dbl>
1 A 1 0.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
ttester%.$num\u var,
df%>%过滤器(组变量==v2)%>%。$num\u变量)
价值新台币
}
交叉df(列表(var1=c(“A”、“B”、“c”)、var2=c(“A”、“B”、“c”)))%>%
突变(ttest=map2_dbl(var1,var2,~ttester(.x,.y)))%>%
排列(变量2,t测试)
var1 A B C
1A 10.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
您正在寻找的是 `pairwise.t.test`。它允许您指定p值校正方法以及备择假设。有关详细信息,请参阅R文档
用法:
对于您的情况,您可以执行以下操作:
# Manual approach: extract the rows of two groups and compare them.
# t.test() expects numeric vectors, so the numeric column is passed explicitly;
# handing it the whole data frames would also feed in the character group column.
# NOTE(review): the column name `numerical_variable` is taken from the data
# header and the nested-join answer -- confirm it matches the real data.
df_a <- df %>% filter(grouping_variable == 'A')
df_b <- df %>% filter(grouping_variable == 'B')
a_b <- t.test(df_a$numerical_variable, df_b$numerical_variable)$p.value

# All pairwise comparisons in a single call; `$p.value` holds the matrix of
# p-values for every pair of groups.
# NOTE(review): this call uses columns `num_var`/`group_var`, unlike the
# `grouping_variable` filter above -- confirm which names the data really has.
pairwise.ttest <- pairwise.t.test(x = df$num_var, g = df$group_var)
pairwise.ttest$p.value
pairwise.t.test
如何将其推广到需要传递两个以上参数的情况?大概要用 `pmap`,但当传入的不是数值参数时,这似乎会遇到麻烦。
#' P-value of a two-sample t-test between two groups
#'
#' @param v1,v2 Group labels to compare, matched against `data$group_var`.
#' @param data Data frame with a grouping column `group_var` and a numeric
#'   column `num_var`. Defaults to the `df` object found from the function's
#'   enclosing environment, so existing two-argument calls keep working.
#' @return The p-value (a single number) of `t.test()` applied to the `num_var`
#'   values of group `v1` versus those of group `v2`.
ttester <- function(v1, v2, data = df) {
  # which() drops rows where the comparison is NA, mirroring dplyr::filter().
  values_1 <- data[["num_var"]][which(data[["group_var"]] == v1)]
  values_2 <- data[["num_var"]][which(data[["group_var"]] == v2)]
  # Stored as `result` rather than `t` so base::t() is not shadowed.
  result <- t.test(values_1, values_2)
  result$p.value
}
# Build every ordered pair of group labels, run ttester() on each pair, and
# reshape the p-values into a var1 x var2 table.
# - The labels are read from the data instead of being hard-coded.
# - expand_grid() replaces the deprecated purrr::cross_df().
# - pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
#   the columns in sorted order.
group_levels <- sort(unique(as.character(df$group_var)))
expand_grid(var1 = group_levels, var2 = group_levels) %>%
  mutate(ttest = map2_dbl(var1, var2, ttester)) %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
var1 A B C
<chr> <dbl> <dbl> <dbl>
1 A 1 0.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
# Usage synopsis of pairwise.t.test(): the argument list with its defaults.
# This is a signature listing, not a call meant to be run as written
# (`x`, `g`, `paired` and `...` are placeholders here).
pairwise.t.test(x, g, p.adjust.method = p.adjust.methods,
pool.sd = !paired, paired = FALSE,
alternative = c("two.sided", "less", "greater"),
...)
# Run every pairwise t-test of `num_var` across the levels of `group_var`,
# then show the resulting matrix of p-values.
pairwise.ttest <- pairwise.t.test(df$num_var, df$group_var)
pairwise.ttest[["p.value"]]