用dplyr迭代t检验

用dplyr迭代t检验,r,dplyr,R,Dplyr,我试图找到一种优雅的方法来进行t检验,比较6组数据的平均值,最好使用dplyr/tidyverse。我的数据大致如下: df %>% group_by(grouping_variable)%>% t.test(of each on each) 分组变量数值变量 A 5.6 A 2.3 A 4.8 B 7.3 B 6.9 B 5.8 C 1.4 C 6.4 我知道我可以做一些事情,比如: df_a <- df %>% filter(grouping_variable ==

我试图找到一种优雅的方法来进行t检验,比较6组数据的平均值,最好使用dplyr/tidyverse。我的数据大致如下:

df %>% group_by(grouping_variable)%>%
t.test(of each on each)
分组变量 数值变量

A 5.6

A 2.3

A 4.8

B 7.3

B 6.9

B 5.8

C 1.4

C 6.4

我知道我可以做一些事情,比如:

# Manual approach for one pair of groups: filter each group, then compare the
# numeric values. t.test() expects numeric vectors for `x` and `y`, so the
# numeric column is extracted instead of passing the filtered data frames.
# NOTE(review): the column is assumed to be called `numerical_variable`, as in
# the sample table and the other snippets — confirm against the real data.
df_a <- df %>% filter(grouping_variable == "A")
df_b <- df %>% filter(grouping_variable == "B")
a_b <- t.test(df_a$numerical_variable, df_b$numerical_variable)$p.value
# All pairwise comparisons in a single call; `$p.value` is the matrix of
# p-values. With the defaults, p-values are adjusted for multiple testing and
# a pooled SD is used, so they differ from separate t.test() calls.
# NOTE(review): this call uses `num_var`/`group_var` while the lines above use
# `grouping_variable` — confirm which column names the data really has.
pairwise.ttest <- pairwise.t.test(x = df$num_var, g = df$group_var)
pairwise.ttest$p.value
也许还能把结果整理成整洁(tidy)的形式?

我希望最终得到一张类似下面这样的表格:

A B C D E F

A .34 .4 .235

B .03 .34 .454 …

检查此解决方案:

library(tidyverse)
library(magrittr)

# Pairwise t-tests between every ordered pair of distinct groups.
# Result: one row per (gr1, gr2) pair holding the nested rows of both groups
# (data1, data2) and the p-value of t.test() on their numerical_variable.
df %$%
  crossing(
    gr1 = unique(grouping_variable),
    gr2 = unique(grouping_variable)
  ) %>%
  # Drop self-comparisons; both (A, B) and (B, A) are kept.
  filter(gr1 != gr2) %>%
  left_join(
    df %>%
      group_by(grouping_variable) %>%
      nest() %>%
      # Ungroup so the renaming below never has to touch a grouping column.
      ungroup() %>%
      # Rename by name, not by position, so a change in column order cannot
      # silently swap the key and the data column.
      rename(gr1 = grouping_variable, data1 = data),
    by = "gr1"
  ) %>%
  left_join(
    df %>%
      group_by(grouping_variable) %>%
      nest() %>%
      ungroup() %>%
      rename(gr2 = grouping_variable, data2 = data),
    by = "gr2"
  ) %>%
  mutate(
    p_val = map2_dbl(
      data1, data2,
      ~ t.test(.x$numerical_variable, .y$numerical_variable)$p.value
    )
  )
首先,一些数据:

# Example data: 8 groups (A-H) with 10 random standard-normal values each.
# tibble() replaces the deprecated data_frame() constructor.
df <-
  tibble(
    Group = rep(LETTERS[1:8], each = 10),
    Value = rnorm(80)
  )
这看起来像这样:

# A tibble: 56 x 6
   group_a group_b  mean_a  mean_b     diff   pval
   <chr>   <chr>     <dbl>   <dbl>    <dbl>  <dbl>
 1 A       B       -0.275   0.0851 -0.360   0.384 
 2 A       C       -0.275  -0.651   0.376   0.406 
 3 A       D       -0.275  -0.440   0.165   0.737 
 4 A       E       -0.275   0.336  -0.611   0.245 
 5 A       F       -0.275  -0.277   0.00233 0.996 
 6 A       G       -0.275  -0.115  -0.160   0.754 
 7 A       H       -0.275  -0.406   0.131   0.821 
 8 B       A        0.0851 -0.275   0.360   0.384 
 9 B       C        0.0851 -0.651   0.736   0.0748
10 B       D        0.0851 -0.440   0.525   0.245 
# ... with 46 more rows

您还可以分散结果以生成您要查找的表:

# Reshape the long table of p-values into one row per group_a and one column
# per group_b. pivot_wider() replaces the superseded spread();
# names_sort = TRUE (tidyr >= 1.1.0) keeps the columns in alphabetical order
# instead of order of first appearance.
t_tests_out %>%
  select(group_a, group_b, pval) %>%
  pivot_wider(names_from = group_b, values_from = pval, names_sort = TRUE)
返回

# A tibble: 8 x 9
  group_a      A       B       C      D       E      F      G      H
  <chr>    <dbl>   <dbl>   <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
1 A       NA      0.384   0.406   0.737  0.245   0.996  0.754  0.821
2 B        0.384 NA       0.0748  0.245  0.595   0.439  0.668  0.371
3 C        0.406  0.0748 NA       0.659  0.0632  0.456  0.291  0.668
4 D        0.737  0.245   0.659  NA      0.163   0.762  0.547  0.955
5 E        0.245  0.595   0.0632  0.163 NA       0.280  0.425  0.243
6 F        0.996  0.439   0.456   0.762  0.280  NA      0.770  0.835
7 G        0.754  0.668   0.291   0.547  0.425   0.770 NA      0.640
8 H        0.821  0.371   0.668   0.955  0.243   0.835  0.640 NA    
# A tibble: 8 x 9
group_a A B C D E F G H
1 A NA 0.384 0.406 0.737 0.245 0.996 0.754 0.821
2 B 0.384 NA 0.0748 0.245 0.595 0.439 0.668 0.371
3 C 0.406 0.0748 NA 0.659 0.0632 0.456 0.291 0.668
4 D 0.737 0.245 0.659 NA 0.163 0.762 0.547 0.955
5 E 0.245 0.595 0.0632 0.163 NA 0.280 0.425 0.243
6 F 0.996 0.439 0.456 0.762 0.280 NA 0.770 0.835
7 G 0.754 0.668 0.291 0.547 0.425 0.770 NA 0.640
8 H 0.821 0.371 0.668 0.955 0.243 0.835 0.640 NA

这可以使用 purrr 的 cross(如 cross_df)和 map(如 map2_dbl)函数干净地完成。

样本数据:

# Sample data: 15 random normal draws, with the labels A, B, C cycling row by
# row; the trailing `df` prints the tibble.
df <- tibble(
  group_var = rep(LETTERS[1:3], times = 5),
  num_var = rnorm(n = 15)
)
df
# A tibble: 15 x 2
   group_var num_var
   <chr>       <dbl>
 1 A          1.66  
 2 B         -0.694 
 3 C         -0.680 
 4 A          1.96  
 5 B         -0.380 
 6 C         -0.941 
 7 A          1.02  
 8 B          0.0476
 9 C          0.770 
10 A          1.41  
11 B          0.137 
12 C         -0.816 
13 A         -0.478 
14 B          0.374 
15 C         -0.619 
添加具有测试结果的列:

# Add the p-value of a two-sample t-test for every (var1, var2) pair of groups.
# NOTE(review): `test_results` must already contain one row per pair in the
# columns var1/var2; the step that creates it is not shown here — confirm.
test_results <- test_results %>%
  mutate(
    ttest = map2_dbl(
      var1, var2,
      ~ t.test(
        filter(df, group_var == .x)$num_var,
        filter(df, group_var == .y)$num_var
      )$p.value
    )
  )

# Wide layout: one row per var1, one column per var2. pivot_wider() replaces
# the superseded spread(); names_sort = TRUE (tidyr >= 1.1.0) keeps the
# columns in alphabetical order.
test_results %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
  var1       A      B      C
  <chr>  <dbl>  <dbl>  <dbl>
1 A     1      0.0436 0.0197
2 B     0.0436 1      0.367 
3 C     0.0197 0.367  1   
test_results <- test_results %>%
mutate(ttest = map2_dbl(var1, var2,
~ t.test(df %>% filter(group_var == .x) %>% .$num_var,
df %>% filter(group_var == .y) %>% .$num_var)$p.value))
test_results %>%
spread(var2, ttest)
var1 A B C
1 A 1 0.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1
如果将t.test包装在函数中,则更易于阅读:

#' P-value of a two-sample t-test between two groups
#'
#' @param v1,v2 Group labels (values of `group_var`) to compare.
#' @param data Data frame with the columns `group_var` and `num_var`.
#'   Defaults to the `df` object visible from where the function was defined,
#'   which is the object the function always read before this argument
#'   existed.
#' @return The p-value (a single number) of `t.test()` applied to the
#'   `num_var` values of the two groups.
ttester <- function(v1, v2, data = df) {
  # If no data frame named `df` exists, the name resolves to the function
  # stats::df; fail early instead of with an obscure subsetting error.
  stopifnot(is.data.frame(data))
  # which() drops rows whose group label is NA, as dplyr::filter() would.
  x <- data[["num_var"]][which(data[["group_var"]] == v1)]
  y <- data[["num_var"]][which(data[["group_var"]] == v2)]
  t.test(x, y)$p.value
}

# Build every (var1, var2) combination of the group labels, compute the
# t-test p-value for each pair with ttester(), and lay the result out with one
# row per var1 and one column per var2.
# crossing() replaces the deprecated purrr::cross_df() and pivot_wider() the
# superseded spread(); names_sort = TRUE (tidyr >= 1.1.0) keeps the columns in
# alphabetical order.
crossing(var1 = c("A", "B", "C"), var2 = c("A", "B", "C")) %>%
  mutate(ttest = map2_dbl(var1, var2, ttester)) %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
  var1       A      B      C
  <chr>  <dbl>  <dbl>  <dbl>
1 A     1      0.0436 0.0197
2 B     0.0436 1      0.367 
3 C     0.0197 0.367  1     
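ttester <- function(v1, v2) {
t <- t.test(df %>% filter(group_var == v1) %>% .$num_var,
df %>% filter(group_var == v2) %>% .$num_var)
t$p.value
}
cross_df(list(var1 = c("A", "B", "C"), var2 = c("A", "B", "C"))) %>%
mutate(ttest = map2_dbl(var1, var2, ~ttester(.x, .y))) %>%
spread(var2, ttest)
var1 A B C
1 A 1 0.0436 0.0197
2 B 0.0436 1 0.367
3 C 0.0197 0.367 1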
pairwise.t.test 正是您正在寻找的函数。它允许您指定p值调整方法以及备择假设。有关详细信息,请参阅R文档

用法:

对于您的情况,您可以执行以下操作:

# Manual approach for one pair of groups: filter each group, then compare the
# numeric values. t.test() expects numeric vectors for `x` and `y`, so the
# numeric column is extracted instead of passing the filtered data frames.
# NOTE(review): the column is assumed to be called `numerical_variable`, as in
# the sample table and the other snippets — confirm against the real data.
df_a <- df %>% filter(grouping_variable == "A")
df_b <- df %>% filter(grouping_variable == "B")
a_b <- t.test(df_a$numerical_variable, df_b$numerical_variable)$p.value
# All pairwise comparisons in a single call; `$p.value` is the matrix of
# p-values. With the defaults, p-values are adjusted for multiple testing and
# a pooled SD is used, so they differ from separate t.test() calls.
# NOTE(review): this call uses `num_var`/`group_var` while the lines above use
# `grouping_variable` — confirm which column names the data really has.
pairwise.ttest <- pairwise.t.test(x = df$num_var, g = df$group_var)
pairwise.ttest$p.value

pairwise.t.test 如何将其推广到要传递两个以上参数的情况?大概可以用 pmap,但当您传递的不是数值参数时,这似乎会遇到麻烦。
#' P-value of a two-sample t-test between two groups
#'
#' @param v1,v2 Group labels (values of `group_var`) to compare.
#' @param data Data frame with the columns `group_var` and `num_var`.
#'   Defaults to the `df` object visible from where the function was defined,
#'   which is the object the function always read before this argument
#'   existed.
#' @return The p-value (a single number) of `t.test()` applied to the
#'   `num_var` values of the two groups.
ttester <- function(v1, v2, data = df) {
  # If no data frame named `df` exists, the name resolves to the function
  # stats::df; fail early instead of with an obscure subsetting error.
  stopifnot(is.data.frame(data))
  # which() drops rows whose group label is NA, as dplyr::filter() would.
  x <- data[["num_var"]][which(data[["group_var"]] == v1)]
  y <- data[["num_var"]][which(data[["group_var"]] == v2)]
  t.test(x, y)$p.value
}

# Build every (var1, var2) combination of the group labels, compute the
# t-test p-value for each pair with ttester(), and lay the result out with one
# row per var1 and one column per var2.
# crossing() replaces the deprecated purrr::cross_df() and pivot_wider() the
# superseded spread(); names_sort = TRUE (tidyr >= 1.1.0) keeps the columns in
# alphabetical order.
crossing(var1 = c("A", "B", "C"), var2 = c("A", "B", "C")) %>%
  mutate(ttest = map2_dbl(var1, var2, ttester)) %>%
  pivot_wider(names_from = var2, values_from = ttest, names_sort = TRUE)
  var1       A      B      C
  <chr>  <dbl>  <dbl>  <dbl>
1 A     1      0.0436 0.0197
2 B     0.0436 1      0.367 
3 C     0.0197 0.367  1     
pairwise.t.test(x, g, p.adjust.method = p.adjust.methods,
            pool.sd = !paired, paired = FALSE,
            alternative = c("two.sided", "less", "greater"),
            ...)
# Run every pairwise group comparison in one call, then show the matrix of
# p-values stored in the result.
pairwise.ttest <- pairwise.t.test(df$num_var, df$group_var)
pairwise.ttest[["p.value"]]