从R中的多个lm（）快速检索Pvalue_R_Statistics

从R中的多个lm（）快速检索Pvalue

r statistics

从R中的多个lm（）快速检索Pvalue,r,statistics,R,Statistics,我有一个带有dims“13，20000000”和以下组的矩阵（mat） [1,] "wildtype" [2,] "wildtype" [3,] "wildtype" [4,] "wildtype" [5,] "wildtype" [6,] "wildtype" [7,] "wildtype" [8,] "wildtype" [9,] "wildtype" [10,] "wildtype" [11,] "mutant" [12,] "mutant"

我有一个带有dims“13，20000000”和以下组的矩阵（mat）

[1,] "wildtype"  
[2,] "wildtype"  
[3,] "wildtype"  
[4,] "wildtype"  
[5,] "wildtype"  
[6,] "wildtype"  
[7,] "wildtype"  
[8,] "wildtype"  
[9,] "wildtype"  
[10,] "wildtype"  
[11,] "mutant"    
[12,] "mutant"   
[13,] "mutant"

使用下面的R代码，我在每个数据点上运行

lm（）

20M次

lm（mat~groups）

速度非常快。使用

summary（lm1）

提取每个模型的pvalue需要很长时间

如何加快提取pvalues的速度

tvals_out <-'/tmp/tvals_lm.csv'

infile <- '/tmp/tempdata.dat'
con <- file(infile, "rb")
dim <- readBin(con, "integer", 2)
mat <- matrix( readBin(con, "numeric", prod(dim)), dim[1], dim[2])
close(con)

groups = factor(c(rep('wt', 10), rep('mut', 3)))
lm1 <- lm(mat ~ groups)

# This is the longest running bit
sum_lm1 <- summary(lm1)

num_pixels <- dim(mat)[2]

result_pvalues <- numeric(num_pixels)

result_pvalues <- vapply(sum_lm1, function(x) x$coefficients[,4][2], FUN.VALUE = 1)

write.table(result_pvalues, tvals_out, sep=',');


outCon <- file(tvals_out, "wb")
writeBin(result_pvalues, outCon)
close(outCon)

tvals\u out试试这个软件包怎么样
install.packages(broom)
library(broom)

tidy(lm(mat ~ groups))
#    response        term   estimate std.error  statistic      p.value
# 1        Y1 (Intercept)  27.000000  7.967548  3.3887465 6.048267e-03
# 2        Y1    groupswt  14.900000  9.084402  1.6401740 1.292246e-01
# 3        Y2 (Intercept)  23.333333  7.809797  2.9877004 1.234835e-02
# 4        Y2    groupswt  11.366667  8.904539  1.2765026 2.280689e-01
# 5        Y3 (Intercept)  44.000000 17.192317  2.5592828 2.655251e-02
# ...and more...

然后只提取组的结果（注意：实现此目的的各种方法…）：
我有一个脚本，我做了一系列的回归，然后收集系数，包括p值。这是它的样子
library(data.table)
summ<-summary(lm1)$coefficients
coeffs<-data.table(summ)
coeffs[,coef:=row.names(summ)]
setnames(coeffs,c("estimate", "stderr","t","p","coef"))

库（data.table）
summ我很难想象什么会比summary
更快。为了尝试，我写了一个快速的小玩意，从系数和标准误差计算p值。我还尝试了broom
方法。基于样本数据的结果如下所示
m <- c(28, 28, 28, 29, 33, 39, 49, 58, 63,64,30, 27, 24, 20, 17, 19, 33, 49, 56,57,36, 32, 28, 23, 20, 27, 48, 77, 96, 103,27, 26, 26, 23, 21, 23, 33, 46, 53,52,24, 20, 17, 13, 11, 14, 33, 47, 40,32,40, 46, 49, 48, 44, 49, 57, 59, 61,53,22, 24, 26, 32, 38, 39, 44, 53, 59,58,16, 16, 14, 10,7, 14, 34, 55, 62,61,28, 25, 21, 19, 22, 32, 45, 58, 64,61,28, 26, 21, 16, 14, 19, 33, 50, 59,59,17, 16, 15, 14, 17, 25, 38, 54, 61,58,11, 11, 12, 13, 16, 23, 34, 46, 51,45,22, 21, 20, 19, 16, 18, 32, 51, 50,38)

mat <- matrix(m, nrow=13)
groups <- rep(c("wildtype", "mutant"), times = c(10, 3))

fit <- lm(mat ~ groups)


#* Using summary
do.call("cbind", lapply(summary(fit), function(f) coef(f)[, 4]))


#* Directly calculating p-value
pvalOnly <- function(fit){
  pt(abs(coef(fit) / sqrt(diag(vcov(fit)))), 
     df = fit$df.residual, 
     lower.tail = FALSE) * 2
}

pvalDirect <- pvalOnly(fit)


#* Using broom
library(broom)
tidy(fit)$p.value



library(microbenchmark)
microbenchmark(
  summary = do.call("cbind", lapply(summary(fit), function(f) coef(f)[, 4])),
  direct = pvalOnly(fit),
  broom = tidy(fit)$p.value
)

以下函数能够在大约25秒内从13x20000000矩阵（如您的矩阵）的拟合中提取p值
pvalOnly2 <- function(fit) {
    # get estimates
    est <- fit$coefficients[fit$qr$pivot, ]

    # get R: see stats:::summary.lm to see how this is calculated
    p1 <- 1L:(fit$rank)
    R <- diag(chol2inv(fit$qr$qr[p1, p1, drop = FALSE]))

    # get residual sum of squares for each
    resvar <- colSums(fit$residuals^2) / fit$df.residual
    # R is same for each coefficient, resvar is same within each model 
    se <- sqrt(outer(R, resvar))

    pt(abs(est / se), df = fit$df.residual, lower.tail = FALSE) * 2
}

结果如下：
Unit: microseconds
           expr      min       lq     mean    median        uq      max neval cld
        summary 3383.085 3702.238 3978.110 3919.0755 4147.4015 5475.223   100   b
 pvalOnly2(fit)   81.538   91.541  136.903  137.1275  157.5535  459.415   100  a 

但是，当您安装的车型越多，速度优势就越大。在13x1000的矩阵上，它有大约300倍的优势。在我的机器上，当有2000万列时，它会在25秒内计算出p值，速度是拟合速度的两倍，如果你能发布一个非常小的2000万数据点样本（如5-10），那么我们就可以了解你有什么值，数据集是什么样子，并处理数据。@AntoniosK，我从矩阵中添加了10个数据点。谢谢我肯定会选择@JasonAizkalns解决方案，因为我是一个扫帚迷
并且会在第一时间尝试它，但是我不确定当它为您的2000万个案例创建一个结果的数据帧时会有多快。希望它能起作用！扫帚看起来不错。我将来可能会用它。不过，它似乎有点慢，这要感谢对不同方法进行基准测试。我在一个大数据集上重复了一遍，发现直接法比使用摘要法要快一些（~2%）。也许我需要从R移到c或者numpyYour的pvalOnly
函数正在调用stats:：：vcov.mlm
，它实际上调用了summary.mlm本身，所以难怪它并不比调用summary快！我在下面的答案中展示了一种直接计算p值的替代方法，在这个例子中，该方法速度快30倍，可以在大约25秒内完成20000000个案例。哇。令人惊叹的。用户522469。我建议接受大卫的答案而不是我的。@Benjamin我接受了大卫的答案，但非常感谢你的帮助，这看起来很有希望。今天晚些时候我会试试。它确实比我以前做的要快几个数量级。非常感谢。如果使用多个固定效果，是否也可以提取整个模型的p值？@user521469是-我已将此类函数编辑到我的答案中。请注意，您还可以让这些函数返回标准误差、t-统计量、f-统计量等（因为它们是在计算p值的过程中计算出来的）。非常感谢。这是一个巨大的挑战help@user521469很高兴我能帮忙
Unit: milliseconds
    expr      min       lq     mean   median       uq      max neval cld
 summary 1.685857 1.744652 1.969350 1.804914 1.877931 4.929129   100  a 
  direct 1.860630 1.933501 2.184573 2.047279 2.160765 6.442852   100  a 
   broom 5.303015 5.557257 6.060014 5.818830 5.999028 9.879372   100   b

pvalOnly2 <- function(fit) {
    # get estimates
    est <- fit$coefficients[fit$qr$pivot, ]

    # get R: see stats:::summary.lm to see how this is calculated
    p1 <- 1L:(fit$rank)
    R <- diag(chol2inv(fit$qr$qr[p1, p1, drop = FALSE]))

    # get residual sum of squares for each
    resvar <- colSums(fit$residuals^2) / fit$df.residual
    # R is same for each coefficient, resvar is same within each model 
    se <- sqrt(outer(R, resvar))

    pt(abs(est / se), df = fit$df.residual, lower.tail = FALSE) * 2
}

m <- c(28, 28, 28, 29, 33, 39, 49, 58, 63,64,30, 27, 24, 20, 17, 19, 33, 49, 56,57,36, 32, 28, 23, 20, 27, 48, 77, 96, 103,27, 26, 26, 23, 21, 23, 33, 46, 53,52,24, 20, 17, 13, 11, 14, 33, 47, 40,32,40, 46, 49, 48, 44, 49, 57, 59, 61,53,22, 24, 26, 32, 38, 39, 44, 53, 59,58,16, 16, 14, 10,7, 14, 34, 55, 62,61,28, 25, 21, 19, 22, 32, 45, 58, 64,61,28, 26, 21, 16, 14, 19, 33, 50, 59,59,17, 16, 15, 14, 17, 25, 38, 54, 61,58,11, 11, 12, 13, 16, 23, 34, 46, 51,45,22, 21, 20, 19, 16, 18, 32, 51, 50,38)

mat <- matrix(m, nrow=13)
groups <- rep(c("wildtype", "mutant"), times = c(10, 3))

fit <- lm(mat ~ groups)

library(microbenchmark)
microbenchmark(summary = do.call("cbind", lapply(summary(fit), function(f) coef(f)[, 4])),
               pvalOnly2(fit))

Unit: microseconds
           expr      min       lq     mean    median        uq      max neval cld
        summary 3383.085 3702.238 3978.110 3919.0755 4147.4015 5475.223   100   b
 pvalOnly2(fit)   81.538   91.541  136.903  137.1275  157.5535  459.415   100  a 

> mat <- mat[, rep(1:10, 2e6)]   # just replicating same coefs
> dim(mat)
[1]       13 20000000
> system.time(fit <- lm(mat ~ groups))
   user  system elapsed 
 37.272  10.296  58.372 
> system.time(pvals <- pvalOnly2(fit))
   user  system elapsed 
 21.945   1.889  24.322 

> dim(pvals)
[1]        2 20000000
> pvals[, 1:10]
                      [,1]       [,2]       [,3]         [,4]        [,5]       [,6]
(Intercept)    0.006048267 0.01234835 0.02655251 0.0004555316 0.001004109 0.01608319
groupswildtype 0.129224604 0.22806894 0.88113522 0.2064583345 0.103624361 0.84642990
                       [,7]      [,8]       [,9]        [,10]
(Intercept)    0.0004630405 0.1386393 0.05107805 5.042796e-05
groupswildtype 0.2717139022 0.1539826 0.66351492 5.942893e-02

modelPvalOnly <- function(fit) {
    f <- t(fit$fitted.values)
    if (attr(fit$terms, "intercept"))  {
        mss <- rowSums((f - rowMeans(f)) ^ 2)
        numdf <- fit$rank - 1
    } else {
        mss <- rowSums(f ^ 2)
        numdf <- fit$rank
    }

    resvar <- colSums(fit$residuals^2) / fit$df.residual
    fstat <- (mss / numdf) / resvar
    pval <- pf(fstat, numdf, fit$df.residual, lower.tail = FALSE)
    pval
}