R 一种从重复测量中聚合数据的有效方法
标签:r, bigdata, aggregate
我正在分析一个大型实验的基因表达数据(12400个单细胞和23800个基因),遇到了效率问题。下面我会给出一个可复现的示例,但我的问题如下:我将数据集中的小鼠基因转换为人类基因,以便与之前发表的其他数据进行比较。在某些情况下存在多个匹配(一个人类基因映射到多个小鼠基因)。在这些情况下,我想对这些基因的表达值取平均,为对应的人类基因得出一个表达值。我可以通过将表达数据转换为矩阵格式(允许重复的行名)并应用 aggregate() 函数来实现这一点,但对于大型数据集,这需要花费很长的时间。这里很难举例说明具体情况,但我的模拟分析流程如下:
# Mock gene expression matrix:
#   rows    = genes (a matrix tolerates duplicated row names)
#   columns = expression values from individual cells
data <- cbind(
  cell1 = c(1, 1, 1, 1, 3, 3),
  cell2 = c(1, 2, 4, 10, 5, 10),
  cell3 = c(0, 0, 0, 1, 10, 20),
  cell4 = c(1, 3, 4, 4, 20, 20)
)
# Gene names become the row names; ABC2 and ABC5 each appear twice
rownames(data) <- c("ABC1", "ABC2", "ABC2", "ABC4", "ABC5", "ABC5")
data
#> cell1 cell2 cell3 cell4
#> ABC1 1 1 0 1
#> ABC2 1 2 0 3
#> ABC2 1 4 0 4
#> ABC4 1 10 1 4
#> ABC5 3 5 10 20
#> ABC5 3 10 20 20
# Collapse repeated measurements: within each gene name, take the mean of
# every cell column (this is the step that is slow on the full data set)
aggr_data <- aggregate(x = data, by = list(rownames(data)), FUN = mean)
# Desired end result: one row per gene
aggr_data
#> Group.1 cell1 cell2 cell3 cell4
#> 1 ABC1 1 1.0 0 1.0
#> 2 ABC2 1 3.0 0 3.5
#> 3 ABC4 1 10.0 1 4.0
#> 4 ABC5 3 7.5 15 20.0
数据ABC1 1 0 1
#>ABC2 1 2 0 3
#>ABC2 1 4 0 4
#>ABC411014
#>ABC5 3 5 10 20
#>ABC5 3 10 20 20
#对同一基因进行多个测量时的平均基因表达值
aggr_data
使用 data.table 应该可以很好地解决这个问题:
library(data.table)
# Carry the (duplicated) row names along as a "rownames" column, then
# average every cell column within each gene name
as.data.table(data, keep.rownames = "rownames")[
  , lapply(.SD, mean), by = "rownames"
]
# rownames cell1 cell2 cell3 cell4
#1: ABC1 1 1.0 0 1.0
#2: ABC2 1 3.0 0 3.5
#3: ABC4 1 10.0 1 4.0
#4: ABC5 3 7.5 15 20.0
在 SO 上快速搜索一下,可以找到一个比较各种分组操作速度的链接(对于大数据,data.table 是最快的):
您可以尝试 dplyr 的 summarise_all(),配合 mean() 函数,得到每组每列的平均值
library(tidyverse) # attaches dplyr and tibble, among others
# Same toy data as a tibble. Tibbles do not carry row names, so the gene
# name is stored as an ordinary column. tibble() replaces the deprecated
# data_frame() constructor. The outer parentheses print the result.
(df <-
  tibble(
    cell1 = c(1, 1, 1, 1, 3, 3),
    cell2 = c(1, 2, 4, 10, 5, 10),
    cell3 = c(0, 0, 0, 1, 10, 20),
    cell4 = c(1, 3, 4, 4, 20, 20),
    gene_name = c("ABC1", "ABC2", "ABC2", "ABC4", "ABC5", "ABC5")
  ))
#> # A tibble: 6 x 5
#> cell1 cell2 cell3 cell4 gene_name
#> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 1 1 0 1 ABC1
#> 2 1 2 0 3 ABC2
#> 3 1 4 0 4 ABC2
#> 4 1 10 1 4 ABC4
#> 5 3 5 10 20 ABC5
#> 6 3 10 20 20 ABC5
通常,对于像您这样的大型数据集,data.table 包更为合适,代码如下:
# setDT() turns df into a data.table in place (by reference); the query
# then averages every remaining column within each gene_name group
setDT(df)
df[, lapply(.SD, mean), by = "gene_name"]
#> gene_name cell1 cell2 cell3 cell4
#> 1: ABC1 1 1.0 0 1.0
#> 2: ABC2 1 3.0 0 3.5
#> 3: ABC4 1 10.0 1 4.0
#> 4: ABC5 3 7.5 15 20.0
setDT 仅用于将数据转换为 data.table 对象
dplyr 与 data.table 的速度比较
如果通过拼接(bind)数据构造出一个更大的数据集:
df_bench
#># A tibble: 18,000 x 10,001
#> gene_name cell1 cell2 cell3 cell4 cell5 cell6 cell7
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ABC308 1 1 0 1 1 1 0
#> 2 ABC258 1 2 0 3 1 2 0
#> 3 ABC553 1 4 0 4 1 4 0
#> 4 ABC57 1 10 1 4 1 10 1
#> 5 ABC469 3 5 10 20 3 5 10
#> 6 ABC484 3 10 20 20 3 10 20
#> 7 ABC813 1 1 0 1 1 1 0
#> 8 ABC371 1 2 0 3 1 2 0
#> 9 ABC547 1 4 0 4 1 4 0
#>10 ABC171 1 10 1 4 1 10 1
#># ... with 17,990 more rows, and 9,993 more variables:
#># cell8 <dbl>, cell9 <dbl>, cell10 <dbl>,
#># cell11 <dbl>, cell12 <dbl>, cell13 <dbl>,
#># cell14 <dbl>, cell15 <dbl>, cell16 <dbl>,
#># cell17 <dbl>, cell18 <dbl>, cell19 <dbl>,
#># cell20 <dbl>, cell21 <dbl>, cell22 <dbl>,
#># cell23 <dbl>, cell24 <dbl>, cell25 <dbl>,
#># cell26 <dbl>, cell27 <dbl>, cell28 <dbl>,
#># cell29 <dbl>, cell30 <dbl>, cell31 <dbl>,
#># cell32 <dbl>, cell33 <dbl>, cell34 <dbl>,
#># cell35 <dbl>, cell36 <dbl>, cell37 <dbl>,
#># cell38 <dbl>, cell39 <dbl>, cell40 <dbl>,
#># cell41 <dbl>, cell42 <dbl>, cell43 <dbl>,
#># cell44 <dbl>, cell45 <dbl>, cell46 <dbl>,
#># cell47 <dbl>, cell48 <dbl>, cell49 <dbl>,
#># cell50 <dbl>, cell51 <dbl>, cell52 <dbl>,
#># cell53 <dbl>, cell54 <dbl>, cell55 <dbl>,
#># cell56 <dbl>, cell57 <dbl>, cell58 <dbl>,
#># cell59 <dbl>, cell60 <dbl>, cell61 <dbl>,
#># cell62 <dbl>, cell63 <dbl>, cell64 <dbl>,
#># cell65 <dbl>, cell66 <dbl>, cell67 <dbl>,
#># cell68 <dbl>, cell69 <dbl>, cell70 <dbl>,
#># cell71 <dbl>, cell72 <dbl>, cell73 <dbl>,
#># cell74 <dbl>, cell75 <dbl>, cell76 <dbl>,
#># cell77 <dbl>, cell78 <dbl>, cell79 <dbl>,
#># cell80 <dbl>, cell81 <dbl>, cell82 <dbl>,
#># cell83 <dbl>, cell84 <dbl>, cell85 <dbl>,
#># cell86 <dbl>, cell87 <dbl>, cell88 <dbl>,
#># cell89 <dbl>, cell90 <dbl>, cell91 <dbl>,
#># cell92 <dbl>, cell93 <dbl>, cell94 <dbl>,
#># cell95 <dbl>, cell96 <dbl>, cell97 <dbl>,
#># cell98 <dbl>, cell99 <dbl>, cell100 <dbl>,
#># cell101 <dbl>, cell102 <dbl>, cell103 <dbl>,
#># cell104 <dbl>, cell105 <dbl>, cell106 <dbl>,
#># cell107 <dbl>, …
在这个例子中,data.table 似乎比 dplyr 快。
谢谢您的回答。不知为何,我在搜索时漏掉了您分享的那个链接,那里的信息非常有用!
感谢您在这里的详细解释,点赞!
df_bench
#># A tibble: 18,000 x 10,001
#> gene_name cell1 cell2 cell3 cell4 cell5 cell6 cell7
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ABC308 1 1 0 1 1 1 0
#> 2 ABC258 1 2 0 3 1 2 0
#> 3 ABC553 1 4 0 4 1 4 0
#> 4 ABC57 1 10 1 4 1 10 1
#> 5 ABC469 3 5 10 20 3 5 10
#> 6 ABC484 3 10 20 20 3 10 20
#> 7 ABC813 1 1 0 1 1 1 0
#> 8 ABC371 1 2 0 3 1 2 0
#> 9 ABC547 1 4 0 4 1 4 0
#>10 ABC171 1 10 1 4 1 10 1
#># ... with 17,990 more rows, and 9,993 more variables:
#># cell8 <dbl>, cell9 <dbl>, cell10 <dbl>,
#># cell11 <dbl>, cell12 <dbl>, cell13 <dbl>,
#># cell14 <dbl>, cell15 <dbl>, cell16 <dbl>,
#># cell17 <dbl>, cell18 <dbl>, cell19 <dbl>,
#># cell20 <dbl>, cell21 <dbl>, cell22 <dbl>,
#># cell23 <dbl>, cell24 <dbl>, cell25 <dbl>,
#># cell26 <dbl>, cell27 <dbl>, cell28 <dbl>,
#># cell29 <dbl>, cell30 <dbl>, cell31 <dbl>,
#># cell32 <dbl>, cell33 <dbl>, cell34 <dbl>,
#># cell35 <dbl>, cell36 <dbl>, cell37 <dbl>,
#># cell38 <dbl>, cell39 <dbl>, cell40 <dbl>,
#># cell41 <dbl>, cell42 <dbl>, cell43 <dbl>,
#># cell44 <dbl>, cell45 <dbl>, cell46 <dbl>,
#># cell47 <dbl>, cell48 <dbl>, cell49 <dbl>,
#># cell50 <dbl>, cell51 <dbl>, cell52 <dbl>,
#># cell53 <dbl>, cell54 <dbl>, cell55 <dbl>,
#># cell56 <dbl>, cell57 <dbl>, cell58 <dbl>,
#># cell59 <dbl>, cell60 <dbl>, cell61 <dbl>,
#># cell62 <dbl>, cell63 <dbl>, cell64 <dbl>,
#># cell65 <dbl>, cell66 <dbl>, cell67 <dbl>,
#># cell68 <dbl>, cell69 <dbl>, cell70 <dbl>,
#># cell71 <dbl>, cell72 <dbl>, cell73 <dbl>,
#># cell74 <dbl>, cell75 <dbl>, cell76 <dbl>,
#># cell77 <dbl>, cell78 <dbl>, cell79 <dbl>,
#># cell80 <dbl>, cell81 <dbl>, cell82 <dbl>,
#># cell83 <dbl>, cell84 <dbl>, cell85 <dbl>,
#># cell86 <dbl>, cell87 <dbl>, cell88 <dbl>,
#># cell89 <dbl>, cell90 <dbl>, cell91 <dbl>,
#># cell92 <dbl>, cell93 <dbl>, cell94 <dbl>,
#># cell95 <dbl>, cell96 <dbl>, cell97 <dbl>,
#># cell98 <dbl>, cell99 <dbl>, cell100 <dbl>,
#># cell101 <dbl>, cell102 <dbl>, cell103 <dbl>,
#># cell104 <dbl>, cell105 <dbl>, cell106 <dbl>,
#># cell107 <dbl>, …
# Convert once, outside the timed expressions. setDT() works by reference,
# so calling it inside the benchmark turns df_bench itself into a data.table
# after the first DATATABLE evaluation; the DPLYR arm would then be timed on
# a data.table rather than the tibble, and the conversion cost would land in
# a single iteration. as.data.table() returns a new object and leaves
# df_bench untouched (NOTE: this may need memory for a second copy).
dt_bench <- as.data.table(df_bench)

# Time the per-gene column means with each package on identical data
microbenchmark::microbenchmark(
  DPLYR = {
    df_bench %>%
      group_by(gene_name) %>%
      summarise_all(mean)
  },
  DATATABLE = {
    dt_bench[, lapply(.SD, mean), by = gene_name]
  },
  times = 50
)
#> Unit: seconds
#> expr min lq mean median uq max neval
#> DPLYR 32.82307 34.89050 38.10948 37.44543 40.01937 47.67549 50
#> DATATABLE 12.16752 13.59018 16.09665 14.25976 15.60752 40.30257 50