R 存储单元行和每个存储单元计算离散度并返回异常值_R

R：按行分箱，在每个箱内计算离散度并返回异常值

R 存储单元行和每个存储单元计算离散度并返回异常值,r,R,我有一个data.frame，它相当大，有12374行（基因）和785列（细胞）。我想根据rowMeans将行分组到20个箱子中。在每个仓位中，我想对该仓位中所有基因的离散度（方差/均值）进行z-归一化，以便识别异常基因，即使与具有相似平均表达的基因相比，其表达值也是高度可变的。然后，我想提取出超过z评分阈值1.7的基因，以从每个bin中识别出显著可变的基因我的数据如下所示： > head(temp[,1:5]) Cell1

我有一个data.frame，它相当大，有12374行（基因）和785列（细胞）。我想根据

rowMeans

将行分组到 20 个箱（bin）中。在每个箱内，我想对该箱中所有基因的离散度（方差/均值）进行 z 分数标准化，以便识别异常基因——即与平均表达量相近的基因相比，表达值仍然高度可变的基因。然后，我想提取 z 分数超过阈值 1.7 的基因，从而在每个箱中找出显著可变的基因。

我的数据如下所示：

> head(temp[,1:5])
                         Cell1                Cell2                 Cell3              Cell4                 Cell5
0610007P14RIK            0.1439444            0.0000000             0.000000            0.8759335            0.0000000
0610009B22RIK            0.0000000            0.6776718             0.000000            0.0000000            0.0000000
0610009O20RIK            0.1439444            0.0000000             0.000000            0.2735741            0.0000000
0610010B08RIK            1.4769893            1.1369215             1.124842            0.8759335            1.9544187
0610010F05RIK            0.7944809            0.0000000             0.000000            0.7016789            0.9144108
0610010K14RIK            0.1439444            0.0000000             1.124842            0.7016789            0.0000000

我曾尝试使用

dplyr

来执行此操作，但遇到了一个（我认为）与分箱数量有关的错误。这是我的尝试：

# Asker's attempt: compute a per-gene dispersion, split the genes into
# n_bins groups with ntile(), then z-score the dispersion within each group
# and flag rows whose absolute z-score exceeds 1.7.
library(dplyr)
# NOTE(review): presumably loaded for rowVars() -- confirm.
library(genefilter)
n_bins = 20
temp = data
# NOTE(review): this computes mean / variance, whereas the question text
# describes dispersion as variance / mean -- confirm which one is intended.
temp$dispersion = rowMeans(temp)/rowVars(temp)
# NOTE(review): the bins are built from `dispersion`, while the question text
# asks for bins based on rowMeans -- confirm.
# scale() returns a one-column matrix rather than a plain vector; this is
# presumably what triggers the "dims ... do not match" error -- TODO confirm.
outscore = temp %>% mutate(bin=ntile(dispersion,n_bins)) %>% 
  group_by(bin) %>% mutate(zscore=scale(dispersion),outlier=abs(zscore)>1.7)

返回的错误为：dims [product 619] do not match the length of object [618]

修订版：下面是一个基于 base R 的解决方案，并借助了 dplyr：

library(dplyr)

# The data set is called 'mydata'.
# Label its first column "ID" and record that column's position in `a`, so
# the column can be left out of the numeric row calculations.
colnames(mydata)[1] <- "ID"
a <- which(colnames(mydata) == "ID")

##from: http://www.inside-r.org/packages/cran/metaMA/docs/rowVars

# Row-wise sample variance of a numeric matrix or data frame.
#
# x:     numeric matrix or data frame; one variance is returned per row.
# na.rm: if TRUE (the default), missing values are ignored when computing
#        the row means and the sums of squared deviations.
#
# Returns a numeric vector with one entry per row. Rows with at most one
# non-missing value yield NA, because the n - 1 denominator is undefined.
rowVars <- function(x, na.rm = TRUE) {
  # Number of observed (non-NA) values in each row.
  n_obs <- rowSums(!is.na(x))
  n_obs <- replace(n_obs, n_obs <= 1, NA)

  # Deviation of every value from the mean of its own row.
  centered <- x - rowMeans(x, na.rm = na.rm)

  rowSums(centered * centered, na.rm = na.rm) / (n_obs - 1)
}

# Per-gene dispersion, computed over every column except the ID column.
# NOTE(review): this is mean / variance, mirroring the code in the question;
# the question text describes dispersion as variance / mean -- swap the two
# terms if that is what is wanted.
mydata$dispersion <- rowMeans(mydata[, -a]) / rowVars(mydata[, -a])

nbins <- 2 # for you, use 20, or however many you want.

# Split the genes into `nbins` roughly equal-sized groups.
# NOTE(review): the groups are formed on `dispersion`; the question text asks
# for bins based on rowMeans -- confirm which key is intended.
mydata$bin <- ntile(mydata$dispersion, nbins)

# z-score of the dispersion within each bin. Rows without a bin (missing or
# NaN dispersion, e.g. a gene with zero mean and zero variance) keep
# Z = NA instead of breaking the per-bin computation.
has_bin <- !is.na(mydata$bin)
mydata$Z <- NA_real_
mydata$Z[has_bin] <- ave(
  mydata$dispersion[has_bin],
  mydata$bin[has_bin],
  FUN = function(v) (v - mean(v)) / sd(v)
)

# 1 marks a gene whose |z| exceeds the 1.7 threshold, 0 otherwise.
mydata$outlier <- ifelse(abs(mydata$Z) > 1.7, 1, 0)

# For display purposes: pick the summary columns by name so the selection
# does not depend on how many cell columns the data set has.
mydata.small <- mydata[, c("ID", "dispersion", "bin", "Z", "outlier")]
mydata.small

           ID dispersion bin          Z outlier
0610007P14RIK   1.406851   1 -0.9370254       0
0610009B22RIK   1.475641   1 -0.1158566       0
0610009O20RIK   5.502857   2  0.1333542       0
0610010B08RIK   7.553503   2  0.9266318       0
0610010F05RIK   2.418036   2 -1.0599860       0
0610010K14RIK   1.573546   1  1.0528820       0

library(dplyr)
# 我将数据集称为“mydata”
colnames(mydata)[1]

修订版：下面是一个用 base R 编写的解决方案，并借助了 dplyr：
library(dplyr)

# The data set is called 'mydata'.
# Label its first column "ID" and record that column's position in `a`, so
# the column can be left out of the numeric row calculations.
colnames(mydata)[1] <- "ID"
a <- which(colnames(mydata) == "ID")

##from: http://www.inside-r.org/packages/cran/metaMA/docs/rowVars

# Row-wise sample variance of a numeric matrix or data frame.
#
# x:     numeric matrix or data frame; one variance is returned per row.
# na.rm: if TRUE (the default), missing values are ignored when computing
#        the row means and the sums of squared deviations.
#
# Returns a numeric vector with one entry per row. Rows with at most one
# non-missing value yield NA, because the n - 1 denominator is undefined.
rowVars <- function(x, na.rm = TRUE) {
  # Number of observed (non-NA) values in each row.
  n_obs <- rowSums(!is.na(x))
  n_obs <- replace(n_obs, n_obs <= 1, NA)

  # Deviation of every value from the mean of its own row.
  centered <- x - rowMeans(x, na.rm = na.rm)

  rowSums(centered * centered, na.rm = na.rm) / (n_obs - 1)
}

# Per-gene dispersion, computed over every column except the ID column.
# NOTE(review): this is mean / variance, mirroring the code in the question;
# the question text describes dispersion as variance / mean -- swap the two
# terms if that is what is wanted.
mydata$dispersion <- rowMeans(mydata[, -a]) / rowVars(mydata[, -a])

nbins <- 2 # for you, use 20, or however many you want.

# Split the genes into `nbins` roughly equal-sized groups.
# NOTE(review): the groups are formed on `dispersion`; the question text asks
# for bins based on rowMeans -- confirm which key is intended.
mydata$bin <- ntile(mydata$dispersion, nbins)

# z-score of the dispersion within each bin. Rows without a bin (missing or
# NaN dispersion, e.g. a gene with zero mean and zero variance) keep
# Z = NA instead of breaking the per-bin computation.
has_bin <- !is.na(mydata$bin)
mydata$Z <- NA_real_
mydata$Z[has_bin] <- ave(
  mydata$dispersion[has_bin],
  mydata$bin[has_bin],
  FUN = function(v) (v - mean(v)) / sd(v)
)

# 1 marks a gene whose |z| exceeds the 1.7 threshold, 0 otherwise.
mydata$outlier <- ifelse(abs(mydata$Z) > 1.7, 1, 0)

# For display purposes: pick the summary columns by name so the selection
# does not depend on how many cell columns the data set has.
mydata.small <- mydata[, c("ID", "dispersion", "bin", "Z", "outlier")]
mydata.small

           ID dispersion bin          Z outlier
0610007P14RIK   1.406851   1 -0.9370254       0
0610009B22RIK   1.475641   1 -0.1158566       0
0610009O20RIK   5.502857   2  0.1333542       0
0610010B08RIK   7.553503   2  0.9266318       0
0610010F05RIK   2.418036   2 -1.0599860       0
0610010K14RIK   1.573546   1  1.0528820       0    

library(dplyr)
# 我将数据集称为“mydata”
colnames(mydata)[1]