在具有条件的R中使用for循环_R_For Loop_Bioinformatics

在具有条件的R中使用for循环

r for-loop

在具有条件的R中使用for循环,r,for-loop,bioinformatics,R,For Loop,Bioinformatics,我正在尝试编写一个rscript，它将使用bioconductor软件包中的biomaRt找到芯片序列峰值的注释。我正在从这里调整注释代码，我需要找到绑定峰值2.5到5kb范围内的TSS站点。与示例网站不同的是，我必须对整个基因组进行分析我知道代码用于注释——目前，我将代码块复制了22次，而不是循环如果正在迭代的染色体上没有峰值，我还需要找到避免脚本退出的方法 #!/usr/bin/Rscript --vanilla --slave # Change to data directory s

我正在尝试编写一个 R 脚本，使用 Bioconductor 软件包中的 biomaRt 为 ChIP-seq 峰查找注释。我正在改编这里的注释代码，需要找到距结合峰 2.5 到 5 kb 范围内的 TSS（转录起始位点）。与示例网站不同的是，我必须对整个基因组进行分析。

我知道这段注释代码本身是可以工作的——目前，我是把代码块复制了22次，而不是使用循环。

如果正在迭代的染色体上没有峰值，我还需要找到避免脚本退出的方法

#!/usr/bin/Rscript --vanilla --slave

# Work from the directory that holds the data files.
setwd("/data/met/bowtie_out/tAfiles/MLE15/2/")

# Mirror everything printed from here on into a log file as well as the
# terminal (split = TRUE keeps the terminal copy).
sink("09June2013_spp.txt", append = FALSE, split = TRUE)

# Packages: biomaRt for the annotation queries, plyr for llply().
library(biomaRt)
library(plyr)

load("MLE15_pooled_2.Rdata")

# `y` is the SPP score; it has to be truncated after the IDR analysis, so
# only rows scoring above the cut-off are kept in each element.
bp <- llply(region.peaks$npl, subset, y > 12.0633)

# Report how many peaks survive, summed over all elements of `bp`.
print(paste(
  "After filtering",
  sum(vapply(bp, function(d) length(d$x), integer(1))),
  "peaks remain"
))

# Checkpoint the workspace before the annotation step.
save.image(file = "09Jun13_1.RData")

# Begin collecting annotation; have to use mm10.
ensembl <- useMart(
  "ensembl",
  dataset = "mmusculus_gene_ensembl",
  host = "apr2013.archive.ensembl.org"
)

# need to make a for loop that will loop through all of the chromosomes and not error out if no peaks are on that chromosome.

# So for this 'for' loop in R do I need to make a list? like for (z in (c('1'....etc?

# Helper definitions do not depend on the chromosome, so they are created
# once, ahead of the loop, instead of being rebuilt on every iteration.

# Is binding site `bs` strictly within `l` bases of the site(s) `ts`?
#   bs  position of one binding site (peak)
#   ts  one or more reference positions
#   l   half-width of the window
# Returns a logical vector as long as `ts`. The single `&` makes the test
# element-wise, so one call compares a peak against every gene at once.
overlap <- function(bs, ts, l) {
    (bs > ts - l) & (bs < ts + l)
}

# Flag every gene whose 5' end lies within `l` bases of at least one peak.
#   bs       peak positions (may be NULL or empty: no peaks)
#   ts, te   gene start and end positions
#   s        strand; 1 selects `ts` as the 5' end, anything else selects `te`
#   l        half-width of the window around the 5' end
#   n, c, d  never used; kept so existing eight-argument calls still work
# Returns a logical vector of length(ts); all FALSE when there are no peaks.
fivePrimeGenes <- function(bs, ts, te, s, l, n, c, d) {
    fivePrimeEnd <- ifelse(s == 1, ts, te)
    # Pre-allocate (all FALSE) rather than growing the vector with c().
    fivePrimeVec <- logical(length(fivePrimeEnd))
    # seq_along() yields an empty sequence for zero peaks, whereas
    # 1:length(bs) would iterate over c(1, 0) and index past the data.
    for (j in seq_along(bs)) {
        fivePrimeVec <- fivePrimeVec | overlap(bs[j], fivePrimeEnd, l)
    }
    fivePrimeVec
}

# A loop variable in R is an ordinary variable: it needs no sigil (unlike
# `$z` in bash) and must NOT be quoted -- "z" is just the one-letter string.
# The values to iterate over must be a vector built with c(); 1:19 is
# coerced to character because it is combined with "X", "Y" and "M".
# NOTE(review): Ensembl may name the mitochondrial chromosome "MT" rather
# than "M" -- confirm against the mart before relying on that iteration.
for (z in c(1:19, "X", "Y", "M")) {

    genes.chrz <- getBM(
        attributes = c("chromosome_name", "start_position", "end_position",
                       "strand", "description", "entrezgene"),
        filters = "chromosome_name", values = z, mart = ensembl
    )

    # Look the peaks up by a name built from z; `bp$chrz` would search for
    # an element literally called "chrz". The result is NULL when `bp` has
    # no entry for this chromosome, which fivePrimeGenes() treats as
    # "no peaks" instead of stopping the script.
    # NOTE(review): assumes the elements of `bp` are named "chr1", "chr2",
    # ... "chrX" -- confirm with names(bp).
    peaks <- bp[[paste0("chr", z)]]$x

    fivePrimeGenesLogical <- fivePrimeGenes(
        peaks, genes.chrz$start_position, genes.chrz$end_position,
        genes.chrz$strand, 5000
    )

    # Take the matching rows straight from the query result, with columns in
    # the order of the output file, instead of re-assembling six
    # one-column data frames.
    AnnotationData <- genes.chrz[
        fivePrimeGenesLogical,
        c("chromosome_name", "entrezgene", "start_position",
          "end_position", "strand", "description"),
        drop = FALSE
    ]

    # One output file per chromosome; the name is built from z.
    write.table(AnnotationData,
                file = sprintf("chr%s_annotation_data.csv", z),
                row.names = FALSE, col.names = FALSE, sep = "\t")
}

# Checkpoint the workspace.
save.image(file = "09Jun13_2.RData")

# Stop diverting output to the log file.
sink()

# Remove every object from the workspace, hidden ones included.
rm(list = ls(all.names = TRUE))

# Leave R, saving the workspace on the way out.
quit(save = "yes")

#!/usr/bin/Rscript --vanilla --slave
# 切换到数据目录
setwd("/data/met/bowtie_out/tAfiles/MLE15/2/");
# 将输出同时发送到文件和终端
sink("09June2013_spp.txt", append=FALSE, split=TRUE);
# 加载库
library(biomaRt);
library(plyr);
load("MLE15_pooled_2.Rdata");
# y 等于 SPP 分数。我必须在 IDR 分析后对其进行截断。
bp <- llply(region.peaks$npl, subset, y > 12.0633)
print(paste("After filtering",sum(unlist(lapply(bp,function(d) length(d$x)))),"peaks remain"));
save.image(file="09Jun13_1.RData");
# 开始收集注释，必须使用 mm10。
ensembl= useMart('ensembl', dataset='mmusculus_gene_ensembl', host="apr2013.archive.ensembl.org");
# 需要做一个 for 循环，遍历所有染色体，并且在某条染色体上没有峰时不会出错。
# 那么，对于 R 中的这个 for 循环，我需要构造一个列表吗？比如 for (z in (c('1'.... 等？
for ( z in ('1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'X', 'Y', 'M' ) {
# 要把正在循环的变量与其他变量拼在一起，我是否需要添加某种 R 特有的符号，类似 bash 的 for 循环中的 $？我可以把变量放在引号里吗——比如作为 biomaRt 的输入？例如 values= "z"？
genes.chrz = getBM(attributes = c("chromosome_name", "start_position", "end_position", "strand", "description", "entrezgene"), filters = "chromosome_name", values= "z", mart = ensembl);
overlap = function(bs, ts, l)
{
    if ((bs > ts - l) && (bs < …（原网页中的代码在此处被截断）

让我们先获取注释；在这个阶段不需要按染色体循环：
# Fetch the gene annotation in a single query: no chromosome filter is
# passed to getBM(), so there is nothing to loop over at this stage.
library(biomaRt)
ensembl <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "mmusculus_gene_ensembl",
  host = "apr2013.archive.ensembl.org"
)
genes.chrz <- getBM(
  attributes = c(
    "chromosome_name", "start_position", "end_position",
    "strand", "description", "entrezgene"
  ),
  mart = ensembl
)

看起来你感兴趣的是每个基因起始位置上游（5' 端）5000 nt 的侧翼区域
# Compute 5000-wide flanking regions of `genes`.
# NOTE(review): `genes` is not defined in the visible snippet (the query
# above stores its result in `genes.chrz`) and no package providing flank()
# is loaded here. Presumably `genes` is a GRanges object built from that
# query result and flank() comes from GenomicRanges -- TODO confirm.
flanks <- flank(genes, 5000)

在相关 Bioconductor 软件包（例如 GenomicRanges 和 biomaRt）的主页上有大量的使用指南（vignette）；邮件列表也是一个有用的资源
但是如果你真的想使用 Matt 提供的代码：我首先删除了所有的“;”（在 R 中不需要），并将代码折行到 80 列以内以便阅读。然后，我把不随循环变化的函数定义移到了循环之外，并修改了 overlap，使其作用于向量 ts，而不是标量
# Test whether `bs` lies strictly inside the window of half-width `l`
# centred on each element of `ts`.
#   bs  a single position
#   ts  vector of positions to test against
#   l   half-width of the window
# Returns a logical vector as long as `ts`; the element-wise `&` is what
# lets one call cover every element of `ts`.
overlap <- function(bs, ts, l) {
    lower <- ts - l
    upper <- ts + l
    (bs > lower) & (bs < upper)
}

好的，对于主循环，让 R 创建序列 1:19，并强制转换为字符。在查询 biomaRt 以及创建要写入数据的文件名时，使用变量 z 而不是字符串 "z"。使用 with() 是一种既方便又有风险的做法；在这种情况下，它有助于整理代码，因此我选择使用它。fivePrimeStarts 在后续计算中未使用，因此我将其删除。看起来可以直接把注释数据创建为现有数据框的子集，而不是用各个取子集得到的部分进行容易出错的拼装
# Annotate one chromosome per iteration and write one tab-separated file
# per chromosome. c() coerces 1:19 to character next to "X", "Y", "M".
# NOTE(review): Ensembl may name the mitochondrial chromosome "MT" rather
# than "M" -- confirm against the mart before relying on that iteration.
for (z in c(1:19, "X", "Y", "M")) {
    ## use the variable z, rather than the string "z"; attributes are
    ## listed in the order wanted in the output, each exactly once
    ## ("entrezgene" was previously requested twice)
    attributes <- c("entrezgene", "chromosome_name", "start_position",
                    "end_position", "strand", "description")
    genes.chrz <- getBM(attributes = attributes,
                        filters = "chromosome_name", values = z,
                        mart = ensembl)

    ## peaks for this chromosome, looked up by a name built from z;
    ## `bp$chrz` would look for an element literally called "chrz".
    ## NULL (no entry in `bp`) is passed on as "no peaks".
    ## NOTE(review): assumes elements of `bp` are named "chr1", ...,
    ## "chrX" -- confirm with names(bp).
    peaks <- bp[[sprintf("chr%s", z)]]$x

    ## use 'with' to simplify common column selection
    fivePrimeGenesLogical <- with(genes.chrz, {
        fivePrimeGenes(peaks, start_position, end_position, strand, 5000)
    })

    ## AnnotationData as subset, rather than assembling
    AnnotationData <- genes.chrz[fivePrimeGenesLogical, , drop = FALSE]

    ## use 'sprintf' to create a file name
    write.table(AnnotationData,
                file = sprintf("chr%s_annotation_data.csv", z),
                row.names = FALSE, col.names = FALSE, sep = "\t")
}

for ( z in c( 1:19, "X", "Y", "M" ) )  {
    ## 使用变量 z，而不是字符串 "z"；按最终输出所需的顺序
    ## 排列属性
    attributes …（原网页中的代码在此处被截断）

确实很混乱。但这里有一个提示：尝试 for (z in c(as.character(1:19), "X", "Y", "M"))。为了简洁起见，可以将 for 循环调用改为 for (z in c(1:19, "X", "Y", "M"))。您也可以将函数声明移到循环之外。好的，谢谢 Thomas。Haki，你指的是哪个函数？谢谢 Martin。我可以让它正常工作，只是在脚本中把代码重复了22次。我想我应该提出一个更直接的、关于 R 的问题。我认为这个循环是不必要的，而且写得非常低效；不用循环而直接处理向量，并使用专为你要解决的问题而设计的现有软件包（如 GenomicRanges），就可以避免所有这些麻烦。@Mattshorton 我在我的答案中添加了使用您的代码的内容；您要找的答案可能就在 getBM 和 write.table 这两行中，但希望其他建议也能有所帮助。
# Test whether `bs` lies strictly inside the window of half-width `l`
# centred on each element of `ts`.
#   bs  a single position
#   ts  vector of positions to test against
#   l   half-width of the window
# Returns a logical vector as long as `ts`; the element-wise `&` is what
# lets one call cover every element of `ts`.
overlap <- function(bs, ts, l) {
    lower <- ts - l
    upper <- ts + l
    (bs > lower) & (bs < upper)
}

# Flag each gene whose 5' end lies within `l` of at least one peak.
#   bs      peak positions
#   ts, te  gene start and end positions
#   s       strand; where s == 1 the start is used, otherwise the end
#   l       half-width of the window passed on to overlap()
# Returns a logical vector of length(ts).
fivePrimeGenes <- function(bs, ts, te, s, l) {
    ## start from "no gene matched"; sized up front instead of appended to
    matched <- logical(length(ts))
    ## strand-appropriate end of every gene, chosen in one vectorised step
    fivePrimeEnd <- ifelse(s == 1, ts, te)
    ## one pass per peak; overlap() handles all genes at once, and
    ## seq_along() gives zero iterations when there are no peaks
    for (peakIdx in seq_along(bs)) {
        matched <- matched | overlap(bs[peakIdx], fivePrimeEnd, l)
    }
    matched
}

# Annotate one chromosome per iteration and write one tab-separated file
# per chromosome. c() coerces 1:19 to character next to "X", "Y", "M".
# NOTE(review): Ensembl may name the mitochondrial chromosome "MT" rather
# than "M" -- confirm against the mart before relying on that iteration.
for (z in c(1:19, "X", "Y", "M")) {
    ## use the variable z, rather than the string "z"; attributes are
    ## listed in the order wanted in the output, each exactly once
    ## ("entrezgene" was previously requested twice)
    attributes <- c("entrezgene", "chromosome_name", "start_position",
                    "end_position", "strand", "description")
    genes.chrz <- getBM(attributes = attributes,
                        filters = "chromosome_name", values = z,
                        mart = ensembl)

    ## peaks for this chromosome, looked up by a name built from z;
    ## `bp$chrz` would look for an element literally called "chrz".
    ## NULL (no entry in `bp`) is passed on as "no peaks".
    ## NOTE(review): assumes elements of `bp` are named "chr1", ...,
    ## "chrX" -- confirm with names(bp).
    peaks <- bp[[sprintf("chr%s", z)]]$x

    ## use 'with' to simplify common column selection
    fivePrimeGenesLogical <- with(genes.chrz, {
        fivePrimeGenes(peaks, start_position, end_position, strand, 5000)
    })

    ## AnnotationData as subset, rather than assembling
    AnnotationData <- genes.chrz[fivePrimeGenesLogical, , drop = FALSE]

    ## use 'sprintf' to create a file name
    write.table(AnnotationData,
                file = sprintf("chr%s_annotation_data.csv", z),
                row.names = FALSE, col.names = FALSE, sep = "\t")
}