R 将基因位置映射到染色体坐标

R 将基因位置映射到染色体坐标,r,sapply,R,Sapply,第一篇文章在这里,所以我希望我能解释自己在最好的 我需要交叉引用两个数据帧,通过查找两个数据帧之一中给定的一个特定染色体位置是否出现在另一个数据帧提供的范围内,因此我希望有一个包含该范围内基因的新列 “基因”是数据框,坐标(开始/结束)被视为范围 head(genes) # A tibble: 6 x 9 chr source type start end strand gene_id symbol gene_bioty

这是我第一次在这里发帖,希望我能把问题解释清楚。

我需要交叉引用两个数据帧,通过查找两个数据帧之一中给定的一个特定染色体位置是否出现在另一个数据帧提供的范围内,因此我希望有一个包含该范围内基因的新列

`genes` 是一个数据框,其中的坐标(`start`/`end`)被视为一个范围:

head(genes)
# A tibble: 6 x 9
  chr   source         type      start       end strand gene_id         symbol        gene_biotype  
  <chr> <chr>          <chr>     <int>     <int> <chr>  <chr>           <chr>         <chr>         
1 2     pseudogene     gene  143300987 143301544 +      ENSG00000228134 AC092578.1    pseudogene    
2 2     pseudogene     gene  143611664 143613567 +      ENSG00000229781 AC013444.1    pseudogene    
3 2     protein_coding gene  143635067 143799890 +      ENSG00000115919 KYNU          protein_coding
4 2     pseudogene     gene  143704869 143705655 -      ENSG00000270390 RP11-470B22.1 pseudogene    
5 2     miRNA          gene  143763269 143763360 -      ENSG00000221169 AC013444.2    miRNA         
6 2     protein_coding gene  143848931 144525921 +      ENSG00000075884 ARHGAP15      protein_coding
我基本上需要找出 `point A` 是否落在 `genes` 中某个基因的 `start`/`end` 范围内,以及它对应的是哪个基因符号(`symbol`)。

我尝试了以下方法:

# Asker's attempt: for each position in x$`point A`, test whether any gene
# satisfies the range condition, then choose a symbol with ifelse().
# NOTE(review): the condition requires start >= g AND end <= g, which (given
# start <= end) can only hold when a gene's start and end both equal g; a
# "g lies inside [start, end]" test would be start <= g & g <= end.
# NOTE(review): ifelse() takes the element of genes$symbol at the same index
# as the tested point (recycled), not the symbol of the gene that matched.
x$geneA <- ifelse(sapply(x$`point A`, function(g)
  any(genes$start >= g & genes$end <=g)), genes$symbol, NA)
x$geneA <- ifelse(sapply(x$`point A`, function(g) any(genes$start >= g & genes$end <= g)), genes$symbol, NA)

这样可行吗?

我假设每个点只匹配一个基因符号

# Look up, for every position in x$`point A`, the symbol of the first gene in
# `genes` whose [start, end] interval contains it. Positions that fall inside
# no gene get NA; when several genes overlap only the first hit is kept.
# vapply() (rather than sapply()) guarantees a character vector even when x
# has zero rows, where sapply() would return an empty list.
# NOTE(review): the chromosome is not compared here, so this is only valid
# when x and genes refer to the same chromosome -- confirm for your data.
x$geneA <- vapply(
  x$`point A`,
  function(g) filter(genes, g >= start & g <= end)$symbol[1],
  character(1)
)

x$geneA <- sapply(x$`point A`, function(g) filter(genes, g >= start & g <= end)$symbol[1])

下面是一个基于 for 循环的解决方案。(当然,这比使用 `apply` 要慢得多。)

#数据的模型

欢迎来到 Stack Overflow!以后提问时,请发布一个最小的、可重现的示例。

使用 `nest_join` 可以得到合并后的表,其中不匹配的行为 `NA`。或者只查找匹配的行,而不使用嵌套的 tibble:

# Pair every row of x with all genes on the same chromosome, then keep only
# the pairs where `point A` lies within the gene's [start, end] interval.
# Rows of x with no containing gene are removed by the filter.
filter(
    left_join(x, genes, by = c("chr_a" = "chr")),
    start <= `point A`,
    `point A` <= end
)
x %>%
    left_join(genes, by = c("chr_a" = "chr")) %>%
    filter(`point A` >= start & `point A` <= end)

您可以尝试下面的基础 R 代码

# Annotate each position in df2$A with the symbol of the gene in df1 whose
# [start, end] interval contains it, stored in a new `symbol` column.
# Positions outside every gene get NA and only the first hit is kept, so
# `symbol` is always a plain character column. (With sapply() and no `[1]`,
# a position without a hit yields character(0) and the column silently
# becomes a list.)
# NOTE(review): chromosomes are not compared -- this assumes df1 and df2
# describe the same chromosome; confirm before using on other data.
df2out <- within(df2, symbol <- vapply(
  A,
  function(x) df1$symbol[which(x >= df1$start & x <= df1$end)][1],
  character(1)
))
数据

# Example gene table: six annotations on chromosome 2, one row per gene with
# its start/end coordinates, strand, Ensembl id, symbol and biotype.
df1 <- data.frame(
  chr = rep(2L, 6L),
  source = c(
    "pseudogene", "pseudogene", "protein_coding",
    "pseudogene", "miRNA", "protein_coding"
  ),
  type = rep("gene", 6L),
  start = c(
    143300987L, 143611664L, 143635067L,
    143704869L, 143763269L, 143848931L
  ),
  end = c(
    143301544L, 143613567L, 143799890L,
    143705655L, 143763360L, 144525921L
  ),
  strand = c("+", "+", "+", "-", "-", "+"),
  gene_id = c(
    "ENSG00000228134", "ENSG00000229781", "ENSG00000115919",
    "ENSG00000270390", "ENSG00000221169", "ENSG00000075884"
  ),
  symbol = c(
    "AC092578.1", "AC013444.1", "KYNU",
    "RP11-470B22.1", "AC013444.2", "ARHGAP15"
  ),
  gene_biotype = c(
    "pseudogene", "pseudogene", "protein_coding",
    "pseudogene", "miRNA", "protein_coding"
  ),
  stringsAsFactors = FALSE
)

# Example query table: six positions (column A) to be looked up in df1.
# NOTE(review): chr_a is 1:6 and `point` is constantly 2 here, whereas the
# tables printed elsewhere show chr_a = 2 with a single `point A` column --
# this looks like the header was split on whitespace when the data was read;
# verify before relying on chr_a.
df2 <- data.frame(
  chr_a = 1:6,
  point = rep(2L, 6L),
  A = c(
    143301002L, 143625061L, 143700941L,
    143811317L, 144127323L, 144224689L
  ),
  stringsAsFactors = FALSE
)

这个答案很可能永远不会被人看到 =P

有专门处理这类问题的包(例如 GenomicRanges)。请注意,您的代码没有处理多条染色体或不同链的情况

使用@koenniem提供的数据

library(GenomicRanges)

# Represent the gene table as genomic ranges, keeping the remaining columns
# (such as symbol) as metadata columns.
gr1 <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# Rebuild x as a plain data.frame without name checking, so the "point A"
# column keeps its name (including the space).
x <- data.frame(x, check.names = FALSE)
# One width-1 range per query position, on the chromosome given by chr_a.
gr2 <- GRanges(
  seqnames = x$chr_a,
  IRanges(start = x[["point A"]], width = 1)
)

# Default every row to NA, then fill in the gene symbol for each position
# that overlaps a gene range.
x$gene <- NA
ovlp <- findOverlaps(gr2, gr1)
x$gene[queryHits(ovlp)] <- gr1$symbol[subjectHits(ovlp)]

  chr_a   point A       gene
1     2 143301002 AC092578.1
2     2 143625061       <NA>
3     2 143700941       KYNU
4     2 143811317       <NA>
5     2 144127323   ARHGAP15
6     2 144224689   ARHGAP15
library(GenomicRanges)
gr1 = makeGRangesFromDataFrame(genes,keep.extra.columns=TRUE)
x = data.frame(x,check.names=FALSE)
gr2 = GRanges(seqnames=x$chr_a,IRanges(start=x[,"point A"],width=1))
x$gene = NA
ovlp = findOverlaps(gr2,gr1)
x$gene[queryHits(ovlp)] = gr1$symbol[subjectHits(ovlp)]
  chr_a   point A       gene
1     2 143301002 AC092578.1
2     2 143625061       <NA>
3     2 143700941       KYNU
4     2 143811317       <NA>
5     2 144127323   ARHGAP15
6     2 144224689   ARHGAP15

这是否回答了您的问题?以及
# For every row of x, attach the genes on the same chromosome as a nested
# tibble, keep only the genes whose [start, end] interval contains that row's
# `point A`, and unnest again. keep_empty = TRUE retains positions without a
# matching gene (their gene columns become NA).
# map2() pairs each nested gene table with its own position, so no
# group_by(`point A`) is needed: duplicated positions are compared one at a
# time and the result is not left grouped.
x %>%
    nest_join(genes, by = c("chr_a" = "chr")) %>%
    mutate(genes = map2(genes, `point A`, function(gene_tbl, pos) {
        filter(gene_tbl, pos >= start & pos <= end)
    })) %>%
    unnest(genes, keep_empty = TRUE)
# Pair every row of x with all genes on the same chromosome, then keep only
# the pairs where `point A` lies within the gene's [start, end] interval.
# Rows of x with no containing gene are removed by the filter.
filter(
    left_join(x, genes, by = c("chr_a" = "chr")),
    start <= `point A`,
    `point A` <= end
)
# Annotate each position in df2$A with the symbol(s) of the df1 genes whose
# [start, end] interval contains it, stored in a new `symbol` column of
# df2out.
# NOTE(review): a position inside no gene yields character(0), so sapply()
# returns a list here -- presumably why those rows print blank (rather than
# NA) in the output below.
# NOTE(review): chromosomes are not compared; this assumes df1 and df2 refer
# to the same chromosome -- confirm.
df2out <- within(df2,symbol <- sapply(A, function(x) df1$symbol[which(x>=df1$start & x<=df1$end)]))
> df2out
  chr_a point         A     symbol
1     1     2 143301002 AC092578.1
2     2     2 143625061           
3     3     2 143700941       KYNU
4     4     2 143811317           
5     5     2 144127323   ARHGAP15
6     6     2 144224689   ARHGAP15
# Example gene table: six annotations on chromosome 2, one row per gene with
# its start/end coordinates, strand, Ensembl id, symbol and biotype.
df1 <- data.frame(
  chr = rep(2L, 6L),
  source = c(
    "pseudogene", "pseudogene", "protein_coding",
    "pseudogene", "miRNA", "protein_coding"
  ),
  type = rep("gene", 6L),
  start = c(
    143300987L, 143611664L, 143635067L,
    143704869L, 143763269L, 143848931L
  ),
  end = c(
    143301544L, 143613567L, 143799890L,
    143705655L, 143763360L, 144525921L
  ),
  strand = c("+", "+", "+", "-", "-", "+"),
  gene_id = c(
    "ENSG00000228134", "ENSG00000229781", "ENSG00000115919",
    "ENSG00000270390", "ENSG00000221169", "ENSG00000075884"
  ),
  symbol = c(
    "AC092578.1", "AC013444.1", "KYNU",
    "RP11-470B22.1", "AC013444.2", "ARHGAP15"
  ),
  gene_biotype = c(
    "pseudogene", "pseudogene", "protein_coding",
    "pseudogene", "miRNA", "protein_coding"
  ),
  stringsAsFactors = FALSE
)

# Example query table: six positions (column A) to be looked up in df1.
# NOTE(review): chr_a is 1:6 and `point` is constantly 2 here, whereas the
# tables printed elsewhere show chr_a = 2 with a single `point A` column --
# this looks like the header was split on whitespace when the data was read;
# verify before relying on chr_a.
df2 <- data.frame(
  chr_a = 1:6,
  point = rep(2L, 6L),
  A = c(
    143301002L, 143625061L, 143700941L,
    143811317L, 144127323L, 144224689L
  ),
  stringsAsFactors = FALSE
)
library(GenomicRanges)

# Represent the gene table as genomic ranges, keeping the remaining columns
# (such as symbol) as metadata columns.
gr1 <- makeGRangesFromDataFrame(genes, keep.extra.columns = TRUE)

# Rebuild x as a plain data.frame without name checking, so the "point A"
# column keeps its name (including the space).
x <- data.frame(x, check.names = FALSE)
# One width-1 range per query position, on the chromosome given by chr_a.
gr2 <- GRanges(
  seqnames = x$chr_a,
  IRanges(start = x[["point A"]], width = 1)
)

# Default every row to NA, then fill in the gene symbol for each position
# that overlaps a gene range.
x$gene <- NA
ovlp <- findOverlaps(gr2, gr1)
x$gene[queryHits(ovlp)] <- gr1$symbol[subjectHits(ovlp)]

  chr_a   point A       gene
1     2 143301002 AC092578.1
2     2 143625061       <NA>
3     2 143700941       KYNU
4     2 143811317       <NA>
5     2 144127323   ARHGAP15
6     2 144224689   ARHGAP15