计算与R中另一个数据帧重叠的值之和

计算与R中另一个数据帧重叠的值之和,r,R,我有两个数据帧,如下所示 depth chr Pos Nucleotide Coverage chr1 1 A 10 chr1 2 G 12 chr1 3 T 3 chr1 4 A 20 chr1 5 T 22 chr1 6 N 0 chr1 7 N 0 chr2 23 A 1 chr2 24 T 5

我有两个数据帧,如下所示

depth
chr  Pos Nucleotide Coverage
chr1 1   A          10
chr1 2   G          12
chr1 3   T          3
chr1 4   A          20
chr1 5   T          22
chr1 6   N          0
chr1 7   N          0
chr2 23  A          1
chr2 24  T          5
chr2 25  G          15
和另一个间隔数据帧

intervals

chr1  3  5
chr2 23 25
chr4  1 30
我的期望输出如下:如果 depth 数据框中的 Pos 位于 intervals 数据框所示的范围内,并且具有相同的 chr 值,则计算该范围内所有核苷酸的 Coverage 总和,并将其放入第 4 列。

chr1  3  5 45
chr2 23 25 21
chr4  1 30  0


如何使用 R 生成这两个输出数据帧?我的 depth 数据帧文件非常大,大小为 50GB。

dplyr 非常适合这类操作:

# First, read in the example data; both inline tables start with a header row.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.

# `depth`: one row per position, with columns chr, Pos, Nucleotide, Coverage.
depth <- read.table(header = TRUE, text =
"chr  Pos Nucleotide Coverage
chr1 1   A          10
chr1 2   G          12
chr1 3   T          3
chr1 4   A          20
chr1 5   T          22
chr1 6   N          0
chr1 7   N          0
chr2 23  A          1
chr2 24  T          5
chr2 25  G          15")

# `intervals`: one row per query range, with columns chr, start, end.
# NOTE(review): the sqldf queries elsewhere in this file refer to these
# columns as `low`/`high`; rename them before reusing this table there.
intervals <- read.table(header = TRUE, text =
"chr  start   end
chr1  3  5
chr2 23 25
chr4  1 30")

dplyr 非常适合这类操作:

# First, read in the example data; both inline tables start with a header row.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.

# `depth`: one row per position, with columns chr, Pos, Nucleotide, Coverage.
depth <- read.table(header = TRUE, text =
"chr  Pos Nucleotide Coverage
chr1 1   A          10
chr1 2   G          12
chr1 3   T          3
chr1 4   A          20
chr1 5   T          22
chr1 6   N          0
chr1 7   N          0
chr2 23  A          1
chr2 24  T          5
chr2 25  G          15")

# `intervals`: one row per query range, with columns chr, start, end.
# NOTE(review): the sqldf queries elsewhere in this file refer to these
# columns as `low`/`high`; rename them before reusing this table there.
intervals <- read.table(header = TRUE, text =
"chr  start   end
chr1  3  5
chr2 23 25
chr4  1 30")

您可以使用 sqldf:

library(sqldf)

# Per-interval coverage totals. Every row of `intervals` is kept (left join)
# and matched to the `depth` rows on the same chr whose Pos lies between low
# and high; SQL BETWEEN includes both endpoints. Coverage is summed per
# interval, and coalesce() turns the NULL sum of an interval with no matching
# rows into 0. The query expects `intervals` to have columns chr, low, high.
out1 <- sqldf('
select    i.*
          , coalesce(sum(d.Coverage), 0) as CovSum
from      intervals i
          left join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
group by  i.chr, i.low, i.high
')
# Printed result (one row per interval; chr4 has no overlap, so CovSum is 0):
out1        
#    chr low high CovSum
# 1 chr1   3    5     45
# 2 chr2  23   25     21
# 3 chr4   1   30      0

# The `depth` rows that fall inside any interval. This is an inner join, so an
# interval with no matching rows (chr4 here) contributes nothing to the result.
out2 <- sqldf('
select    d.*
from      intervals i
          join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
')
# Printed result:
out2
#    chr Pos Nucleotide Coverage
# 1 chr1   3          T        3
# 2 chr1   4          A       20
# 3 chr1   5          T       22
# 4 chr2  23          A        1
# 5 chr2  24          T        5
# 6 chr2  25          G       15
library(sqldf)

out1您可以使用
sqldf

library(sqldf)

# Per-interval coverage totals. Every row of `intervals` is kept (left join)
# and matched to the `depth` rows on the same chr whose Pos lies between low
# and high; SQL BETWEEN includes both endpoints. Coverage is summed per
# interval, and coalesce() turns the NULL sum of an interval with no matching
# rows into 0. The query expects `intervals` to have columns chr, low, high.
out1 <- sqldf('
select    i.*
          , coalesce(sum(d.Coverage), 0) as CovSum
from      intervals i
          left join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
group by  i.chr, i.low, i.high
')
# Printed result (one row per interval; chr4 has no overlap, so CovSum is 0):
out1        
#    chr low high CovSum
# 1 chr1   3    5     45
# 2 chr2  23   25     21
# 3 chr4   1   30      0

# The `depth` rows that fall inside any interval. This is an inner join, so an
# interval with no matching rows (chr4 here) contributes nothing to the result.
out2 <- sqldf('
select    d.*
from      intervals i
          join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
')
# Printed result:
out2
#    chr Pos Nucleotide Coverage
# 1 chr1   3          T        3
# 2 chr1   4          A       20
# 3 chr1   5          T       22
# 4 chr2  23          A        1
# 5 chr2  24          T        5
# 6 chr2  25          G       15
library(sqldf)

out1为什么范围1到30的总和为0,但子范围23到25的总和大于0?要求和的列是什么?为什么从第二次输出中排除间隔1到30?@IceCreamToucan正在求和深度数据帧的覆盖列。chr4 1 30的和是0,因为深度数据框中没有重叠值。@IceCreamToucan我希望在第二个输出中也包括chr4范围位置。哦,错过了
chr
是连接的一部分这一事实为什么1到30的和是0,而23到25的子范围的和是>0?要求和的列是什么?为什么从第二次输出中排除间隔1到30?@IceCreamToucan正在求和深度数据帧的覆盖列。chr4 1 30的总和是0,因为深度数据框中没有重叠值。@IcecreamTouch我想在第二个输出中也包括chr4范围位置。哦,忽略了
chr
是join的一部分这一事实。我想这确实解决了当一个或两个对象都不适合RAM时该怎么办的问题。我想这确实解决了当一个或两个对象都不适合RAM时该怎么办的问题。谁对这个答案投了反对票/谁对这个答案投了反对票/
library(sqldf)

# Per-interval coverage totals. Every row of `intervals` is kept (left join)
# and matched to the `depth` rows on the same chr whose Pos lies between low
# and high; SQL BETWEEN includes both endpoints. Coverage is summed per
# interval, and coalesce() turns the NULL sum of an interval with no matching
# rows into 0. The query expects `intervals` to have columns chr, low, high.
out1 <- sqldf('
select    i.*
          , coalesce(sum(d.Coverage), 0) as CovSum
from      intervals i
          left join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
group by  i.chr, i.low, i.high
')
# Printed result (one row per interval; chr4 has no overlap, so CovSum is 0):
out1        
#    chr low high CovSum
# 1 chr1   3    5     45
# 2 chr2  23   25     21
# 3 chr4   1   30      0

# The `depth` rows that fall inside any interval. This is an inner join, so an
# interval with no matching rows (chr4 here) contributes nothing to the result.
out2 <- sqldf('
select    d.*
from      intervals i
          join depth d
            on  d.Pos between i.low and i.high
                and d.chr = i.chr
')
# Printed result:
out2
#    chr Pos Nucleotide Coverage
# 1 chr1   3          T        3
# 2 chr1   4          A       20
# 3 chr1   5          T       22
# 4 chr2  23          A        1
# 5 chr2  24          T        5
# 6 chr2  25          G       15
library(data.table)

# Example per-position coverage table, parsed by fread() from the inline text.
# Columns: chr, Pos, Nucleotide, Coverage.
depth <- fread('
chr  Pos Nucleotide Coverage
chr1 1   A          10
chr1 2   G          12
chr1 3   T          3
chr1 4   A          20
chr1 5   T          22
chr1 6   N          0
chr1 7   N          0
chr2 23  A          1
chr2 24  T          5
chr2 25  G          15
')

# Example query ranges, one per row. Columns: chr, low, high (the names the
# sqldf queries in this file refer to). No `depth` row has chr equal to chr4.
intervals <- fread('
chr   low high
chr1  3  5
chr2 23 25
chr4  1 30
')