Perl 匹配不同行上的列并求和

Perl 匹配不同行上的列并求和,perl,Perl,我有一个大约160000行的csv,它看起来像这样: chr1,160,161,3,0.333333333333333,+ chr1,161,162,4,0.5,- chr1,309,310,14,0.0714285714285714,+ chr1,311,312,2,0.5,- chr1,499,500,39,0.717948717948718,+ chr2,500,501,8,0.375,- chr2,510,511,1

我有一个大约160000行的csv,它看起来像这样:

chr1,160,161,3,0.333333333333333,+         
chr1,161,162,4,0.5,-      
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,18,0.5,+         
chr2,511,512,6,0.333333333333333,-    
chr1,160,161,7,0.833333333333333,+         
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,24,0.833333333333333,-  
#!/usr/bin/perl             
use strict;      
use warnings;          
open my $firstfile, '<', $ARGV[0] or die "$!";         
open my $secondfile, '<', $ARGV[1] or die "$!";            
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a, 
    $total_b,$sign_a,$sign_b);             

while (<$firstfile>) {
    my @col = split /,/;
    $chr_a  = $col[0];
    $start  = $col[1];
    $end    = $col[2];
    $sum_a  = $col[3];
    $total_a = $col[4];
    $sign_a = $col[5];

    seek($secondfile,0,0);
    while (<$secondfile>) {
       my @seccol = split /,/;
       $chr_b     = $seccol[0];
       $begin     = $seccol[1];
       $finish    = $seccol[2];
       $sum_b     = $seccol[3];
       $total_b   = $seccol[4];
       $sign_b    = $seccol[5];

       print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3], 
                         $col[4]+=$seccol[4], $col[5]), 
           "\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
    }
我想把满足以下条件的两行配对:第1列相同,前一行的第3列等于后一行的第2列,并且其中一行的第6列为 '+' 而另一行为 '-'。如果满足这些条件,我想把这两行的第4列和第5列分别相加。

chr1,160,161,7,0.833333333333333,+         
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,24,0.833333333333333,-  
#!/usr/bin/perl             
use strict;      
use warnings;          
open my $firstfile, '<', $ARGV[0] or die "$!";         
open my $secondfile, '<', $ARGV[1] or die "$!";            
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a, 
    $total_b,$sign_a,$sign_b);             

while (<$firstfile>) {
    my @col = split /,/;
    $chr_a  = $col[0];
    $start  = $col[1];
    $end    = $col[2];
    $sum_a  = $col[3];
    $total_a = $col[4];
    $sign_a = $col[5];

    seek($secondfile,0,0);
    while (<$secondfile>) {
       my @seccol = split /,/;
       $chr_b     = $seccol[0];
       $begin     = $seccol[1];
       $finish    = $seccol[2];
       $sum_b     = $seccol[3];
       $total_b   = $seccol[4];
       $sign_b    = $seccol[5];

       print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3], 
                         $col[4]+=$seccol[4], $col[5]), 
           "\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
    }
我期望的结果是

chr1,160,161,7,0.833333333333333,+         
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,24,0.833333333333333,-  
#!/usr/bin/perl             
use strict;      
use warnings;          
open my $firstfile, '<', $ARGV[0] or die "$!";         
open my $secondfile, '<', $ARGV[1] or die "$!";            
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a, 
    $total_b,$sign_a,$sign_b);             

while (<$firstfile>) {
    my @col = split /,/;
    $chr_a  = $col[0];
    $start  = $col[1];
    $end    = $col[2];
    $sum_a  = $col[3];
    $total_a = $col[4];
    $sign_a = $col[5];

    seek($secondfile,0,0);
    while (<$secondfile>) {
       my @seccol = split /,/;
       $chr_b     = $seccol[0];
       $begin     = $seccol[1];
       $finish    = $seccol[2];
       $sum_b     = $seccol[3];
       $total_b   = $seccol[4];
       $sign_b    = $seccol[5];

       print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3], 
                         $col[4]+=$seccol[4], $col[5]), 
           "\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
    }
我能想到的最好的解决方案是先复制一份文件,然后用 Perl 在原文件和副本之间逐行匹配各列:

chr1,160,161,7,0.833333333333333,+         
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,24,0.833333333333333,-  
#!/usr/bin/perl             
use strict;      
use warnings;          
open my $firstfile, '<', $ARGV[0] or die "$!";         
open my $secondfile, '<', $ARGV[1] or die "$!";            
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a, 
    $total_b,$sign_a,$sign_b);             

while (<$firstfile>) {
    my @col = split /,/;
    $chr_a  = $col[0];
    $start  = $col[1];
    $end    = $col[2];
    $sum_a  = $col[3];
    $total_a = $col[4];
    $sign_a = $col[5];

    seek($secondfile,0,0);
    while (<$secondfile>) {
       my @seccol = split /,/;
       $chr_b     = $seccol[0];
       $begin     = $seccol[1];
       $finish    = $seccol[2];
       $sum_b     = $seccol[3];
       $total_b   = $seccol[4];
       $sign_b    = $seccol[5];

       print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3], 
                         $col[4]+=$seccol[4], $col[5]), 
           "\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
    }
#!/usr/bin/perl
use strict;
use warnings;

open my $firstfile, '<', $ARGV[0] or die "$!";
……(此处代码在抓取时被截断)

由于我的评论没有得到回复,下面这个程序将按照您的要求处理您提供的数据。

chr1,160,161,7,0.833333333333333,+         
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,24,0.833333333333333,-  
#!/usr/bin/perl             
use strict;      
use warnings;          
open my $firstfile, '<', $ARGV[0] or die "$!";         
open my $secondfile, '<', $ARGV[1] or die "$!";            
my ($chr_a, $chr_b,$start,$end,$begin,$finish, $sum_a, $sum_b, $total_a, 
    $total_b,$sign_a,$sign_b);             

while (<$firstfile>) {
    my @col = split /,/;
    $chr_a  = $col[0];
    $start  = $col[1];
    $end    = $col[2];
    $sum_a  = $col[3];
    $total_a = $col[4];
    $sign_a = $col[5];

    seek($secondfile,0,0);
    while (<$secondfile>) {
       my @seccol = split /,/;
       $chr_b     = $seccol[0];
       $begin     = $seccol[1];
       $finish    = $seccol[2];
       $sum_b     = $seccol[3];
       $total_b   = $seccol[4];
       $sign_b    = $seccol[5];

       print join ("\t", $col[0], $col[1], $col[2], $col[3]+=$seccol[3], 
                         $col[4]+=$seccol[4], $col[5]), 
           "\n" if ($chr_a eq $chr_b and $end==$begin and $sign_a ne $sign_b);
    }
use strict;
use warnings;

# Merge adjacent '+'/'-' row pairs of a comma-separated file.
#
# Two consecutive rows are merged when they have the same column 1, the
# earlier row's column 3 equals the later row's column 2, and the earlier
# row's column 6 is '+' while the later row's is '-'.  The merged row keeps
# the earlier row's fields, with columns 4 and 5 replaced by the sums over
# both rows.  Every other non-empty row is printed unchanged.  Output goes
# to STDOUT, comma-separated, with trailing whitespace removed.
#
# Input is read from the files named on the command line; when no
# arguments are given the embedded __DATA__ section is used instead.
my $in = @ARGV ? \*ARGV : \*DATA;

# Row held back because it may still pair with the row that follows it.
my @last;

while (my $row = <$in>) {
    $row =~ s/\s+\z//;    # drop the newline and any trailing padding
    my @line = split /,/, $row;

    # Only rows with all six fields can pair.  Checking the field counts
    # first keeps short lines from raising "uninitialized value" warnings
    # in the column-6 comparisons below.
    my $is_pair = @last >= 6
        && @line >= 6
        && $last[0] eq $line[0]
        && $last[2] eq $line[1]    # compared as text, exactly as written
        && $last[5] eq '+'
        && $line[5] eq '-';

    if ($is_pair) {
        $last[3] += $line[3];
        $last[4] += $line[4];
        print join(',', @last), "\n";
        @last = ();    # both rows are consumed; nothing is carried over
    }
    else {
        # No pair: flush the buffered row and hold the current one.
        print join(',', @last), "\n" if @last;
        @last = @line;
    }
}

# Emit the final buffered row, if there is one.
print join(',', @last), "\n" if @last;

__DATA__
chr1,160,161,3,0.333333333333333,+         
chr1,161,162,4,0.5,-      
chr1,309,310,14,0.0714285714285714,+     
chr1,311,312,2,0.5,-     
chr1,499,500,39,0.717948717948718,+     
chr2,500,501,8,0.375,-      
chr2,510,511,18,0.5,+         
chr2,511,512,6,0.333333333333333,-

您是在文件内部完成这一操作的。要在多个文件上运行它,只需在外面套一个循环即可。

也许你可以澄清一下你想问的到底是什么?

我想知道是否有更简单的方法。

第1列的值总是分组在一起吗?第3列与第2列的匹配是否总是出现在连续的两行上,即前一行的第3列与下一行的第2列相匹配?

您应该知道,要用多个文件句柄读取同一份数据,并不需要复制文件:把同一个文件打开两次即可。但恐怕我猜不出你还有什么别的问题。

我的解决方案会把修改后的数据输出到 STDOUT,您可以将其重定向到任何想要的位置。但我建议不要就地覆盖原文件,因为代码中一旦有错误,某次运行就可能把文件破坏得无法恢复。

是的,匹配的行总是连续的,您的解决方案解决了我的问题。