Perl 从模式中查找两个文件之间的最长匹配_Perl_Pattern Matching_Substring_Hashtable

Perl 从模式中查找两个文件之间的最长匹配

perl

Perl 从模式中查找两个文件之间的最长匹配,perl,pattern-matching,substring,hashtable,Perl,Pattern Matching,Substring,Hashtable,我在这个程序中实现两个文件时遇到问题。我正在尝试访问文件$Q和$s的内容 print "Input the K value \n"; $k = <>; chomp $k; print "Input T\n"; $t = <>; chomp $t; %Qkmer = (); $i = 1; $query=' '; while ($line=<IN>) { chomp($line); if ($line=~ m/

我在这个程序中实现两个文件时遇到问题。我正在尝试访问文件

$Q

和

$s

的内容

# Ask for the word length K and the threshold T, one value per input line.
print "Input the K value \n";
chomp($k = <>);

print "Input T\n";
chomp($t = <>);

# %Qkmer : K-letter word => value of the running counter $i at the first
#          time the word was seen.
# $query : concatenation of every row read from IN (it starts with one blank).
# NOTE(review): the IN filehandle is not opened anywhere in this excerpt.
%Qkmer = ();
$i     = 1;
$query = ' ';

while ($line = <IN>) {
    chomp($line);
    next if $line =~ m/^>/;            # rows beginning with '>' are skipped

    $query .= $line;
    $line =~ s/(^|\n)[\n\s]*/$1/g;     # strip whitespace at the start of the row

    # Slide a K-wide window over the row, one character per step.
    while (length($line) >= $k) {
        $line =~ m/(.{$k})/;
        $Qkmer{$1} = $i unless defined $Qkmer{$1};   # only the first occurrence is kept
        $i++;
        $line = substr($line, 1);      # drop the leading character
    }
}

# Second pass: read data.txt row by row, build the table of $k-letter words
# of the row in %Skmer, then try to extend every word shared with %Qkmer
# to the left and to the right against $query.
# NOTE(review): the result of open is not checked, so a missing data.txt
# is not reported here.
open(MYDATA, '<', "data.txt");

# NOTE(review): the backslash after the opening brace below looks like a
# stray character.
while ($line=<MYDATA>) { \
  chomp($line);
  %Skmer = ();           # reset the word table for every row that is read
  $j = 1;                # window counter for this row, starting at 1

  if ($line=~ m/^>/ ) { # rows that start with '>' ...
    next; # ... are skipped
  }
  $line=~ s/^\s+|\s+$//g ; # trim leading and trailing whitespace of the row
  while (length($line) >= $k) {
    $line =~ m/(.{$k})/;# capture the first $k characters of what is left of the row
    $Skmer{$1} = $j; # record the window counter for this word (a repeated word keeps its last counter)
    $j++;
    $line = substr($line, 1, length($line) -1); # drop the first character, i.e. slide the window by one
  }

  # NOTE(review): the loop header on the next line is disabled by its leading
  # '#'; the (56) tag presumably marks line 56 of the reported syntax error.
  # Perl's foreach form puts the loop variable outside the parentheses:
  #   for $Skmerkey (keys %Skmer) {
  # NOTE(review): hash elements are subscripted with braces, so the two
  # $Qkmer($Skmerkey) expressions below need to be $Qkmer{$Skmerkey}.
  ###(56)###for($Skmerkey(keys %Skmer)){
    $i=$Skmer{$Skmerkey};
    if(defined $Qkmer($Skmerkey)){
      $j=$Qkmer($Skmerkey);
      }
      # NOTE(review): the window loop above has already shortened $line to
      # fewer than $k characters, so $S1 no longer holds the whole row.
      $S1=$line;
      $S2=$query;
      @arrayS1= split(//, $S1);
      # NOTE(review): this fills @array2, but the comparisons below read
      # @arrayS2, which is never assigned in this excerpt.
      @array2= split(//, $S2);

      # Extend to the left: $l counts equal characters walking backwards
      # from $i in S1 and from $j in S2.
      # NOTE(review): there is no bounds check; out-of-range elements are
      # undef on both sides and undef eq undef is true, so the loop may not
      # terminate.
      $l=0;
      while($arrayS1[$i-$l] eq $arrayS2[$j-$l]){
        $l++;
      }
      $start=$i-$l;
      # Extend to the right: $m counts equal characters after the $k-letter
      # word (same missing bounds check as above).
      $m=0;
      while ($arrayS1[$i+$k+$m] eq $arrayS2[$j+$k+$m]) {
        $m++;
      }
      $length=$l+$k+$m;
      $match= substr($S1, $start, $length);

      # Matches are printed only when they are longer than the threshold $t.
      if($length>$t){
        $longest=length($match);
        print "Longest: $match of length $longest \n";
      }
  }

}###(83)###

文件2:

ggujvfbgfgkjfcijjjffcvvafcsghnvzfhgvugxckugcbhfcgh
ghnvzfhgvugxckHhfgjgcfujvftjbvdtkhvddgjcdgjxdjkfrh
ajdbvciyqdanvkjghnvzfhgvugxc

对于文件1中长度为

$k

的每个单词，我先在文件2中找到它的匹配，然后从该匹配处向左、向右检查，看能否进一步延伸匹配。最终输出是基于

$k

的文件1和文件2之间的最长匹配。现在我知道了

有了这段代码，我得到了一个语法错误，我不知道为什么，因为在我看来它是正确的：

syntax error at testk.pl line 56, near "$Skmerkey("
syntax error at testk.pl line 83, near "}"

谢谢。

使用strict；#/；
use strict;         # <--- Always use this
use warnings;       # <--- and this
use Data::Dumper;   # used only by the commented-out debug dump below

# Finds common substrings between the rows of "File2" (the Q-file) and the
# rows of "File1" (the S-file).
#
# Pass 1 indexes every $k-letter word of the Q-file by row and position.
# Pass 2 slides a one-character-step window over each S-file row; every
# indexed word that is hit is extended to the right for as long as both rows
# agree, and the extended match is printed.  Because the scan moves left to
# right one character at a time, a match is always first seen at its
# leftmost position, so no extension to the left is needed.

my $k=3;     # length of the seed "word"

open(my $IN, '<', "File2")   # lexical handle $IN instead of the bareword IN
  or die "Cannot open File2: $!";
my $line=0;  # index of the current stored row (skipped rows are not counted)
my %kmer;    # word => list of [row, position] pairs where it occurs in Q
my @Q;       # stored rows of the Q-file
while(<$IN>) {
  chomp;
  next if /^>/;        # rows starting with '>' are skipped
  s/^\s+|\s+$//g;      # trim leading/trailing whitespace
  next if !length;     # skip empty rows; a row that is just "0" is kept
  my $pos=0;
  push @Q, $_;         # store source row
  for my $word (/(?=(.{$k}))/g) {  # look-ahead capture: every $k-letter window, step 1
    push @{$kmer{$word}}, [$line,$pos];  # store row number and position of the word
    $pos++;
  }
  $line++;
}
close($IN);

open($IN, '<', "File1")
  or die "Cannot open File1: $!";
$line=0;
while(<$IN>) {   # Read S-file
  chomp;
  next if /^>/;
  s/^\s+|\s+$//g;
  next if !length;
  my $pos=0;           # position of the current window in the S-file row
  my $len=length($_);  # length of the S-file row
  my $s=$_;            # current row of the S-file
  my @ignore=();       # [Q row, Q position, remaining length] of match tails already reported
  for my $word (/(?=(.{$k}))/g) {
    next if ! $kmer{$word};          # word does not occur in Q, try the next window
    for my $hit (@{$kmer{$word}}) {  # every [row, position] of this word in Q
      my($qline, $qpos)=@{$hit};
#      print "test $qline:$qpos ";
      # This Q position is the tail of a match that was already reported.
      next if grep {$_->[0]==$qline && $_->[1]==$qpos } @ignore;
#        print "Ignore match tail $qline:$qpos\n";
      my $j=$k;  # the first $k letters are equal, compare after this point
      my $qlen=length($Q[$qline]);
      $j++ while( $pos+$j<$len && $qpos+$j<$qlen &&
                  substr($s,$pos+$j,1) eq substr($Q[$qline],$qpos+$j,1) );
      print "MATCH FOUND: S-file line $line pos $pos, Q-file line $qline pos $qpos: ",
            substr($s,$pos,$j),"\n";
      push @ignore, [$qline, $qpos, $j];  # remember position and length of the match
    }
  } continue {  # runs after every iteration, including those ended by "next"
   $pos++;
   # Moving one character to the right shifts every remembered tail by one
   # and shortens it by one; tails shorter than $k can no longer be hit.
   @ignore=grep {
                  ++$_->[1];  # increment position
                  (--$_->[2]) # decrement length
                   >= $k      # and filter out lengths < $k
                } @ignore;
#   print Dumper(\@ignore);
  }
  $line++;
}
close($IN);

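  s/^\s+|\s+$//g;
  next if !$_;
  my $pos=0;
  my $len=length($_);  # length of row of S-file
  my $s=$_;            # Current row of S-file
  my @ignore=();       # array for store information about match tails
  for(/(?=(.{$k}))/g) {
    next if ! $kmer{$_};  # "word" not found try to next
    for(@{$kmer{$_}}) {   # $kmer{word} contains array of lines/positions in Q
      my($qline, $qpos)=@{$_};
#      print "test $qline:$qpos ";
      if( grep {$_->[0]==$qline && $_->[1]==$qpos } @ignore ) {
      # this line/position already tested and included in found matching
#        print "Ignore match tail $qline:$qpos\n";
        next;
      }
      my $j=$k;  # $k letters same, test after this point
      my $qlen=length($Q[$qline]);
      $j++ while( $pos+$j<$len && $qpos+$j<$qlen &&
                  substr($s,$pos+$j,1) eq substr($Q[$qline],$qpos+$j,1) );
      print "MATCH FOUND: S-file line $line pos $pos, Q-file line $qline pos $qpos: ",
            substr($s,$pos,$j),"\n";
      push @ignore, [$qline, $qpos, $j];  # store positions and length of match
    }
  } continue {  # Continue block works on all loops, include after "next"
   $pos++;
   @ignore=grep { # recalculate/filter position and length of all match tails
                  ++$_->[1];  # increment position
                  (--$_->[2]) # decrement length
                   >= $k      # and filter out lengths < $k
                } @ignore;
#   print Dumper(\@ignore);
  }
  $line++;
}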

$k

已使用，但未定义。请加上 use warnings; use strict;。哈希

%kmer

始终为空！！！是的，我只是把代码从编辑器里直接贴到这里，所以必须手动添加空格，才能把所有内容放进同一个代码块。在我的实际程序中，被注释掉的区域并没有注释掉。我用井号（#）标出了我特别遇到问题的区域，这部分可能需要换一种方式实现。当以 1 个字符为步长进行扫描时，双向搜索没有意义。如果把第二个文件的扫描步长增加到 $k，才用得上这种方法；但在这种情况下，只能保证找到长度至少为 $k*2-1 的所有匹配项。

use strict;         # <--- Always use this
use warnings;       # <--- and this
use Data::Dumper;   # used only by the commented-out debug dump below

# Finds common substrings between the rows of "File2" (the Q-file) and the
# rows of "File1" (the S-file).
#
# Pass 1 indexes every $k-letter word of the Q-file by row and position.
# Pass 2 slides a one-character-step window over each S-file row; every
# indexed word that is hit is extended to the right for as long as both rows
# agree, and the extended match is printed.  Because the scan moves left to
# right one character at a time, a match is always first seen at its
# leftmost position, so no extension to the left is needed.

my $k=3;     # length of the seed "word"

open(my $IN, '<', "File2")   # lexical handle $IN instead of the bareword IN
  or die "Cannot open File2: $!";
my $line=0;  # index of the current stored row (skipped rows are not counted)
my %kmer;    # word => list of [row, position] pairs where it occurs in Q
my @Q;       # stored rows of the Q-file
while(<$IN>) {
  chomp;
  next if /^>/;        # rows starting with '>' are skipped
  s/^\s+|\s+$//g;      # trim leading/trailing whitespace
  next if !length;     # skip empty rows; a row that is just "0" is kept
  my $pos=0;
  push @Q, $_;         # store source row
  for my $word (/(?=(.{$k}))/g) {  # look-ahead capture: every $k-letter window, step 1
    push @{$kmer{$word}}, [$line,$pos];  # store row number and position of the word
    $pos++;
  }
  $line++;
}
close($IN);

open($IN, '<', "File1")
  or die "Cannot open File1: $!";
$line=0;
while(<$IN>) {   # Read S-file
  chomp;
  next if /^>/;
  s/^\s+|\s+$//g;
  next if !length;
  my $pos=0;           # position of the current window in the S-file row
  my $len=length($_);  # length of the S-file row
  my $s=$_;            # current row of the S-file
  my @ignore=();       # [Q row, Q position, remaining length] of match tails already reported
  for my $word (/(?=(.{$k}))/g) {
    next if ! $kmer{$word};          # word does not occur in Q, try the next window
    for my $hit (@{$kmer{$word}}) {  # every [row, position] of this word in Q
      my($qline, $qpos)=@{$hit};
#      print "test $qline:$qpos ";
      # This Q position is the tail of a match that was already reported.
      next if grep {$_->[0]==$qline && $_->[1]==$qpos } @ignore;
#        print "Ignore match tail $qline:$qpos\n";
      my $j=$k;  # the first $k letters are equal, compare after this point
      my $qlen=length($Q[$qline]);
      $j++ while( $pos+$j<$len && $qpos+$j<$qlen &&
                  substr($s,$pos+$j,1) eq substr($Q[$qline],$qpos+$j,1) );
      print "MATCH FOUND: S-file line $line pos $pos, Q-file line $qline pos $qpos: ",
            substr($s,$pos,$j),"\n";
      push @ignore, [$qline, $qpos, $j];  # remember position and length of the match
    }
  } continue {  # runs after every iteration, including those ended by "next"
   $pos++;
   # Moving one character to the right shifts every remembered tail by one
   # and shortens it by one; tails shorter than $k can no longer be hit.
   @ignore=grep {
                  ++$_->[1];  # increment position
                  (--$_->[2]) # decrement length
                   >= $k      # and filter out lengths < $k
                } @ignore;
#   print Dumper(\@ignore);
  }
  $line++;
}
close($IN);