Awk 比较两个文件的列

Awk 比较两个文件的列,awk,Awk,我有两个文件,并试图在列的基础上比较这些文件 文件1 CALL_3 CALL_1 CALL_2 CALL_5 CALL_3 CALL_2 CALL_1 CALL_4 文件2 CALL_1 GAP:A GAP:G CALL_3 GAP:C GAP:Q GAP:R CALL_5 GAP:R GAP:A CALL_4 GAP:C GAP:D GAP:A GAP:W CALL_2 GAP:C GAP:R GAP:A 我只想打印文件_1中的那些交互,它们

我有两个文件,并试图在列的基础上比较这些文件

文件1

CALL_3  CALL_1
CALL_2  CALL_5
CALL_3  CALL_2
CALL_1  CALL_4
文件2

CALL_1   GAP:A  GAP:G
CALL_3   GAP:C  GAP:Q  GAP:R 
CALL_5   GAP:R  GAP:A
CALL_4   GAP:C  GAP:D  GAP:A  GAP:W
CALL_2   GAP:C  GAP:R  GAP:A
我只想打印文件1中的那些交互(行):该行的两个 CALL 在文件2中至少有一个共同(common)的 GAP id

预期输出

CALL_2  CALL_5  GAP:A GAP:R
CALL_3  CALL_2  GAP:C GAP:R
CALL_1  CALL_4  GAP:A
我尝试了以下方法:

# Attempt from the question: buffer File_2 keyed on its first column, then
# print the buffered values for both columns of each File_1 row.
awk 'NR==FNR {
# First file (File_2): store a fixed set of nine fields under the key in $1.
a[$1]=($1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9)
next 
}
# Second file (File_1): only rows whose two columns are both keys of a.
($1 in a)&&($2 in a) {
print a[$1],a[$2]
}' File_2 File_1

它适用于固定数量的列。但文件2中的列数不是固定的(超过1000列)。如何获得预期的输出?

我是在bash中用coreutils实现的。一行:

# Join file_1 with file_2 on column 1 of file_1, re-sort on column 2 and join
# with file_2 again, so each line carries two CALL ids followed by both GAP lists.
# Per line, the GAP ids that occur more than once (sort | uniq -d) are collected;
# the line is printed with them only when that list is non-empty.
join -12 -21 <(join -11 -21 <(sort file_1) <(sort file_2) | sort -k2) <(sort file_2) | xargs -l1 bash -c 'a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " "); if [ -n "$a" ]; then printf "%s %s %s\n" "$1" "$2" "$a"; fi' --

对于awk,这很简单:

# For every pair in file1, print the pair followed by the GAP ids that both of
# its CALLs have in file2; pairs without a shared GAP id print nothing.
$ awk '(NR==FNR){$1=$1;a[$1]=$0;next}    # file2: a[key] = whole line, rebuilt with OFS separators
       {str=strt=$1 OFS $2}               # the pair itself; strt keeps an unmodified copy
       {split(a[$1],b,OFS)}               # fields of the line buffered for $1
       # append each field of $1 that also occurs as a whole field in the line of $2
       {for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
       (str!=strt){print str}' file2 file1
这是如何工作的:

  • (NR==FNR){$1=$1;a[$1]=$0;next}

    第一行把 file2 缓冲到关联数组 a[key]=value 中,其中 key 是第一个字段,value 是整行。例如

    a["CALL_1"]="CALL_1 GAP:A GAP:G"
    
    注意,我们使用 $1=$1 重建记录,把所有字段分隔符(FS)替换为 OFS

  • {str=strt=$1 OFS $2}

    这只是将 CALL_1 CALL_2 这样的一对(即 $1 OFS $2)同时存储在变量 str 和 strt 中

  • {split(a[$1],b,OFS)}
    :将缓冲线拆分为数组
    b

  • {for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) …}

    对于数组 b 中的每个条目,检查字符串 a[$2] OFS 中是否包含字符串 OFS b[i] OFS。我们在两端额外加上 OFS,以确保匹配的是完整字段。这样也会测试像 OFS CALL_2 OFS 这样的值,但它永远不会匹配。这只是很小的开销,而修复它反而会带来更多开销

  • 更优化的版本如下:

    # Optimized variant: split the shorter GAP list and search it in the longer one.
    $ awk '(NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next}   # a[k] = GAP list with a leading OFS, c[k] = number of GAP ids
           {str=strt=$1 OFS $2}
           (c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS}
           (c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS}
           # s starts and ends with OFS, so OFS b[i] OFS can only match a whole field
           {for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]}
           (str!=strt){print str}' file2 file1
    
    $awk'(NR==FNR){k=$1;$1=“”;a[k]=$1;c[k]=NF-1;next}
    {str=strt=$1/s$2}
    (c[$1]=c[$2]){split(substr(a[$2],2),b,OFS);s=a[$1]OFS}
    {对于(b中的i)if(索引(s,ofsb[i]OFS))str=strofsa[$2]}
    (str!=strt){print str}文件2文件1
    
    请尝试以下内容

    ## Print every line of Input_file1 whose two keys share at least one value
    ## in Input_file2, followed by those shared values.
    awk '
    ## First file (Input_file2): store the line without its first field under that field.
    FNR==NR{
      val=$1
      $1=""
      $0=$0
      $1=$1
      a[val]=$0
      next
    }
    ## Second file (Input_file1): build one set per key and intersect them.
    {
      val=""
      num1=split(a[$1],array1," ")
      for(i=1;i<=num1;i++){
        array3[array1[i]]
      }
      num2=split(a[$2],array2," ")
      for(i=1;i<=num2;i++){
        array4[array2[i]]
      }
      for(k in array3){
        if(k in array4){
          val=(val?val OFS:"")k
        }
      }
      if(val){
        print $0,val
      }
      val=""
      delete array1
      delete array2
      delete array3
      delete array4
    }
    '  Input_file2   Input_file1
    


    解释:添加上述代码的详细解释

    awk '                                  ##Start of the awk program.
    FNR==NR{                               ##True only while the first file on the command line (Input_file2) is being read.
      val=$1                               ##Remember the key (first field) of the current line.
      $1=""                                ##Blank out the first field; $0 is rebuilt and now starts with a separator.
      $0=$0                                ##Re-assign the record to itself so awk re-splits it and the empty first field disappears.
      $1=$1                                ##Touch a field so $0 is rebuilt with single OFS separators and no leading blank.
      a[val]=$0                            ##Store the remaining fields of the line in array a under the key.
      next                                 ##Skip the rest of the program for lines of the first file.
    }
    {
      val=""                               ##Start with an empty result for this line of the second file.
      num1=split(a[$1],array1," ")         ##Split the list stored for $1 into array1; num1 is the number of elements.
      for(i=1;i<=num1;i++){                ##Loop over every element of array1.
        array3[array1[i]]                  ##Referencing array3[...] creates that index, so array3 becomes the set of values of $1.
      }
      num2=split(a[$2],array2," ")         ##Split the list stored for $2 into array2; num2 is the number of elements.
      for(i=1;i<=num2;i++){                ##Loop over every element of array2.
        array4[array2[i]]                  ##Likewise, array4 becomes the set of values of $2.
      }
      for(k in array3){                    ##Walk over the values of $1.
        if(k in array4){                   ##Keep only values that are also present for $2.
          val=(val?val OFS:"")k            ##Append k to val, inserting OFS once val is non-empty.
        }
      }
      if(val){                             ##Only when at least one common value was found.
        print $0,val                       ##Print the current line followed by the common values.
      }
      val=""                               ##Reset val.
      delete array1                        ##Clear array1 before the next line.
      delete array2                        ##Clear array2 before the next line.
      delete array3                        ##Clear array3 before the next line.
      delete array4                        ##Clear array4 before the next line.
    }
    '  Input_file2  Input_file1            ##Input files: Input_file2 is read first, then Input_file1.
    
    awk'##在这里启动awk程序。
    FNR==NR{{##检查条件FNR==NR,该条件对于正在读取的第一个输入文件为真。
    val=$1##创建一个名为val的变量,其值为当前行的$1。
    $1=”“##在此处取消$1。
    $0=$0##将当前行的值重新分配给自身,以便删除初始空间。
    $1=$1##将当前行的值重新分配给自身,以便删除初始空间。
    a[val]=$0##创建一个名为a的数组,其索引为val,值为$0。
    next##next将跳过此处的所有进一步语句。
    }
    {
    val=”“##在此处使变量val无效。
    num1=拆分(a[$1],数组1,“”)##将索引为$1的数组a拆分为数组1,其总数以num1表示。
    
    for(i=1;i<=num1;i++){ ……(此处原文被截断,其余部分与上面带英文注释的代码相同)

    使用 GNU awk 的“数组的数组”(arrays of arrays):

    $ cat tst.awk
    # Requires GNU awk: gaps[key][gap] is an array of arrays.
    NR==FNR {
        # First file (file2): record every field after the first under the key $1.
        for (i=2; i<=NF; i++) {
            gaps[$1][$i]
        }
        next
    }
    {
        # Second file (file1): collect the gaps present for both $1 and $2,
        # each one prefixed with OFS.
        common = ""
        for (gap in gaps[$1]) {
            if (gap in gaps[$2]) {
                common = common OFS gap
            }
        }
        # Print the line plus its shared gaps; lines without any are skipped.
        if ( common != "" ) {
            print $0 common
        }
    }
    
    $ awk -f tst.awk file2 file1
    CALL_2  CALL_5 GAP:A GAP:R
    CALL_3  CALL_2 GAP:C GAP:R
    CALL_1  CALL_4 GAP:A
    

    那么 CALL_1 CALL_4 呢?它们都有 GAP:A。 @KamilCuk 谢谢,我已更正了帖子。 @RaviSaroch,给这个清晰的好问题点赞(++ve),请继续保持。 它会打印整个文件。
    CALL_2  CALL_5 GAP:A GAP:R
    CALL_3  CALL_2 GAP:C GAP:R
    CALL_1  CALL_4 GAP:A
    
    awk '                                  ##Start of the awk program.
    FNR==NR{                               ##True only while the first file on the command line (Input_file2) is being read.
      val=$1                               ##Remember the key (first field) of the current line.
      $1=""                                ##Blank out the first field; $0 is rebuilt and now starts with a separator.
      $0=$0                                ##Re-assign the record to itself so awk re-splits it and the empty first field disappears.
      $1=$1                                ##Touch a field so $0 is rebuilt with single OFS separators and no leading blank.
      a[val]=$0                            ##Store the remaining fields of the line in array a under the key.
      next                                 ##Skip the rest of the program for lines of the first file.
    }
    {
      val=""                               ##Start with an empty result for this line of the second file.
      num1=split(a[$1],array1," ")         ##Split the list stored for $1 into array1; num1 is the number of elements.
      for(i=1;i<=num1;i++){                ##Loop over every element of array1.
        array3[array1[i]]                  ##Referencing array3[...] creates that index, so array3 becomes the set of values of $1.
      }
      num2=split(a[$2],array2," ")         ##Split the list stored for $2 into array2; num2 is the number of elements.
      for(i=1;i<=num2;i++){                ##Loop over every element of array2.
        array4[array2[i]]                  ##Likewise, array4 becomes the set of values of $2.
      }
      for(k in array3){                    ##Walk over the values of $1.
        if(k in array4){                   ##Keep only values that are also present for $2.
          val=(val?val OFS:"")k            ##Append k to val, inserting OFS once val is non-empty.
        }
      }
      if(val){                             ##Only when at least one common value was found.
        print $0,val                       ##Print the current line followed by the common values.
      }
      val=""                               ##Reset val.
      delete array1                        ##Clear array1 before the next line.
      delete array2                        ##Clear array2 before the next line.
      delete array3                        ##Clear array3 before the next line.
      delete array4                        ##Clear array4 before the next line.
    }
    '  Input_file2  Input_file1            ##Input files: Input_file2 is read first, then Input_file1.
    
    $ cat tst.awk
    # Requires GNU awk: gaps[key][gap] is an array of arrays.
    NR==FNR {
        # First file (file2): record every field after the first under the key $1.
        for (i=2; i<=NF; i++) {
            gaps[$1][$i]
        }
        next
    }
    {
        # Second file (file1): collect the gaps present for both $1 and $2,
        # each one prefixed with OFS.
        common = ""
        for (gap in gaps[$1]) {
            if (gap in gaps[$2]) {
                common = common OFS gap
            }
        }
        # Print the line plus its shared gaps; lines without any are skipped.
        if ( common != "" ) {
            print $0 common
        }
    }
    
    $ awk -f tst.awk file2 file1
    CALL_2  CALL_5 GAP:A GAP:R
    CALL_3  CALL_2 GAP:C GAP:R
    CALL_1  CALL_4 GAP:A
    
    $ cat tst.awk
    NR==FNR {
        # First file (file2): keep the key, remove the first word and the
        # whitespace after it from the record, and store what is left.
        key = $1
        sub(/[^[:space:]]+[[:space:]]+/,"")
        gaps[key] = $0
        next
    }
    {
        # Second file (file1): turn both stored lists into sets, then collect
        # the entries present in both, each one prefixed with OFS.
        mkSet(gaps[$1],gaps1)
        mkSet(gaps[$2],gaps2)
        common = ""
        for (gap in gaps1) {
            if (gap in gaps2) {
                common = common OFS gap
            }
        }
        # Print the line plus its shared gaps; lines without any are skipped.
        if ( common != "" ) {
            print $0 common
        }
    }
    # mkSet(str, arr): empty arr, then make every word of str (split with the
    # default field separator) an index of arr. i and tmp are local variables.
    function mkSet(str,arr, i,tmp) {
        delete arr
        split(str,tmp)
        for (i in tmp) {
            arr[tmp[i]]
        }
    }
    
    $ awk -f tst.awk file2 file1
    CALL_2  CALL_5 GAP:A GAP:R
    CALL_3  CALL_2 GAP:C GAP:R
    CALL_1  CALL_4 GAP:A