Awk 比较文件和打印类_Awk - Fatal编程技术网

Awk 比较文件和打印类

awk

Awk 比较文件和打印类,awk,Awk,我有文件1：文件2 id class position1 position2 a1 Xfact 1 40 a1 Xred 41 66 a1 xbreak 69 89 b1 Xbreak 77 133 b1 Xred 140 199 c1 Xfact 1 15 c1 Xbreak 19 35 我想要这样的东西输出：我需

我有文件1：

文件2

id  class  position1 position2
a1  Xfact   1           40
a1  Xred    41          66
a1  xbreak  69          89
b1  Xbreak  77          133
b1  Xred    140         199
c1  Xfact   1           15
c1  Xbreak  19          35

我想要这样的东西输出：

我需要一个简单的awk脚本：它从文件1中读取id和位置，并将该位置与文件2中的位置进行比较。如果文件1中的位置落在文件2中position1和position2的范围内，则打印相应的类。

下面是使用awk的一种方法。这不是一个简单的脚本。过程简述如下：关键在于变量'all_ranges'。当它未置位（为0）时，脚本从范围文件中读取并保存数据；当它置位时，停止读取范围文件，转而从“id 位置”文件读取，在数组数据中检查该位置，若落在某个范围内则打印。我尽量避免多次处理范围文件，而是分块处理，这使脚本变得更加复杂。

补充说明：我假设两个文件中的 id 字段都已排序。否则，这个脚本将完全失效，您需要换一种方法。

script.awk 的内容：

## Usage: awk -f script.awk <positions_file> <ranges_file>
##
## Appends to every "id position" line of the positions file the class of the
## range (ranges file: "id class position1 position2") that contains the
## position. Lines whose position falls in no range of their id print nothing.
## Both files start with a header line and are assumed to be sorted by id, so
## the ranges file is read only once, one id-chunk at a time.

BEGIN {
    ## Arguments:
    ## ARGV[0] = awk
    ## ARGV[1] = <first_input_argument>
    ## ARGV[2] = <second_input_argument>
    ## ARGC = 3
    if ( ARGC < 2 ) {
        printf( "%s\n", "ERROR: Missing file with ranges" ) > "/dev/stderr";
        exit 1;
    }

    ## Keep the last argument for ourselves; decrementing ARGC stops awk from
    ## processing it as a regular input file.
    f2 = ARGV[ --ARGC ];

    ## 0: keep reading lines from the ranges file.
    ## 1: reading is paused, either because 'fields' holds a look-ahead line
    ##    that belongs to another id, or because the file is exhausted.
    all_ranges = 0

    ## Read first line from file with ranges to get 'class' header.
    ret = ( getline line < f2 )
    if ( ret == -1 ) {
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr";
        exit 1;
    }
    split( line, fields )
    class_header = fields[2];
}

## Special case for the header.
FNR == 1 {
    printf "%s\t%s\n", $0, class_header;
    next;
}

## Data.
FNR > 1 {

    ## Load the chunk of ranges for this id unless it is already in memory.
    if ( ! ( have_ranges && loaded_id == $1 ) ) {

        while ( 1 ) {

            if ( ! all_ranges ) {

                ## Read line from file with range positions.
                ret = ( getline line < f2 )

                ## Check error.
                if ( ret == -1 ) {
                    printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr";
                    exit 1;
                }

                ## Check end of file. Do not skip the lookup below: the
                ## ranges collected so far are still valid for this record.
                if ( ret == 0 ) {
                    ranges_eof = 1;
                    all_ranges = 1;
                    break;
                }

                ## Split line in spaces.
                num = split( line, fields )
                if ( num != 4 ) {
                    printf( "%s\n", "ERROR: Bad format of file " f2 ) > "/dev/stderr";
                    exit 2;
                }

                ## 'fields' is now a look-ahead line until it is consumed.
                range_id = fields[1];
                all_ranges = 1;
            }

            ## The look-ahead line belongs to another id (or there is none):
            ## leave it for a later record.
            if ( ranges_eof || range_id != $1 ) {
                break;
            }

            ## First range of a new id: forget the previous id's ranges.
            if ( ! ( have_ranges && loaded_id == $1 ) ) {
                delete ranges;
                loaded_id = $1;
                have_ranges = 1;
            }

            ## Consume the look-ahead line and resume reading.
            ranges[ fields[3], fields[4] ] = fields[2];
            all_ranges = 0;
        }
    }

    ## Only search ranges that really belong to this id; ranges left over
    ## from a previous id must not produce a match.
    if ( have_ranges && loaded_id == $1 ) {
        for ( range in ranges ) {
            split( range, pos, SUBSEP )
            if ( $2 >= pos[1] && $2 <= pos[2] ) {
                printf "%s\t%s\n", $0, ranges[ range ];
                break;
            }
        }
    }
}

## Every record is fully handled in the data rule, so nothing is printed
## here; only release the ranges file.
END {
    if ( f2 != "" ) {
        close( f2 );
    }
}

结果如下：

id  position  class
a1  21        Xfact
a1  39        Xfact
a1  77        xbreak
b1  88        Xbreak
b1  122       Xbreak
c1  22        Xbreak

这是家庭作业吗？看起来很像。

## Usage: awk -f script.awk <positions_file> <ranges_file>
##
## Appends to every "id position" line of the positions file the class of the
## range (ranges file: "id class position1 position2") that contains the
## position. Lines whose position falls in no range of their id print nothing.
## Both files start with a header line and are assumed to be sorted by id, so
## the ranges file is read only once, one id-chunk at a time.

BEGIN {
    ## Arguments:
    ## ARGV[0] = awk
    ## ARGV[1] = <first_input_argument>
    ## ARGV[2] = <second_input_argument>
    ## ARGC = 3
    if ( ARGC < 2 ) {
        printf( "%s\n", "ERROR: Missing file with ranges" ) > "/dev/stderr";
        exit 1;
    }

    ## Keep the last argument for ourselves; decrementing ARGC stops awk from
    ## processing it as a regular input file.
    f2 = ARGV[ --ARGC ];

    ## 0: keep reading lines from the ranges file.
    ## 1: reading is paused, either because 'fields' holds a look-ahead line
    ##    that belongs to another id, or because the file is exhausted.
    all_ranges = 0

    ## Read first line from file with ranges to get 'class' header.
    ret = ( getline line < f2 )
    if ( ret == -1 ) {
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr";
        exit 1;
    }
    split( line, fields )
    class_header = fields[2];
}

## Special case for the header.
FNR == 1 {
    printf "%s\t%s\n", $0, class_header;
    next;
}

## Data.
FNR > 1 {

    ## Load the chunk of ranges for this id unless it is already in memory.
    if ( ! ( have_ranges && loaded_id == $1 ) ) {

        while ( 1 ) {

            if ( ! all_ranges ) {

                ## Read line from file with range positions.
                ret = ( getline line < f2 )

                ## Check error.
                if ( ret == -1 ) {
                    printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr";
                    exit 1;
                }

                ## Check end of file. Do not skip the lookup below: the
                ## ranges collected so far are still valid for this record.
                if ( ret == 0 ) {
                    ranges_eof = 1;
                    all_ranges = 1;
                    break;
                }

                ## Split line in spaces.
                num = split( line, fields )
                if ( num != 4 ) {
                    printf( "%s\n", "ERROR: Bad format of file " f2 ) > "/dev/stderr";
                    exit 2;
                }

                ## 'fields' is now a look-ahead line until it is consumed.
                range_id = fields[1];
                all_ranges = 1;
            }

            ## The look-ahead line belongs to another id (or there is none):
            ## leave it for a later record.
            if ( ranges_eof || range_id != $1 ) {
                break;
            }

            ## First range of a new id: forget the previous id's ranges.
            if ( ! ( have_ranges && loaded_id == $1 ) ) {
                delete ranges;
                loaded_id = $1;
                have_ranges = 1;
            }

            ## Consume the look-ahead line and resume reading.
            ranges[ fields[3], fields[4] ] = fields[2];
            all_ranges = 0;
        }
    }

    ## Only search ranges that really belong to this id; ranges left over
    ## from a previous id must not produce a match.
    if ( have_ranges && loaded_id == $1 ) {
        for ( range in ranges ) {
            split( range, pos, SUBSEP )
            if ( $2 >= pos[1] && $2 <= pos[2] ) {
                printf "%s\t%s\n", $0, ranges[ range ];
                break;
            }
        }
    }
}

## Every record is fully handled in the data rule, so nothing is printed
## here; only release the ranges file.
END {
    if ( f2 != "" ) {
        close( f2 );
    }
}

awk -f script.awk file1 file2 | column -t

id  position  class
a1  21        Xfact
a1  39        Xfact
a1  77        xbreak
b1  88        Xbreak
b1  122       Xbreak
c1  22        Xbreak