Awk 比较文件和打印类
我有 文件1: 文件2Awk 比较文件和打印类,awk,Awk,我有 文件1: 文件2 id class position1 position2 a1 Xfact 1 40 a1 Xred 41 66 a1 xbreak 69 89 b1 Xbreak 77 133 b1 Xred 140 199 c1 Xfact 1 15 c1 Xbreak 19 35 我想要这样的东西 输出: 我需
id class position1 position2
a1 Xfact 1 40
a1 Xred 41 66
a1 xbreak 69 89
b1 Xbreak 77 133
b1 Xred 140 199
c1 Xfact 1 15
c1 Xbreak 19 35
我想要这样的东西
输出:
我需要一个简单的 awk 脚本:打印文件1中的 id 和位置,并把文件1中的位置与文件2中的位置进行比较;如果文件1中的位置落在文件2的 position1 和 position2 所确定的范围之内,就打印相应的类。
下面是使用 awk 的一种做法。这不是一个简单的脚本。简要说明其过程:关键在于变量 'all_ranges'。当它被重置(为 0)时,脚本从范围文件中读取数据并保存;当它被置位(为 1)时,停止读取范围文件,转而读取“id 位置”文件,在数组中查找该位置,若落在某个范围内就打印对应的类。我尽量避免多次处理整个范围文件,而是分块读取,这也使脚本变得更加复杂。
补充说明:我假设两个文件中的 id 字段都已排序。否则,这个脚本将彻底失效,您需要换一种方法。
script.awk 的内容:
BEGIN {
    ## Arguments:
    ##   ARGV[0] = awk
    ##   ARGV[1] = <first_input_argument>
    ##   ARGV[2] = <second_input_argument>
    ##   ARGC    = 3
    ## Take the last argument out of the input list: that file (the one
    ## with the ranges) is read by hand with getline, so only the remaining
    ## input is processed by the pattern rules.
    f2 = ARGV[ --ARGC ];
    all_ranges = 0

    ## Read first line from file with ranges to get 'class' header.
    ret = ( getline line < f2 )
    if ( ret == -1 ) {
        ## Fail before any output is produced instead of printing a header
        ## with an empty class column and detecting the problem later.
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr"
        exit 1
    }
    split( line, fields )
    class_header = fields[2];
}
## First line of the main input (its header): echo it followed by a tab
## and the value stored in 'class_header', then skip the remaining rules.
FNR == 1 {
    header_out = $0 "\t" class_header
    printf "%s\n", header_out
    next
}
## Read the next line of the ranges file 'f2' into the global array
## 'fields' and store its id in 'range_id'.
## Returns 1 when a line was read and 0 at end of file. Terminates the
## program with status 1 on a read error and with status 2 when the line
## does not have exactly four fields.
function read_range(    ret, num) {
    ret = ( getline line < f2 )

    ## Check error.
    if ( ret == -1 ) {
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr"
        close( f2 )
        exit 1
    }

    ## Check end of file.
    if ( ret == 0 ) {
        return 0
    }

    ## Split line in spaces.
    num = split( line, fields )
    if ( num != 4 ) {
        printf( "%s\n", "ERROR: Bad format of file " f2 ) > "/dev/stderr"
        exit 2
    }

    range_id = fields[1]
    return 1
}

## Data lines of the main input ($1 = id, $2 = position).
##
## The ranges file is consumed one id group at a time. One line of it is
## always held as a look-ahead ('have_pending' is set and 'fields' /
## 'range_id' describe it), because the end of a group is only known once
## the first line with a different id has been read.
FNR > 1 {
    ## Prime the look-ahead with the first data line of the ranges file.
    if ( ! primed ) {
        primed = 1
        have_pending = read_range()
    }

    ## The look-ahead line belongs to this id: replace the ranges held in
    ## memory with the whole group of lines that carry this id.
    if ( have_pending && range_id == $1 ) {
        delete ranges
        loaded_id = $1
        do {
            ranges[ fields[3], fields[4] ] = fields[2]
            have_pending = read_range()
        } while ( have_pending && range_id == $1 )
    }

    ## Search only ranges that were loaded for this very id, so ranges left
    ## over from a previous id are never matched against another id. Every
    ## record is handled here, including those read after the ranges file
    ## reached its end, so nothing is left for the END rule to print.
    if ( loaded_id == $1 ) {
        for ( range in ranges ) {
            split( range, pos, SUBSEP )
            if ( $2 >= pos[1] && $2 <= pos[2] ) {
                printf "%s\t%s\n", $0, ranges[ range ]
                break
            }
        }
    }
}

## Nothing is printed here: all records were already handled by the data
## rule. Only release the ranges file.
END {
    close( f2 )
}
结果如下:
id position class
a1 21 Xfact
a1 39 Xfact
a1 77 xbreak
b1 88 Xbreak
b1 122 Xbreak
c1 22 Xbreak
这是家庭作业吗?看起来很像。
BEGIN {
    ## Arguments:
    ##   ARGV[0] = awk
    ##   ARGV[1] = <first_input_argument>
    ##   ARGV[2] = <second_input_argument>
    ##   ARGC    = 3
    ## Take the last argument out of the input list: that file (the one
    ## with the ranges) is read by hand with getline, so only the remaining
    ## input is processed by the pattern rules.
    f2 = ARGV[ --ARGC ];
    all_ranges = 0

    ## Read first line from file with ranges to get 'class' header.
    ret = ( getline line < f2 )
    if ( ret == -1 ) {
        ## Fail before any output is produced instead of printing a header
        ## with an empty class column and detecting the problem later.
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr"
        exit 1
    }
    split( line, fields )
    class_header = fields[2];
}
## First line of the main input (its header): echo it followed by a tab
## and the value stored in 'class_header', then skip the remaining rules.
FNR == 1 {
    header_out = $0 "\t" class_header
    printf "%s\n", header_out
    next
}
## Read the next line of the ranges file 'f2' into the global array
## 'fields' and store its id in 'range_id'.
## Returns 1 when a line was read and 0 at end of file. Terminates the
## program with status 1 on a read error and with status 2 when the line
## does not have exactly four fields.
function read_range(    ret, num) {
    ret = ( getline line < f2 )

    ## Check error.
    if ( ret == -1 ) {
        printf( "%s\n", "ERROR: " ERRNO ) > "/dev/stderr"
        close( f2 )
        exit 1
    }

    ## Check end of file.
    if ( ret == 0 ) {
        return 0
    }

    ## Split line in spaces.
    num = split( line, fields )
    if ( num != 4 ) {
        printf( "%s\n", "ERROR: Bad format of file " f2 ) > "/dev/stderr"
        exit 2
    }

    range_id = fields[1]
    return 1
}

## Data lines of the main input ($1 = id, $2 = position).
##
## The ranges file is consumed one id group at a time. One line of it is
## always held as a look-ahead ('have_pending' is set and 'fields' /
## 'range_id' describe it), because the end of a group is only known once
## the first line with a different id has been read.
FNR > 1 {
    ## Prime the look-ahead with the first data line of the ranges file.
    if ( ! primed ) {
        primed = 1
        have_pending = read_range()
    }

    ## The look-ahead line belongs to this id: replace the ranges held in
    ## memory with the whole group of lines that carry this id.
    if ( have_pending && range_id == $1 ) {
        delete ranges
        loaded_id = $1
        do {
            ranges[ fields[3], fields[4] ] = fields[2]
            have_pending = read_range()
        } while ( have_pending && range_id == $1 )
    }

    ## Search only ranges that were loaded for this very id, so ranges left
    ## over from a previous id are never matched against another id. Every
    ## record is handled here, including those read after the ranges file
    ## reached its end, so nothing is left for the END rule to print.
    if ( loaded_id == $1 ) {
        for ( range in ranges ) {
            split( range, pos, SUBSEP )
            if ( $2 >= pos[1] && $2 <= pos[2] ) {
                printf "%s\t%s\n", $0, ranges[ range ]
                break
            }
        }
    }
}

## Nothing is printed here: all records were already handled by the data
## rule. Only release the ranges file.
END {
    close( f2 )
}
awk -f script.awk file1 file2 | column -t
id position class
a1 21 Xfact
a1 39 Xfact
a1 77 xbreak
b1 88 Xbreak
b1 122 Xbreak
c1 22 Xbreak