Awk 比较两个文件的列_Awk - Fatal编程技术网

Awk 比较两个文件的列

awk

Awk 比较两个文件的列,awk,Awk,我有两个文件，并试图在列的基础上比较这些文件文件1 CALL_3 CALL_1 CALL_2 CALL_5 CALL_3 CALL_2 CALL_1 CALL_4 文件2 CALL_1 GAP:A GAP:G CALL_3 GAP:C GAP:Q GAP:R CALL_5 GAP:R GAP:A CALL_4 GAP:C GAP:D GAP:A GAP:W CALL_2 GAP:C GAP:R GAP:A 我只想打印文件_1中的那些交互，它们

我有两个文件，并试图在列的基础上比较这些文件

文件1

CALL_3  CALL_1
CALL_2  CALL_5
CALL_3  CALL_2
CALL_1  CALL_4

文件2

CALL_1   GAP:A  GAP:G
CALL_3   GAP:C  GAP:Q  GAP:R 
CALL_5   GAP:R  GAP:A
CALL_4   GAP:C  GAP:D  GAP:A  GAP:W
CALL_2   GAP:C  GAP:R  GAP:A

我只想打印文件1中的那些配对：两个 CALL 之间至少有一个共同的 GAP id。

预期输出

CALL_2  CALL_5  GAP:A GAP:R
CALL_3  CALL_2  GAP:C GAP:R
CALL_1  CALL_4  GAP:A

我尝试了以下方法：

# Attempt from the question. While the first file on the command line (File_2)
# is being read (NR==FNR), each row is cached in array a under its first field,
# but only fields $1..$9 are kept, so any further columns are dropped.
# For each File_1 row whose two keys are both cached, the two cached rows are
# printed side by side; no filtering for common GAP ids is performed.
awk 'NR==FNR {
a[$1]=($1 OFS $2 OFS $3 OFS $4 OFS $5 OFS $6 OFS $7 OFS $8 OFS $9)
next 
}
($1 in a)&&($2 in a) {
print a[$1],a[$2]
}' File_2 File_1

它适用于固定数量的列。但文件2中的列数不是固定的（超过1000列）。如何获得预期的输出？

我是在bash中用coreutils实现的。一行：

# Pair every file_1 row with the GAP lists of both of its keys, then keep only
# the GAP ids that occur in both lists.
#  - Inner join: attach the GAP ids of the first key (file_1 column 1).
#  - Outer join: attach the GAP ids of the second key (file_1 column 2).
#  - Every input is sorted on exactly the field join compares (-kN,N); sorting
#    on the whole line or on "field 2 to end of line" can order rows
#    differently from what join expects.
#  - join prints the join field first, so inside bash -c $1 is the SECOND key
#    and $2 the FIRST key; printf swaps them back into file_1 order.
#  - uniq -d prints only repeated lines, i.e. ids present in both GAP lists.
join -1 2 -2 1 \
  <(join -1 1 -2 1 <(sort -k1,1 file_1) <(sort -k1,1 file_2) | sort -k2,2) \
  <(sort -k1,1 file_2) \
  | xargs -L1 bash -c '
      a=$(<<<"${@:3}" tr " " "\n" | sort | uniq -d | tr "\n" " ")
      if [ -n "$a" ]; then printf "%s %s %s\n" "$2" "$1" "$a"; fi
    ' --

对于awk，这很简单：

$ awk '# file2 (read first): cache each row, rebuilt with OFS, under its key in $1.
       (NR==FNR){$1=$1;a[$1]=$0;next}
       # file1: start the output with the pair; strt keeps the untouched copy.
       {str=strt=$1 OFS $2}
       # Split the cached row of the first key into b (b also holds the key itself).
       {split(a[$1],b,OFS)}
       # Append every entry of b that occurs as a whole field in the row of the
       # second key. The appended value is the matching id b[i], not the row a[$2].
       {for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}
       # Print only when at least one common id was appended.
       (str!=strt){print str}' file2 file1

这是如何工作的：

(NR==FNR){$1=$1;a[$1]=$0;next}

第一行将 file2 缓存到关联数组中，形式为 a[key]=value，其中 key 是第一个字段，value 是整行。例如 a["CALL_1"]="CALL_1 GAP:A GAP:G"。注意，我们使用 $1=$1 重建该行，从而将所有 FS 替换为 OFS。

{str=strt=$1 OFS $2}

这只是将 file1 当前行的两个键（例如 CALL_1 CALL_2）同时存储在变量 str 和 strt 中；strt 保留初始值，用于最后判断 str 是否被追加过内容。

{split(a[$1],b,OFS)}
：将缓存的行拆分为数组 b

{for(i in b) if(index(a[$2] OFS, OFS b[i] OFS)) str=str OFS b[i]}

对于数组 b 中的所有条目，检查是否在字符串 a[$2] OFS 中找到了字符串 OFS b[i] OFS。我们在两侧添加额外的 OFS，以确保匹配的是完整字段。我们也会测试像 OFS CALL_2 OFS 这样的值（即键本身），但它永远不会匹配。这是一个很小的开销，但是修复它反而会产生更多的开销。

更优化的版本如下：

$ awk '# file2 (read first): blank the key so $0 becomes OFS-led GAP ids only, and
       # store that rebuilt line ($0, not the now-empty $1) plus the id count.
       (NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next}
       # file1: start the output with the pair; strt keeps the untouched copy.
       {str=strt=$1 OFS $2}
       # Split the shorter id list into b and search in the longer one (s).
       # substr(...,2) drops the leading OFS left behind by the emptied key.
       (c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS}
       (c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS}
       # Append each id of b that occurs as a whole field in s.
       {for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]}
       # Print only when at least one common id was appended.
       (str!=strt){print str}' file2 file1

$ awk '(NR==FNR){k=$1;$1="";a[k]=$0;c[k]=NF-1;next} {str=strt=$1 OFS $2} (c[$1]< c[$2]) {split(substr(a[$1],2),b,OFS);s=a[$2] OFS} (c[$1]>=c[$2]) {split(substr(a[$2],2),b,OFS);s=a[$1] OFS} {for(i in b) if(index(s, OFS b[i] OFS)) str=str OFS b[i]} (str!=strt){print str}' file2 file1
请尝试以下内容

# Print each Input_file1 row followed by the GAP ids shared by its two keys.
# The awk statements must sit on separate lines (or be separated by ";");
# collapsed onto a single line the program does not parse.
awk '
FNR==NR{
  # First file (Input_file2): store the GAP ids of each key without the key.
  val=$1
  $1=""
  $0=$0      # re-split so the emptied first field is dropped
  $1=$1      # rebuild the line with OFS, removing the leading blank
  a[val]=$0
  next
}
{
  val=""
  # Turn the id lists of both keys into sets (array3 and array4).
  num1=split(a[$1],array1," ")
  for(i=1;i<=num1;i++){
    array3[array1[i]]
  }
  num2=split(a[$2],array2," ")
  for(i=1;i<=num2;i++){
    array4[array2[i]]
  }
  # Collect the ids present in both sets.
  for(k in array3){
    if(k in array4){
      val=(val?val OFS:"")k
    }
  }
  if(val){
    print $0,val
  }
  # Reset state before the next Input_file1 row.
  val=""
  delete array1
  delete array2
  delete array3
  delete array4
}
' Input_file2 Input_file1

解释：添加上述代码的详细解释

# Annotated version of the set-intersection program. Each statement and its
# "##" comment must stay on its own line: on a single line the first comment
# would swallow the rest of the program.
awk '                                 ##Starting awk program here.
FNR==NR{                              ##TRUE only while the first Input_file (Input_file2) is being read.
  val=$1                              ##Saving the key ($1 of current line) in variable val.
  $1=""                               ##Nullifying $1 so that only the GAP ids remain.
  $0=$0                               ##Re-splitting the line so that the emptied first field is dropped.
  $1=$1                               ##Rebuilding the line with OFS so that the initial space is removed.
  a[val]=$0                           ##Storing the GAP ids in array a with index val.
  next                                ##next will skip all further statements from here.
}
{
  val=""                              ##Nullifying variable val here.
  num1=split(a[$1],array1," ")        ##Splitting a[$1] into array1 and keeping its element count in num1.
  for(i=1;i<=num1;i++){               ##Looping from i=1 till value of num1.
    array3[array1[i]]                 ##Creating an index in array3 for each GAP id of the first key.
  }
  num2=split(a[$2],array2," ")        ##Splitting a[$2] into array2 and keeping its element count in num2.
  for(i=1;i<=num2;i++){               ##Looping from i=1 till value of num2.
    array4[array2[i]]                 ##Creating an index in array4 for each GAP id of the second key.
  }
  for(k in array3){                   ##Traversing through the indexes of array3 here.
    if(k in array4){                  ##If index k of array3 is also present in array4 then do following.
      val=(val?val OFS:"")k           ##Appending k to val, separated by OFS when val already has a value.
    }
  }
  if(val){                            ##If variable val is NOT NULL then do following.
    print $0,val                      ##Printing current line and variable val here.
  }
  val=""                              ##Nullifying variable val here.
  delete array1                       ##Deleting array1 here.
  delete array2                       ##Deleting array2 here.
  delete array3                       ##Deleting array3 here.
  delete array4                       ##Deleting array4 here.
}
' Input_file2 Input_file1             ##Mentioning Input_file names here.

awk'##在这里启动awk程序。 FNR==NR{{##检查条件FNR==NR，该条件对于正在读取的第一个输入文件为真。 val=$1##创建一个名为val的变量，其值为当前行的$1。 $1=”“##在此处取消$1。 $0=$0##将当前行的值重新分配给自身，以便删除初始空间。 $1=$1##将当前行的值重新分配给自身，以便删除初始空间。 a[val]=$0##创建一个名为a的数组，其索引为val，值为$0。 next##next将跳过此处的所有进一步语句。 } { val=”“##在此处使变量val无效。 num1=拆分（a[$1]，数组1，“”）##将索引为$1的数组a拆分为数组1，其总数以num1表示。对于（i=1；i和GNU awk，对于阵列阵列： $ cat tst.awk NR==FNR { for (i=2; i<=NF; i++) { gaps[$1][$i] } next } { common = "" for (gap in gaps[$1]) { if (gap in gaps[$2]) { common = common OFS gap } } if ( common != "" ) { print $0 common } } $ awk -f tst.awk file2 file1 CALL_2 CALL_5 GAP:A GAP:R CALL_3 CALL_2 GAP:C GAP:R CALL_1 CALL_4 GAP:A 那么CALL\u 1 CALL\u 4呢？它们都有间隙：A@KamilCuk谢谢，我更正了post@RaviSaroch，++ve要想得到一个好的、清晰的问题，请保持它。它会打印整个文件 CALL_2 CALL_5 GAP:A GAP:R CALL_3 CALL_2 GAP:C GAP:R CALL_1 CALL_4 GAP:A awk ' ##Starting awk program here. FNR==NR{ ##Checking condition FNR==NR which will be TRUE for first Input_file is being read. val=$1 ##Creating a variable named val whose value is $1 of current line. $1="" ##Nullifying $1 here. $0=$0 ##Re-assigning value of current line to itself, so that initial space will be removed. $1=$1 ##Re-assigning value of current line to itself, so that initial space will be removed. a[val]=$0 ##Creating an array named a whose index is val and value is $0. next ##next will skip all further statements from here. } { val="" ##Nullifying variable val here. num1=split(a[$1],array1," ") ##splitting array a with index $1 to array1 and having its total number in num1. for(i=1;i<=num1;i++){ ##Starting a for loop from i=1 till value of num1 array3[array1[i]] ##Creating an array named array3 with index of array1 with index i. } num2=split(a[$2],array2," ") ##splitting array a with index $2 to array2 and having its total number in num2. for(i=1;i<=num2;i++){ ##Starting a for loop from i=1 till value of num2. array4[array2[i]] ##Creating an array named array4 with value of array2 with index i. } for(k in array3){ ##Traversing through array3 here. 
if(k in array4){ ##Checking condition if k which is index of array3 is present in array4 then do following. val=(val?val OFS:"")k ##Creating variable named val whose value is variable k with concatenating its own value each time to it. } } if(val){ ##Checking condition if variable val is NOT NULL then do following. print $0,val ##Printing current line and variable val here. } val="" ##Nullifying variable val here. delete array1 ##Deleting array1 here. delete array2 ##Deleting array2 here. delete array3 ##Deleting array3 here. delete array4 ##Deleting array4 here. } ' Input_file2 Input_file1 ##Mentioning Input_file names here. $ cat tst.awk NR==FNR { for (i=2; i<=NF; i++) { gaps[$1][$i] } next } { common = "" for (gap in gaps[$1]) { if (gap in gaps[$2]) { common = common OFS gap } } if ( common != "" ) { print $0 common } } $ awk -f tst.awk file2 file1 CALL_2 CALL_5 GAP:A GAP:R CALL_3 CALL_2 GAP:C GAP:R CALL_1 CALL_4 GAP:A $ cat tst.awk NR==FNR { key = $1 sub(/[^[:space:]]+[[:space:]]+/,"") gaps[key] = $0 next } { mkSet(gaps[$1],gaps1) mkSet(gaps[$2],gaps2) common = "" for (gap in gaps1) { if (gap in gaps2) { common = common OFS gap } } if ( common != "" ) { print $0 common } } function mkSet(str,arr, i,tmp) { delete arr split(str,tmp) for (i in tmp) { arr[tmp[i]] } } $ awk -f tst.awk file2 file1 CALL_2 CALL_5 GAP:A GAP:R CALL_3 CALL_2 GAP:C GAP:R CALL_1 CALL_4 GAP:A