For loop 比较两个输入值重复的文件

For loop 比较两个输入值重复的文件,for-loop,if-statement,awk,For Loop,If Statement,Awk,我有以下两个文件 BC.txt PB.txt 我正在尝试比较BC.txt的Col1和PB.txt的Col12,并将匹配项打印在一起。对于BC.txt的col1中的相同值,col2和Col3中的值不同。因此,在比较时,我只得到BC.txt的一个条目的输出。但我想要所有的 awk 'BEGIN {OFS=FS} NR==FNR {a[$1]=($2" "$3);next} $12 in a {print $0,a[$12]}' BC.txt PB.txt 预期产量 我想比较BC.txt和P

我有以下两个文件

BC.txt PB.txt 我正在尝试比较BC.txt的Col1和PB.txt的Col12,并将匹配项打印在一起。对于BC.txt的col1中的相同值,col2和Col3中的值不同。因此,在比较时,我只得到BC.txt的一个条目的输出。但我想要所有的

    awk 'BEGIN {OFS=FS} NR==FNR {a[$1]=($2" "$3);next} $12 in a {print $0,a[$12]}' BC.txt PB.txt
预期产量
我想比较BC.txt和PB.txt的所有条目;但由于它的值相同,我的代码无法工作。

请尝试以下内容(仅使用您提供的示例进行测试)

awk'
FNR==NR{
a[++计数]=$0
b[计数]=12美元
下一个
}
{

对于(i=1;i如果您不关心与问题中的预期输出相比的输出行顺序,则将BC.txt读入内存,因为它更简洁:

$ cat tst.awk
NR==FNR {
    map[$1,++cnt[$1]] = $2 OFS $3
    next
}
{
    for (c=1; c<=cnt[$12]; c++) {
        print $0, map[$12,c]
    }
}

$ awk -f tst.awk BC.txt PB.txt
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
$cat tst.awk
NR==FNR{
地图[$1,++cnt[$1]]=$2/3
下一个
}
{

对于(c=1;c您可以使用
join
来执行此操作吗?如果列被排序,或者我的文件都是大的。它仍然在运行。但是初始输出指示它的工作是完全符合我的需要的。它们是多大的?是的,从每个行上运行循环都需要时间。我没有考虑文件是大的,所以测试并创建了提供的样本。我会在早上回答,这里已经很晚了:)这将是非常慢的,因为对于BC.txt的每一行,它都会在PB.txt的每一行中循环,并在PB.txt的那一行上再次进行字段拆分。而且-我没有看到在初始化后使用
b[]
。您需要添加“排序”因为join需要排序的输入,所以每个文件都有。join也是我的第一个想法,但是当你需要对输入进行排序,然后通过管道到awk来选择/重新排序字段时,它似乎不是正确的方法。也许是这样,idk。我想对于小文件,awk两次传递是一种享受。我习惯于我们从来没有使用过的生物信息学文件有足够的内存来存储任何东西。@who Join博士非常棒,而且很有帮助;但是由于我需要对文件进行排序,所以我丢失了顺序(这有时很重要)。因此,我正在寻找一个awk解决方案。
    awk 'BEGIN {OFS=FS} NR==FNR {a[$1]=($2" "$3);next} $12 in a {print $0,a[$12]}' BC.txt PB.txt
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
awk '
FNR==NR{
  a[++count]=$0
  b[count]=$12
  next
}
{
  for(i=1;i<=count;i++){
    split(a[i],array," ")
    if($1==array[12]){
       print a[i],$2,$3
    }
  }
}'  PB.txt BC.txt
awk '                         ##Starting awk program here.
FNR==NR{                      ##Checking condition FNR==NR which will be TRUE when PB.txt is being read.
  a[++count]=$0               ##Creating an array named a whose index is variable count with incrment value of 1 and value is current line.
  b[count]=$12                ##Creating an array named b whose  index is variabe count and value if 12th column.
  next                        ##next will skip all further statements from here.
}
{
  for(i=1;i<=count;i++){      ##Starting a for loop from here from i=1 to till value of count.
    split(a[i],array," ")     ##Splitting value of a[i] into array named array whose delimiter is space.
    if($1==array[12]){        ##Checking condition if $1 is equal to array[12] then do following.
       print a[i],$2,$3       ##Printing array a value along with 2nd and 3rd column value.
    }
  }
}'  PB.txt BC.txt             ##Mentioning Input_files names here.
$ cat tst.awk
NR==FNR {
    map[$1,++cnt[$1]] = $2 OFS $3
    next
}
{
    for (c=1; c<=cnt[$12]; c++) {
        print $0, map[$12,c]
    }
}

$ awk -f tst.awk BC.txt PB.txt
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
$ cat tst.awk
NR==FNR {
    map[$12,++cnt[$12]] = $0
    next
}
{
    for (c=1; c<=cnt[$1]; c++) {
        print map[$1,c], $2, $3
    }
}

$ awk -f tst.awk PB.txt BC.txt
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AGCGGCCT; BC=TTTCAGCGCCGA;
c4  PB  tr  41258945    41270445    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41258945    41259026    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41259626    41259754    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
c4  PB  Ex  41262664    41262814    .   +   .   g_i "PB.50262"; t_i "PB.50262.10"; UMI=AAGCGGCC; BC=TTTCAGCGCCGA;
$ join BC.txt <(awk '{print $12,$0}' PB.txt) | cut -d' ' -f 4-
c4 PB tr 41258945 41270445 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41258945 41259026 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41259626 41259754 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41262664 41262814 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB tr 41258945 41270445 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41258945 41259026 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41259626 41259754 . + . g_i "PB.50262"; t_i "PB.50262.10";
c4 PB Ex 41262664 41262814 . + . g_i "PB.50262"; t_i "PB.50262.10";