如何使用 Linux 和 awk 将一个字段文件拆分为多个字段文件?

如何使用 Linux 和 awk 将一个字段文件拆分为多个字段文件?,awk,Awk,我有以下文件list.txt: AbateI. D AcatulloM. A AcerbiF. D AcquafrescaR. A AcquahA. C AdjapongC. D AdnanA. D AdrianoL. A AjetiA. D AlbiolR. D AldeganiG. P AleesamiH. D AlexSandro D AlissonR. P 我想用awk重新排

我有以下文件
list.txt

AbateI.       D
AcatulloM.    A
AcerbiF.      D
AcquafrescaR. A
AcquahA.      C
AdjapongC.    D
AdnanA.       D
AdrianoL.     A
AjetiA.       D
AlbiolR.      D
AldeganiG.    P
AleesamiH.    D
AlexSandro    D
AlissonR.     P
我想用
awk
重新排列文件,按第二列将它们分组,如下所示:

P                    D              C                 A
AldeganiG.         AbateI.         AcquahA.         AcatulloM. 
AlissonR.          AcerbiF.                         AcquafrescaR.
                   AdjapongC.                       AdrianoL. 
                   AdnanA. 
                   AjetiA. 
                   AlbiolR. 
                   AleesamiH.
                   AlexSandro 
这就是我所尝试的:

#!/usr/bin/awk -f
# Attempt at pivoting "name group" lines into one column per group letter
# (P, D, C, A). Incomplete: only the P column is ever filled in by END.

BEGIN {
# Row layout shared by the header and the data rows: a leading tab, three
# left-justified 20-character columns, then the last column unpadded.
FORMAT="\t%-20s%-20s%-20s%s\n"
printf FORMAT,"P","D","C","A"
}

# Store each name ($1) in the array that belongs to its group letter ($2);
# the name is used as both the key and the value.
($2=="P")  {a[$1] = $1}
($2=="D")  {b[$1] = $1}
($2=="C")  {c[$1] = $1}
($2=="A")  {d[$1] = $1}

# Prints one row per P name with the D, C and A cells left empty; arrays b, c
# and d are never read. for-in visits the keys in an unspecified order.
END{for(i in a) printf FORMAT, a[i],"","",""}
但我不知道如何循环和打印其他数组。

$cat tst.awk
$ cat tst.awk
# Pivot "value key" records into a table with one tab-separated column per
# distinct key ($2). Columns appear in for-in order, which awk leaves
# unspecified; the script assumes that order is the same for every loop below.
BEGIN { OFS="\t" }
{
    # Next free row (1-based) in the column that belongs to this key.
    rowNr = ++numColRows[$2]
    val[rowNr,$2] = $1
    # Remember the height of the tallest column seen so far.
    numRows = (rowNr > numRows ? rowNr : numRows)
}
END {
    # Header row: the key names. c counts the cells already printed so that
    # OFS goes between cells only; it starts out unset, i.e. 0.
    for (colName in numColRows) {
        printf "%s%s", (c++ ? OFS : ""), colName
    }
    print ""
    # Body rows: a column shorter than rowNr has no val[] entry there, so its
    # cell prints as an empty string.
    for (rowNr=1; rowNr<=numRows; rowNr++) {
        c = 0
        for (colName in numColRows) {
            printf "%s%s", (c++ ? OFS : ""), val[rowNr,colName]
        }
        print ""
    }
}

$ awk -f tst.awk file | column -s$'\t' -t
A              P           C         D
AcatulloM.     AldeganiG.  AcquahA.  AbateI.
AcquafrescaR.  AlissonR.             AcerbiF.
AdrianoL.                            AdjapongC.
                                     AdnanA.
                                     AjetiA.
                                     AlbiolR.
                                     AleesamiH.
                                     AlexSandro
开始{OFS=“\t”} { 行数=++numColRows[$2] val[rowNr,$2]=1美元 numRows=(行编号>行编号?行编号:numRows) } 结束{ for(numColRows中的colName){ printf“%s%s”,(c++?OFS:),colName } 打印“”
对于(rowNr=1;rowNr,您可以使用
paste
column
进行一些进程替换:

$ paste \
      <(awk '/P$/ {print $1}'<input) \
      <(awk '/D$/ {print $1}'<input) \
      <(awk '/C$/ {print $1}'<input) \
      <(awk '/A$/ {print $1}'<input) | column -s $'\t' -t
AldeganiG.  AbateI.     AcquahA.  AcatulloM.
AlissonR.   AcerbiF.              AcquafrescaR.
            AdjapongC.            AdrianoL.
            AdnanA.
            AjetiA.
            AlbiolR.
            AleesamiH.
            AlexSandro
$paste\

这是一种非传统的方法

$ awk -v OFS='\n' '{a[$2]=a[$2] OFS $1; 
                    c[$2]++; 
                    if(c[$2]>max) max=c[$2]} 
                END{pr="pr -"length(c)"t"; 
                    for(k in a) 
                       {print k a[k] | pr; 
                        for(i=c[k];i<max;i++) 
                           {print ""  | pr}}}'

A                 P                 C                 D
AcatulloM.        AldeganiG.        AcquahA.          AbateI.
AcquafrescaR.     AlissonR.                           AcerbiF.
AdrianoL.                                             AdjapongC.
                                                      AdnanA.
                                                      AjetiA.
                                                      AlbiolR.
                                                      AleesamiH.
                                                      AlexSandro
$awk-vofs='\n'{a[$2]=a[$2]OFS$1;
c[$2]++;
如果(c[$2]>max)max=c[$2]}
结束{pr=“pr-”长度(c)“t”;
对于(a中的k)
{打印ka[k]| pr;

对于(i=c[k];i您也可以使用
grep cut paste expand
组合

# Emit one column: the group letter as its heading, then the name (first
# space-delimited field) of every list.txt line that ends with that letter.
grep_col() {
  echo "$1"
  grep "$1\$" list.txt | cut -d ' ' -f1
}

# Put the four columns side by side; expand turns the tab separators into
# 20-character-wide columns.
paste <(grep_col P) <(grep_col D) <(grep_col C) <(grep_col A) |
  expand -t 20
您可以将
grep cut
替换为
sed
,如下所示

# Emit one column: the group letter as its heading, then every line of
# "file" that ends with that letter, with the letter and the blanks in
# front of it stripped off.
sed_col() {
  echo "$1"
  sed -n "/$1\$/{s/[[:blank:]]*$1\$//;p}" file
}

# Put the four columns side by side; expand turns the tab separators into
# 20-character-wide columns.
paste <(sed_col P) <(sed_col D) <(sed_col C) <(sed_col A) |
  expand -t 20
你也可以这样做

# Each awk prints the group letter as a heading (BEGIN) and then the name of
# every line in "file" that ends with that letter. The trailing backslashes
# are required: they keep all four process substitutions on the single paste
# command line, so the columns end up side by side.
paste \
     <(awk 'BEGIN{print "P"}/P$/{print $1}' file ) \
     <(awk 'BEGIN{print "D"}/D$/{print $1}' file ) \
     <(awk 'BEGIN{print "C"}/C$/{print $1}' file ) \
     <(awk 'BEGIN{print "A"}/A$/{print $1}' file ) | expand -t 20
在GNU awk中:

$ cat > list.awk
# Pivot "word group" records into one 14-character-wide column per group.
# Needs GNU awk 4.0 or later (arrays of arrays). Columns come out in for-in
# order, which is unspecified.
{
    n=(n<++b[$2]?b[$2]:n)                # n is the max count of words in one group
    a[$2][b[$2]]=$1                      # put words to two dimensional array
}
END {
    for(i=1;i<=n;i++) {                  # from 1 to n
        for(j in a)                      # for all groups
            printf "%-14s%s",a[j][i],OFS # print a word, left-justified in 14 chars
        printf "%s",ORS                  # ORS in the end
    }
}
$ awk -f list.awk list.txt
AcatulloM.     AldeganiG.     AcquahA.       AbateI.        
AcquafrescaR.  AlissonR.                     AcerbiF.       
AdrianoL.                                    AdjapongC.     
                                             AdnanA.        
                                             AjetiA.        
                                             AlbiolR.       
                                             AleesamiH.     
                                             AlexSandro     
$cat>list.awk
{

n=(n使用 awk 4.0 二维数组的解决方案——允许以任何顺序输出任意数量的组

# Pivot "name group" lines read from stdin into one column per group.
# Usage: ./myscr.sh GROUP... <in.txt
# The arguments select which groups are printed and fix their left-to-right
# order. Needs GNU awk 4.0 or later (arrays of arrays).
order="$*"
awk -v orderstr="$order" '
BEGIN { norder = split(orderstr, order) }
{
  # grpnames[group][index]=name, index counted from 0 within each group
  grpnames[$2][grpi[$2]++]=$1
}
END {
  # The row count is the size of the largest requested group, so a bigger
  # group that is not being printed does not add trailing blank rows.
  for(j=1; j <= norder; ++j) {
    if(grpi[order[j]] > maxgrpsz)
      maxgrpsz=grpi[order[j]]
  }
  # print groups header in order
  printf("%-20s", order[1])
  for(j=2; j <= norder; ++j) {
    printf("\t%-20s", order[j])
  }
  printf("\n")
  for(i=0; i < maxgrpsz; ++i) {
    # run across each group in output order
    printf("%-20s", grpnames[order[1]][i])
    for(j=2; j <= norder; ++j) {
      grp=order[j]
      printf("\t%-20s", grpnames[grp][i])
    }
    printf("\n")
  }
}
'
#组的输出顺序
命令=$*
awk-vorderstr=“$order””
开始{split(orderstr,order)}
{
#GRP名称[组][索引]=名称
GRP名称[$2][grpi[$2]+]=$1
#跟踪最大组大小
如果(grpi[$2]>maxgrpsz)
maxgrpsz=grpi[$2]
}
结束{
#按顺序打印组标题
printf(“%-20s”,订单[1])

对于(j=2;j创建4个数组,并将
$1
添加到与
$2
对应的数组中。然后在最后,每行从每个数组中各打印一个元素,并持续循环,直到最长数组中的条目用完。我们不会替你编写它,这里不是免费的编码服务。所以请根据这个提示尝试自己实现,如果无法让它正常工作,再回来。我会写它。@Barmar 如果我觉得这个问题有趣,我就会把它写出来,就像我在这个例子中所做的那样。
P                   D                   C                   A
AldeganiG.          AbateI.             AcquahA.            AcatulloM.
AlissonR.           AcerbiF.                                AcquafrescaR.
                    AdjapongC.                              AdrianoL.
                    AdnanA.                                 
                    AjetiA.                                 
                    AlbiolR.                                
                    AleesamiH.                              
                    AlexSandro                              
$ cat > list.awk
# Pivot "word group" records into one 14-character-wide column per group.
# Needs GNU awk 4.0 or later (arrays of arrays). Columns come out in for-in
# order, which is unspecified.
{
    n=(n<++b[$2]?b[$2]:n)                # n is the max count of words in one group
    a[$2][b[$2]]=$1                      # put words to two dimensional array
}
END {
    for(i=1;i<=n;i++) {                  # from 1 to n
        for(j in a)                      # for all groups
            printf "%-14s%s",a[j][i],OFS # print a word, left-justified in 14 chars
        printf "%s",ORS                  # ORS in the end
    }
}
$ awk -f list.awk list.txt
AcatulloM.     AldeganiG.     AcquahA.       AbateI.        
AcquafrescaR.  AlissonR.                     AcerbiF.       
AdrianoL.                                    AdjapongC.     
                                             AdnanA.        
                                             AjetiA.        
                                             AlbiolR.       
                                             AleesamiH.     
                                             AlexSandro     
# Pivot "name group" lines read from stdin into one column per group.
# Usage: ./myscr.sh GROUP... <in.txt
# The arguments select which groups are printed and fix their left-to-right
# order. Needs GNU awk 4.0 or later (arrays of arrays).
order="$*"
awk -v orderstr="$order" '
BEGIN { norder = split(orderstr, order) }
{
  # grpnames[group][index]=name, index counted from 0 within each group
  grpnames[$2][grpi[$2]++]=$1
}
END {
  # The row count is the size of the largest requested group, so a bigger
  # group that is not being printed does not add trailing blank rows.
  for(j=1; j <= norder; ++j) {
    if(grpi[order[j]] > maxgrpsz)
      maxgrpsz=grpi[order[j]]
  }
  # print groups header in order
  printf("%-20s", order[1])
  for(j=2; j <= norder; ++j) {
    printf("\t%-20s", order[j])
  }
  printf("\n")
  for(i=0; i < maxgrpsz; ++i) {
    # run across each group in output order
    printf("%-20s", grpnames[order[1]][i])
    for(j=2; j <= norder; ++j) {
      grp=order[j]
      printf("\t%-20s", grpnames[grp][i])
    }
    printf("\n")
  }
}
'
./myscr.sh P D C A <in.txt
P                       D                       C                       A
AldeganiG.              AbateI.                 AcquahA.                AcatulloM.
AlissonR.               AcerbiF.                                        AcquafrescaR.
                        AdjapongC.                                      AdrianoL.
                        AdnanA.
                        AjetiA.
                        AlbiolR.
                        AleesamiH.
                        AlexSandro
./myscr.sh D A P C <in.txt
D                       A                       P                       C
AbateI.                 AcatulloM.              AldeganiG.              AcquahA.
AcerbiF.                AcquafrescaR.           AlissonR.
AdjapongC.              AdrianoL.
AdnanA.
AjetiA.
AlbiolR.
AleesamiH.
AlexSandro

./myscr.sh A P <in.txt
A                       P
AcatulloM.              AldeganiG.
AcquafrescaR.           AlissonR.
AdrianoL.