如何使用Awk或Bash在一个文件中组合具有相同标题的列_Bash_Awk

如何使用Awk或Bash在一个文件中组合具有相同标题的列

bash awk

如何使用Awk或Bash在一个文件中组合具有相同标题的列,bash,awk,Bash,Awk,我想知道如何使用bash/sed/awk在文件中组合具有重复标题的列 x y x y s1 3 4 6 10 s2 3 9 10 7 s3 7 1 3 2 致：毫无疑问，有更好的方式显示标题，但我的awk有点粗略毫无疑问，有更好的方式显示标题，但我的awk有点粗略这不是一行。您可以使用bashv4、Bash的dictonary和一些shell工具来实现这一点使用文件名执行下面的脚本以处理参数 bash script_below.sh your_file 以下是脚本：

我想知道如何使用bash/sed/awk在文件中组合具有重复标题的列

   x y  x  y
s1 3 4  6 10
s2 3 9 10  7
s3 7 1  3  2

致：

毫无疑问，还有更好的方式来输出标题行，但我的 awk 有点生疏了。

毫无疑问，还有更好的方式来输出标题行，但我的 awk 有点生疏了。

这不是单行命令（one-liner）。您可以使用 Bash v4 的关联数组（dictionary）和一些 shell 工具来实现这一点。

执行下面的脚本，并把要处理的文件名作为参数传入：

bash script_below.sh your_file

以下是脚本：

# Sum the columns of a whitespace-separated table that share a header name.
#
# Usage: bash script_below.sh your_file
#
# The first line of your_file lists the column headers; every following line
# starts with a row label followed by one integer per header. Requires
# bash 4+ (associative arrays). No external commands are spawned.

#######################################
# Print the table with same-named columns added together.
# Columns are emitted in the order in which each header first appears.
# Arguments: $1 - path of the input file
# Outputs:   tab-separated table on stdout (every cell, including the last
#            one of a line, is followed by a tab); diagnostics on stderr
# Returns:   0 on success (an empty file produces no output),
#            1 if the file is unreadable or its first line has no headers,
#            2 on wrong usage
#######################################
combine_columns() {
  local -r prog=${0##*/}
  # Optional sign, leading zeros stripped so "08" is not parsed as octal.
  local -r int_re='^([+-]?)0*([0-9]+)$'

  if (( $# != 1 )); then
    printf 'Usage: %s FILE\n' "$prog" >&2
    return 2
  fi

  local -r file=$1
  if [[ ! -r "$file" || -d "$file" ]]; then
    printf '%s: cannot read file: %s\n' "$prog" "$file" >&2
    return 1
  fi

  local -a headers=() order=() fields=()
  local -A seen=() sums=()
  local line key value
  local -i i header_done=0

  {
    # Header line: remember every header by position and the order in
    # which distinct names first show up.
    IFS= read -r line || [[ -n "$line" ]] || return 0
    line=${line%$'\r'}
    IFS=$' \t' read -ra headers <<<"$line"
    if (( ${#headers[@]} == 0 )); then
      printf '%s: no column headers on the first line of %s\n' \
        "$prog" "$file" >&2
      return 1
    fi
    for key in "${headers[@]}"; do
      if [[ -z "${seen[$key]+x}" ]]; then
        seen[$key]=1
        order+=("$key")
      fi
    done

    # Data lines: fields[0] is the row label, fields[i] belongs to
    # headers[i-1]. Fields beyond the last header are ignored.
    while IFS= read -r line || [[ -n "$line" ]]; do
      line=${line%$'\r'}
      IFS=$' \t' read -ra fields <<<"$line"
      (( ${#fields[@]} > 0 )) || continue

      sums=()
      for (( i = 1; i < ${#fields[@]} && i <= ${#headers[@]}; i++ )); do
        key=${headers[i - 1]}
        value=${fields[i]}
        # Validate before doing arithmetic: arbitrary text inside $(( ))
        # would be evaluated as an expression.
        if [[ $value =~ $int_re ]]; then
          value=${BASH_REMATCH[1]}${BASH_REMATCH[2]}
          sums[$key]=$(( ${sums[$key]:-0} + value ))
        else
          printf '%s: ignoring non-integer value "%s" in row "%s"\n' \
            "$prog" "$value" "${fields[0]}" >&2
        fi
      done

      # The header row is printed lazily, just before the first data row.
      if (( ! header_done )); then
        printf '\t'
        printf '%s\t' "${order[@]}"
        printf '\n'
        header_done=1
      fi

      printf '%s\t' "${fields[0]}"
      for key in "${order[@]}"; do
        printf '%s\t' "${sums[$key]:-0}"
      done
      printf '\n'
    done
  } < "$file"
}

combine_columns "$@"

declare-A coltofield
headerdone=0
#获取输入文件的第一行并提取所有字段
#以及他们的立场。从位置值2开始，因为
#以下行的格式
读行时；做
colnum=$（echo$行| cut-d“=”-f1）
字段=$（回显$行|剪切-d“=”-f 2）
coltofield[$colnum]=$field
完成<这不是一行。您可以使用bashv4、Bash的dictonary和一些shell工具来实现这一点
执行下面的脚本，并把要处理的文件名作为参数传入：
bash script_below.sh your_file

以下是脚本：
# Sum the columns of a whitespace-separated table that share a header name.
#
# Usage: bash script_below.sh your_file
#
# The first line of your_file lists the column headers; every following line
# starts with a row label followed by one integer per header. Requires
# bash 4+ (associative arrays). No external commands are spawned.

#######################################
# Print the table with same-named columns added together.
# Columns are emitted in the order in which each header first appears.
# Arguments: $1 - path of the input file
# Outputs:   tab-separated table on stdout (every cell, including the last
#            one of a line, is followed by a tab); diagnostics on stderr
# Returns:   0 on success (an empty file produces no output),
#            1 if the file is unreadable or its first line has no headers,
#            2 on wrong usage
#######################################
combine_columns() {
  local -r prog=${0##*/}
  # Optional sign, leading zeros stripped so "08" is not parsed as octal.
  local -r int_re='^([+-]?)0*([0-9]+)$'

  if (( $# != 1 )); then
    printf 'Usage: %s FILE\n' "$prog" >&2
    return 2
  fi

  local -r file=$1
  if [[ ! -r "$file" || -d "$file" ]]; then
    printf '%s: cannot read file: %s\n' "$prog" "$file" >&2
    return 1
  fi

  local -a headers=() order=() fields=()
  local -A seen=() sums=()
  local line key value
  local -i i header_done=0

  {
    # Header line: remember every header by position and the order in
    # which distinct names first show up.
    IFS= read -r line || [[ -n "$line" ]] || return 0
    line=${line%$'\r'}
    IFS=$' \t' read -ra headers <<<"$line"
    if (( ${#headers[@]} == 0 )); then
      printf '%s: no column headers on the first line of %s\n' \
        "$prog" "$file" >&2
      return 1
    fi
    for key in "${headers[@]}"; do
      if [[ -z "${seen[$key]+x}" ]]; then
        seen[$key]=1
        order+=("$key")
      fi
    done

    # Data lines: fields[0] is the row label, fields[i] belongs to
    # headers[i-1]. Fields beyond the last header are ignored.
    while IFS= read -r line || [[ -n "$line" ]]; do
      line=${line%$'\r'}
      IFS=$' \t' read -ra fields <<<"$line"
      (( ${#fields[@]} > 0 )) || continue

      sums=()
      for (( i = 1; i < ${#fields[@]} && i <= ${#headers[@]}; i++ )); do
        key=${headers[i - 1]}
        value=${fields[i]}
        # Validate before doing arithmetic: arbitrary text inside $(( ))
        # would be evaluated as an expression.
        if [[ $value =~ $int_re ]]; then
          value=${BASH_REMATCH[1]}${BASH_REMATCH[2]}
          sums[$key]=$(( ${sums[$key]:-0} + value ))
        else
          printf '%s: ignoring non-integer value "%s" in row "%s"\n' \
            "$prog" "$value" "${fields[0]}" >&2
        fi
      done

      # The header row is printed lazily, just before the first data row.
      if (( ! header_done )); then
        printf '\t'
        printf '%s\t' "${order[@]}"
        printf '\n'
        header_done=1
      fi

      printf '%s\t' "${fields[0]}"
      for key in "${order[@]}"; do
        printf '%s\t' "${sums[$key]:-0}"
      done
      printf '\n'
    done
  } < "$file"
}

combine_columns "$@"

declare-A coltofield
headerdone=0
#获取输入文件的第一行并提取所有字段
#以及他们的立场。从位置值2开始，因为
#以下行的格式
读行时；做
colnum=$（echo$行| cut-d“=”-f1）
字段=$（回显$行|剪切-d“=”-f 2）
coltofield[$colnum]=$field
完成<$cat文件
x y x y
s1 3 4 6 10
s2 3 9 10 7
s3 7 1 3 2
$cat tst.awk
NR==1{
对于（i=1；i$cat文件
x y x y
s1 3 4 6 10
s2 3 9 10 7
s3 7 1 3 2
$cat tst.awk
NR==1{
对于（i=1；i另一个AWK备选方案：
$ cat f
   x y  x  y
s1 3 4  6 10
s2 3 9 10  7
s3 7 1  3  2

$ cat f.awk
# f.awk -- add together the columns of a table that share a header name.
#
# Input:  first line = column headers; each later line = a row label
#         followed by one value per header.
# Output: OFS-separated table; every cell, including the last one of a
#         line, is followed by OFS. Columns appear in the order in which
#         each header first occurs.

BEGIN {
  OFS = "\t"
}

# Header line. Data rows carry a leading label, so header field f describes
# data field f+1; data field 1 gets the empty header "".
NR == 1 {
  nHdrs = 0
  fld2hdr[1] = ""
  hdrOrder[++nHdrs] = ""
  for (f = 1; f <= NF; ++f) {
    fld2hdr[f + 1] = $f
    if (!($f in seenHdr)) {
      seenHdr[$f] = 1
      hdrOrder[++nHdrs] = $f
    }
  }

  # "for (i in array)" visits keys in an unspecified order, so the explicit
  # hdrOrder list is what fixes the column order.
  row = ""
  for (h = 1; h <= nHdrs; ++h)
    row = row hdrOrder[h] OFS
  print row
  next
}

# A blank data line is passed through as an empty output line.
NF == 0 {
  print ""
  next
}

{
  split("", colValues)          # portable way to empty the array

  for (f = 1; f <= NF; ++f) {
    if (!(f in fld2hdr))        # more fields than headers: nothing to add to
      continue
    hdr = fld2hdr[f]
    # Values that look numeric (optionally signed) are summed; anything
    # else, such as the row label, is stored as is.
    if ($f ~ /^[-+]?\.?[0-9]/)
      colValues[hdr] += $f
    else
      colValues[hdr] = $f
  }

  row = ""
  for (h = 1; h <= nHdrs; ++h)
    row = row colValues[hdrOrder[h]] OFS
  print row
}

$ awk -f f.awk f
        x       y
s1      9       14
s2      13      16
s3      10      3

$cat f
x y x y
s1 3 4 6 10
s2 3 9 10 7
s3 7 1 3 2
$cat f.awk
开始{
OFS=“\t”；
}
NR==1{
#第一列需要标题
对于（f=NF；f>=1；--f）
$（f+1）=$f；
$1="";
对于（f=1；f另一个AWK备选方案：
$ cat f
   x y  x  y
s1 3 4  6 10
s2 3 9 10  7
s3 7 1  3  2

$ cat f.awk
# f.awk -- add together the columns of a table that share a header name.
#
# Input:  first line = column headers; each later line = a row label
#         followed by one value per header.
# Output: OFS-separated table; every cell, including the last one of a
#         line, is followed by OFS. Columns appear in the order in which
#         each header first occurs.

BEGIN {
  OFS = "\t"
}

# Header line. Data rows carry a leading label, so header field f describes
# data field f+1; data field 1 gets the empty header "".
NR == 1 {
  nHdrs = 0
  fld2hdr[1] = ""
  hdrOrder[++nHdrs] = ""
  for (f = 1; f <= NF; ++f) {
    fld2hdr[f + 1] = $f
    if (!($f in seenHdr)) {
      seenHdr[$f] = 1
      hdrOrder[++nHdrs] = $f
    }
  }

  # "for (i in array)" visits keys in an unspecified order, so the explicit
  # hdrOrder list is what fixes the column order.
  row = ""
  for (h = 1; h <= nHdrs; ++h)
    row = row hdrOrder[h] OFS
  print row
  next
}

# A blank data line is passed through as an empty output line.
NF == 0 {
  print ""
  next
}

{
  split("", colValues)          # portable way to empty the array

  for (f = 1; f <= NF; ++f) {
    if (!(f in fld2hdr))        # more fields than headers: nothing to add to
      continue
    hdr = fld2hdr[f]
    # Values that look numeric (optionally signed) are summed; anything
    # else, such as the row label, is stored as is.
    if ($f ~ /^[-+]?\.?[0-9]/)
      colValues[hdr] += $f
    else
      colValues[hdr] = $f
  }

  row = ""
  for (h = 1; h <= nHdrs; ++h)
    row = row colValues[hdrOrder[h]] OFS
  print row
}

$ awk -f f.awk f
        x       y
s1      9       14
s2      13      16
s3      10      3

$cat f
x y x y
s1 3 4 6 10
s2 3 9 10 7
s3 7 1 3 2
$cat f.awk
开始{
OFS=“\t”；
}
NR==1{
#第一列需要标题
对于（f=NF；f>=1；--f）
$（f+1）=$f；
$1="";
对于（f=1；f这里有一个Perl解决方案，只是为了好玩：
# Sum the columns of table.txt that share a header name (Perl version).
# Rows are printed sorted by label, columns sorted by header name.
perl -e '
  # split " " skips leading blanks and keeps cells equal to 0, which a
  # truthiness filter such as grep{$_} would silently drop.
  @h = split " ", <>;
  @u{@h} = ();                  # distinct header names
  @c = sort keys %u;

  while (<>) {
    @l = split " " or next;     # ignore blank lines
    # $l[0] is the row label; $l[$i] belongs to header $h[$i-1].
    for $i (1 .. $#l) { $t{$l[0]}{$h[$i-1]} += $l[$i] }
  }

  printf "    %s\n", join "  ", @c;
  for $r (sort keys %t) {
    # The label is passed through %s so a "%" in it cannot act as a format.
    printf "%s %s\n", $r, join " ", map { sprintf "%2d", $t{$r}{$_} } @c;
  }
' table.txt

[代码>cat table.txt[代码>cat table.txt[代码]第12号，perl-e的一名名名为“、一名名名为“、一名名名为“代码”的猫表.txt”的一名名名为“、一名名名为“、一名名为”的perl-e”的第二名为“、一名为“、一名为，”而另一名为“、一名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、1.1.1.5.5.3.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5”的第二名为”的第二名为h}{排序键%{$t{$h}}}；'
这里有一个Perl解决方案，只是为了好玩：
# Sum the columns of table.txt that share a header name (Perl version).
# Rows are printed sorted by label, columns sorted by header name.
perl -e '
  # split " " skips leading blanks and keeps cells equal to 0, which a
  # truthiness filter such as grep{$_} would silently drop.
  @h = split " ", <>;
  @u{@h} = ();                  # distinct header names
  @c = sort keys %u;

  while (<>) {
    @l = split " " or next;     # ignore blank lines
    # $l[0] is the row label; $l[$i] belongs to header $h[$i-1].
    for $i (1 .. $#l) { $t{$l[0]}{$h[$i-1]} += $l[$i] }
  }

  printf "    %s\n", join "  ", @c;
  for $r (sort keys %t) {
    # The label is passed through %s so a "%" in it cannot act as a format.
    printf "%s %s\n", $r, join " ", map { sprintf "%2d", $t{$r}{$_} } @c;
  }
' table.txt

[代码>cat table.txt[代码>cat table.txt[代码]第12号，perl-e的一名名名为“、一名名名为“、一名名名为“代码”的猫表.txt”的一名名名为“、一名名名为“、一名名为”的perl-e”的第二名为“、一名为“、一名为，”而另一名为“、一名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、一名为”名为“、一名为“、1.1.1.5.5.3.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5.5”的第二名为”的第二名为h}{排序键%{$t{$h}}}；'
重复标题是否随机定位或始终类似于“x y x y”？重复标题是否随机定位或始终类似于“x y x y”？上面的方法效率很低，对于各种输入文件内容和名称都会以令人惊讶的方式失败，并且不必要地依赖GNU工具。评论中列出的问题太多了。如果你能提供一个可能错误的非常不精确的列表，而不是贬低另一个用户的答案，那将更有帮助。我没有贬低它，我确实提供了一个可能错误的不精确列表（效率低下，对于各种输入文件内容和名称都会失败）。正如我所说，它确实有太多的问题，无法在评论中具体列出。如果还有十几个问题，那么指出一两个问题的例子又有什么意义呢？我不想在脚本只需要重新编写时就每个问题进行长串的评论或其他讨论。以防这样做会有所帮助显示我的一个关注点，下面是在我发布的awk脚本上运行time
的输出：$ time awk -f tst.awk file
real 0m0.078s user 0m0.030s sys 0m0.077s
与上面的 shell 脚本相比：$ time ./tst.sh file
real 0m3.766s user 0m3.139s sys 0m3.350s
。因此，shell脚本是2个数量级幅度比必要的慢。加上由于未引用变量、错误使用读取、使用外部命令等导致的所有潜在问题。我希望您能理解为什么我认为列出所有问题不可行或不值得。当然，纯Awk解决方案比shell脚本快。但为什么使用读取不正确ct？从您的角度来看，我如何改进它？顺便说一句，问题不是提供最快的解决方案，而且对于shell脚本来说，使用extnal命令是否合适。当然，对于便携式解决方案，我会选择不同的方法。上述方法效率非常低，对于各种输入文件内容和名称，都会以令人惊讶的方式失败d不必要地依赖GNU工具。有太多的问题需要在评论中列出。如果你能提供一个可能错误的非常不精确的列表，而不是贬低另一个用户的答案，那将更有帮助。我没有贬低它，我确实提供了一个可能错误的不精确列表（效率低下，对于各种输入文件内容和名称都会失败）。正如我所说，它确实有太多的问题，无法在评论中具体列出。如果还有十几个问题，那么指出一两个问题的例子又有什么意义呢？我不想在脚本只需要重新编写时就每个问题进行长串的评论或其他讨论。以防这样做会有所帮助显示我的一个关注点，下面是在我发布的awk脚本上运行time
的输出：$ time awk -f tst.awk file
real 0m0.078s user 0m0.030s sys 0m0.077s
与上面的 shell 脚本相比：$ time ./tst.sh file
real 0m3.766s user 0m3.139s sys 0m3.350s
。因此，shell脚本是2个数量级震级比需要的慢。加上所有的电位