Bash Awk计算平均值忽略异常值-对于分段文件

Bash Awk计算平均值忽略异常值-对于分段文件,bash,awk,average,outliers,Bash,Awk,Average,Outliers,我的数据文件data.txt如下所示 0.01667 20.53 0.01667 6.35 0.01667 6.94 0.01667 7.07 0.01667 8.06 0.01667 8.10 0.01667 8.25 0.01667 8.71 0.01667 9.31 0.02500 20.19 0.02500 6.35 0.02500 6.92 0.02500 7.07 0.02500 8.08 0.02500 8.09 0.02500 8.24 0.02500 8.70 0.02500

我的数据文件data.txt如下所示

0.01667 20.53
0.01667 6.35
0.01667 6.94
0.01667 7.07
0.01667 8.06
0.01667 8.10
0.01667 8.25
0.01667 8.71
0.01667 9.31
0.02500 20.19
0.02500 6.35
0.02500 6.92
0.02500 7.07
0.02500 8.08
0.02500 8.09
0.02500 8.24
0.02500 8.70
0.02500 9.26
0.03333 19.89
0.03333 6.33
0.03333 6.90
0.03333 7.07
0.03333 8.07
0.03333 8.09
0.03333 8.22
0.03333 8.70
0.03333 9.22
0.04167 19.65
0.04167 6.34
0.04167 6.87
0.04167 7.07
0.04167 8.03
0.04167 8.08
0.04167 8.19
0.04167 8.69
0.04167 9.19
0.05000 19.40
0.05000 6.32
0.05000 6.85
0.05000 7.06
0.05000 8.02
0.05000 8.09
0.05000 8.16
0.05000 8.71
0.05000 9.15
0.05833 19.12
0.05833 6.29
0.05833 6.84
0.05833 7.04
0.05833 8.01
0.05833 8.11
0.05833 8.16
0.05833 8.71
0.05833 9.11
0.06667 18.84
0.06667 6.29
0.06667 6.82
0.06667 7.05
0.06667 7.98
0.06667 8.11
0.06667 8.14
0.06667 8.71
0.06667 9.06
0.07500 18.57
0.07500 6.29
0.07500 6.80
0.07500 7.06
0.07500 7.97
0.07500 8.10
0.07500 8.13
0.07500 8.71
0.07500 9.02
第1列是第2列测量值的采集时间。我需要针对第1列中出现的每个时间,计算第2列中对应值的平均值,并输出该时间及其平均值。我可以使用以下awk代码计算平均值:

# Per-time sum, count and mean of column 2 (negative times are folded into 0),
# sorted numerically by time and written to avgFile.txt.
awk '
    $1 < 0 { $1 = 0 }
    {
        total[$1] += $2
        n[$1]++
    }
    END {
        for (t in total)
            printf "%8.5f   %6.2f   %6d   %6.3f\n", t, total[t], n[t], total[t] / n[t]
    }
' data.txt | sort -n -k1 > avgFile.txt
请注意,我还输出了一些其他内容,以便检查我是否做了正确的事情。正如您可以看到的,每个时隙的数据都包含异常值,我需要删除这些异常值。我已经尝试将在0.01667收集的数据选择到某个文件temp.txt,我有以下awk代码可以正确地删除异常值

# Print only the rows of temp.txt whose column 2 deviates from the
# column-2 mean of the whole file by less than 30% of that mean.
awk '
    {
        line[NR]  = $0
        value[NR] = $2
        total    += $2
    }
    END {
        for (k = 1; k <= NR; k++) {
            mean   = total / NR
            offset = value[k] - mean
            if (sqrt(offset ^ 2) < mean * 30 / 100)
                print line[k]
        }
    }
' temp.txt
但我需要在原始代码中完成这一步,即对每个时间,在计算第2列的平均值之前先删除该时间的异常值


非常感谢您的帮助。

这将计算平均值,然后删除异常值,然后在删除异常值后重新计算平均值:

$ cat tst.awk
# tst.awk -- requires GNU awk (true multi-dimensional arrays).
#
# Input : whitespace-separated "time value" pairs.
# Output: one line per time: time, sum, count, average -- all computed after
#         the outliers have been removed. Every removed value is reported on
#         stderr.
# A value is an outlier when it lies more than `div` (30%) away from the
# initial mean of its time group.
{
    vals[$1][$2]++          # occurrence count, so repeated values are not collapsed
    sum[$1] += $2
    cnt[$1]++
}

END {
    div = 0.3
    for (time in vals) {
        ave  = sum[time] / cnt[time]
        low  = ave * (1 - div)
        high = ave * (1 + div)
        if (low > high) {   # negative mean: put the bounds back in order
            swap = low; low = high; high = swap
        }
        for (val in vals[time]) {
            # Array subscripts are strings; without "+ 0" the tests below
            # would be string comparisons ("8.06" > "12.03" is true).
            num = val + 0
            if ( (num < low) || (num > high) ) {
                reps = vals[time][val]
                for (k = 1; k <= reps; k++)
                    print "Deleting outlier", time, val | "cat>&2"
                sum[time] -= num * reps
                cnt[time] -= reps
            }
        }
    }

    for (time in vals) {
        ave = (cnt[time] > 0 ? sum[time] / cnt[time] : 0)
        print time, sum[time], cnt[time], ave
    }
}

这就是你要找的吗?它将GNU awk用于真正的二维数组。

好的,我告诉过你,当我有时间时,我会编写一个快速脚本,结果不是那么快。这会删除异常值并返回清理后数组的平均值。如果需要,可以实现标准偏差。如果您有问题,请告诉我

#!/bin/bash

## usage [message] [exit-code]
#  Print the optional message followed by the help text on stderr, then
#  terminate the script with exit-code (0 when omitted).
function usage {
    local msg=$1
    local ecode=${2:-0}

    if [ -n "$msg" ]; then
        printf "\n %s\n" "$msg" >&2
    fi

    cat >&2 << helpMessage

usage:  ${0##*/} datafile

This script will process a 2-column datafile to provide average, 
mean and std. deviation for each time group of data while removing
outlying data from the calculation. The datafile format:

    time    value
    0.01667 20.53  <- outlier
    0.01667 6.35
    0.01667 6.94
    ...

Options:

    -h  |  --help  program help (this file)

helpMessage

    exit $ecode
}

## average VALUE...
#  Print the arithmetic mean of the given numbers on stdout.
#  The result comes from bc with scale=6: six decimals, truncated, and
#  without a leading zero for magnitudes below 1 (e.g. ".500000").
#  An argument holding several whitespace-separated numbers is split, so
#  `average "1 2 3"` is the same as `average 1 2 3`.
#  Returns non-zero and prints nothing when there is nothing to average
#  or bc produces no result.
function average {

    local IFS=$' \t\n'
    local -a nums=()
    local expr avg

    # split on whitespace without pathname expansion; read returns
    # non-zero at end of input even though the array was filled
    read -r -d '' -a nums <<< "$*" || true
    (( ${#nums[@]} > 0 )) || return 1

    # a single bc process for the whole list: join the values with '+'
    IFS='+'
    expr="${nums[*]}"

    avg=$( printf "scale=6; (%s)/%s\n" "$expr" "${#nums[@]}" | bc ) || return
    [ -n "$avg" ] || return 1
    printf "%s\n" "$avg"
}

## rmoutlier VALUE...
#  Print, space-separated on one line, the values whose distance from the
#  mean of all values has an integer part below 4. Every rejected value
#  is reported on stderr as "->outlier: VALUE".
#  Uses the `average` function for the mean and bc for the differences.
#  Returns non-zero when the mean or the differences cannot be computed.
function rmoutlier {

    local IFS=$' \t\n'
    local -a vals=() diffs=() kept=()
    local avg d v
    local -i k

    # split on whitespace without pathname expansion
    read -r -d '' -a vals <<< "$*" || true
    if (( ${#vals[@]} == 0 )); then
        echo
        return 0
    fi

    avg=$(average "${vals[@]}") || return
    [ -n "$avg" ] || return 1

    # All differences come from one bc process, one output line per value.
    # The mean is parenthesised so that a negative mean does not produce
    # "x--y", which bc does not accept.
    read -r -d '' -a diffs < <(
        for v in "${vals[@]}"; do
            printf "scale=6; %s-(%s)\n" "$v" "$avg"
        done | bc
    ) || true

    if (( ${#diffs[@]} != ${#vals[@]} )); then
        echo "rmoutlier: bc could not evaluate every value" >&2
        return 1
    fi

    for k in "${!vals[@]}"; do
        d=${diffs[k]#-}         # absolute value
        d=${d%%.*}              # integer part; empty when |diff| < 1
        if (( 10#${d:-0} < 4 )); then
            kept+=( "${vals[k]}" )                  # if whole num diff < 4, keep
        else
            echo "->outlier: ${vals[k]}" >&2        # print outlier to stderr
        fi
    done
    printf "%s\n" "${kept[*]}"                      # return values on one line
}

## respond to -h or --help
if [ "${1:1}" = 'h' ] || [ "${1:2}" = 'help' ]; then
    usage
fi

## set variables
dfn="${1:-dat/outlier.dat}"     # datafile (default dat/outlier.dat)
declare -a tmp=()               # values collected for the current time
ptime=""                        # previous time; empty until the first data line

## validate input filename
test -r "$dfn" || usage "Error: invalid input. File '$dfn' not found" 1

## report_group TIME VALUE...
#  Write to stderr, for one time group: the raw values, the values left
#  after rmoutlier, and the average of the remaining values.
function report_group {
    local gtime=$1
    shift
    local -a kept=()
    local mean

    echo "  time: $gtime  data : '$*'" >&2         # output array to stderr

    read -r -a kept <<< "$(rmoutlier "$@")"         # remove outlier
    echo  "  time: $gtime  clean: '${kept[*]}'" >&2 # output clean array
    mean=$( average "${kept[@]}" )                  # average clean array
    printf "  avgclean: %s\n\n" "$mean" >&2         # output avg of clean array
}

## read all lines; the "|| [ -n ... ]" part keeps a last line that has no
## trailing newline. Extra columns beyond the second are ignored.
while read -r time data _ || [ -n "$time" ]; do

    # skip blank or incomplete lines instead of starting a bogus group
    if [ -z "$time" ] || [ -z "$data" ]; then
        continue
    fi

    # time changed: the collected group is complete, report it
    if [ -n "$ptime" ] && [ "$ptime" != "$time" ]; then
        report_group "$ptime" "${tmp[@]}"
        tmp=()
    fi

    tmp+=( "$data" )
    ptime="$time"           # save previous time for comparison

done <"$dfn"

## process final time block
if [ "${#tmp[@]}" -gt 0 ]; then
    report_group "$ptime" "${tmp[@]}"
fi

unset tmp

exit 0
输出:

附录:文件的写入时间和平均值

下面是脚本的更新部分,它会把时间和去除异常值后的平均值写入输出文件(默认值:dat/outlier.out)。只有涉及“输出文件名”变量ofn的行发生了变化。您可以把任意输出文件名作为第二个参数传给脚本,因此新的用法是:outlier.sh input_file output_file


您的问题不在于awk,而是在读取数据时需要检测异常值。例如,如何将第一个点检测为异常值?这并非易事(例如可参考相关研究)。谢谢你的评论,本杰明。在这种情况下,我使用标准偏差,我认为数据中偏离平均值超过30%的值就是离群值,我认为这足以把这样的值从平均值计算中删除。感谢提供这项研究的链接,我还不能下载完整的手稿。一旦我有了全文,我就会读它。你可以用简单的方法进行实验。比如把特定时间的所有值读入一个数组,对数据求平均值,然后计算每个点的偏差,去掉偏差大于某个阈值的值。例如时间.01667,平均值为9.25。如果将阈值设置为4,则可以消除异常值。同样的逻辑似乎适用于所有异常值。嗨,大卫,我一直在尝试做一些类似的事情,正如你在我的第二段代码中看到的那样,我在其中使用的规则是:如果某个特定值与平均值的偏差超过30%,我就将其删除。我现在面临的问题是如何在我的代码中实现这一点:在代码中选出特定时间的数据,然后删除异常值,最后计算平均值。我的awk技能仍处于初级阶段。@malandisa 我不是awk专家,但一个简单的bash脚本可以轻松处理它。只需使用while循环,同时向tmp数组中添加值,处理数组以删除异常值,然后获取剩余值的平均值、均值和标准偏差,取消设置数组并移动到下一个时间。如果有机会,我会写一个例子。Ed!!非常感谢。这很好用。事实上,这正是我所需要的,我从中学到了很多。谢谢你。不客气,如果这是你想要的答案,请点击旁边的复选标记来表示。我真的需要更多地了解awk。那就简单多了。然而,我没看到你在哪里显示去除异常值后的平均值?也许我只是误读了这个问题。比如时间0.01667,我得到的去除异常值后的平均值为7.84875。我通过手动计算也得到了同样的结果。`print time, sum[time], cnt[time], ave` 这一行会在异常值被 `sum[time] -= val` 从总和中扣除、计数被 `cnt[time]--` 递减之后打印平均值。我的计算发现时间0.01667存在两个异常值:6.35和20.53,因为所有值的原始平均值为9.25778,因此这两个值分别小于该值的70%和大于该值的130%。因此,如果您的计算仅发现一个异常值,那么这将解释最终结果不同的原因。OP没有说明识别异常值的算法,所以我做了一个猜测。OP在评论中说,如果某个特定值与平均值的偏差超过30%,我将其删除。因此,当我在计算中以div=0.3使用30%时,我并不是凭空得出的,我只是不确定我使用它的方式是否正是他们想要的。大卫,谢谢你的脚本。现在我想知道如何将时间和平均值输出到文件中,而不是将所有内容打印到屏幕上。
这对我学习bash脚本很有帮助,但我花了很长时间,仍然无法正确完成。谢谢David。现在它可以正常输出到一个文件。不过,唯一的缺点是速度相当慢。对于一个大约1010行的文件,处理大约需要2分钟,这相当慢。有没有一种方法可以使它更快,或者这已经是它能做到的最好程度?请在此文件上测试它。。。应该以某种方式将-24.000转换为0.00。
#!/bin/bash

## usage [message] [exit-code]
#  Print the optional message followed by the help text on stderr, then
#  terminate the script with exit-code (0 when omitted).
function usage {
    local msg=$1
    local ecode=${2:-0}

    if [ -n "$msg" ]; then
        printf "\n %s\n" "$msg" >&2
    fi

    cat >&2 << helpMessage

usage:  ${0##*/} datafile

This script will process a 2-column datafile to provide average, 
mean and std. deviation for each time group of data while removing
outlying data from the calculation. The datafile format:

    time    value
    0.01667 20.53  <- outlier
    0.01667 6.35
    0.01667 6.94
    ...

Options:

    -h  |  --help  program help (this file)

helpMessage

    exit $ecode
}

## average VALUE...
#  Print the arithmetic mean of the given numbers on stdout.
#  The result comes from bc with scale=6: six decimals, truncated, and
#  without a leading zero for magnitudes below 1 (e.g. ".500000").
#  An argument holding several whitespace-separated numbers is split, so
#  `average "1 2 3"` is the same as `average 1 2 3`.
#  Returns non-zero and prints nothing when there is nothing to average
#  or bc produces no result.
function average {

    local IFS=$' \t\n'
    local -a nums=()
    local expr avg

    # split on whitespace without pathname expansion; read returns
    # non-zero at end of input even though the array was filled
    read -r -d '' -a nums <<< "$*" || true
    (( ${#nums[@]} > 0 )) || return 1

    # a single bc process for the whole list: join the values with '+'
    IFS='+'
    expr="${nums[*]}"

    avg=$( printf "scale=6; (%s)/%s\n" "$expr" "${#nums[@]}" | bc ) || return
    [ -n "$avg" ] || return 1
    printf "%s\n" "$avg"
}

## rmoutlier VALUE...
#  Print, space-separated on one line, the values whose distance from the
#  mean of all values has an integer part below 4. Every rejected value
#  is reported on stderr as "->outlier: VALUE".
#  Uses the `average` function for the mean and bc for the differences.
#  Returns non-zero when the mean or the differences cannot be computed.
function rmoutlier {

    local IFS=$' \t\n'
    local -a vals=() diffs=() kept=()
    local avg d v
    local -i k

    # split on whitespace without pathname expansion
    read -r -d '' -a vals <<< "$*" || true
    if (( ${#vals[@]} == 0 )); then
        echo
        return 0
    fi

    avg=$(average "${vals[@]}") || return
    [ -n "$avg" ] || return 1

    # All differences come from one bc process, one output line per value.
    # The mean is parenthesised so that a negative mean does not produce
    # "x--y", which bc does not accept.
    read -r -d '' -a diffs < <(
        for v in "${vals[@]}"; do
            printf "scale=6; %s-(%s)\n" "$v" "$avg"
        done | bc
    ) || true

    if (( ${#diffs[@]} != ${#vals[@]} )); then
        echo "rmoutlier: bc could not evaluate every value" >&2
        return 1
    fi

    for k in "${!vals[@]}"; do
        d=${diffs[k]#-}         # absolute value
        d=${d%%.*}              # integer part; empty when |diff| < 1
        if (( 10#${d:-0} < 4 )); then
            kept+=( "${vals[k]}" )                  # if whole num diff < 4, keep
        else
            echo "->outlier: ${vals[k]}" >&2        # print outlier to stderr
        fi
    done
    printf "%s\n" "${kept[*]}"                      # return values on one line
}

## respond to -h or --help
if [ "${1:1}" = 'h' ] || [ "${1:2}" = 'help' ]; then
    usage
fi

## set variables
dfn="${1:-dat/outlier.dat}"     # datafile (default dat/outlier.dat)
declare -a tmp=()               # values collected for the current time
ptime=""                        # previous time; empty until the first data line

## validate input filename
test -r "$dfn" || usage "Error: invalid input. File '$dfn' not found" 1

## report_group TIME VALUE...
#  Write to stderr, for one time group: the raw values, the values left
#  after rmoutlier, and the average of the remaining values.
function report_group {
    local gtime=$1
    shift
    local -a kept=()
    local mean

    echo "  time: $gtime  data : '$*'" >&2         # output array to stderr

    read -r -a kept <<< "$(rmoutlier "$@")"         # remove outlier
    echo  "  time: $gtime  clean: '${kept[*]}'" >&2 # output clean array
    mean=$( average "${kept[@]}" )                  # average clean array
    printf "  avgclean: %s\n\n" "$mean" >&2         # output avg of clean array
}

## read all lines; the "|| [ -n ... ]" part keeps a last line that has no
## trailing newline. Extra columns beyond the second are ignored.
while read -r time data _ || [ -n "$time" ]; do

    # skip blank or incomplete lines instead of starting a bogus group
    if [ -z "$time" ] || [ -z "$data" ]; then
        continue
    fi

    # time changed: the collected group is complete, report it
    if [ -n "$ptime" ] && [ "$ptime" != "$time" ]; then
        report_group "$ptime" "${tmp[@]}"
        tmp=()
    fi

    tmp+=( "$data" )
    ptime="$time"           # save previous time for comparison

done <"$dfn"

## process final time block
if [ "${#tmp[@]}" -gt 0 ]; then
    report_group "$ptime" "${tmp[@]}"
fi

unset tmp

exit 0
./outlier.sh  datafile
$ ./outlier.sh dat/outlier.dat
  time: 0.01667  data : '20.53 6.35 6.94 7.07 8.06 8.10 8.25 8.71 9.31'
->outlier: 20.53
  time: 0.01667  clean: '6.35 6.94 7.07 8.06 8.10 8.25 8.71 9.31'
  avgclean: 7.848750

  time: 0.02500  data : '20.19 6.35 6.92 7.07 8.08 8.09 8.24 8.70 9.26'
->outlier: 20.19
  time: 0.02500  clean: '6.35 6.92 7.07 8.08 8.09 8.24 8.70 9.26'
  avgclean: 7.838750

  time: 0.03333  data : '19.89 6.33 6.90 7.07 8.07 8.09 8.22 8.70 9.22'
->outlier: 19.89
  time: 0.03333  clean: '6.33 6.90 7.07 8.07 8.09 8.22 8.70 9.22'
  avgclean: 7.825000

  time: 0.04167  data : '19.65 6.34 6.87 7.07 8.03 8.08 8.19 8.69 9.19'
->outlier: 19.65
  time: 0.04167  clean: '6.34 6.87 7.07 8.03 8.08 8.19 8.69 9.19'
  avgclean: 7.807500

  time: 0.05000  data : '19.40 6.32 6.85 7.06 8.02 8.09 8.16 8.71 9.15'
->outlier: 19.40
  time: 0.05000  clean: '6.32 6.85 7.06 8.02 8.09 8.16 8.71 9.15'
  avgclean: 7.795000

  time: 0.05833  data : '19.12 6.29 6.84 7.04 8.01 8.11 8.16 8.71 9.11'
->outlier: 19.12
  time: 0.05833  clean: '6.29 6.84 7.04 8.01 8.11 8.16 8.71 9.11'
  avgclean: 7.783750

  time: 0.06667  data : '18.84 6.29 6.82 7.05 7.98 8.11 8.14 8.71 9.06'
->outlier: 18.84
  time: 0.06667  clean: '6.29 6.82 7.05 7.98 8.11 8.14 8.71 9.06'
  avgclean: 7.770000

  time: 0.07500  data : '18.57 6.29 6.80 7.06 7.97 8.10 8.13 8.71 9.02'
->outlier: 18.57
  time: 0.07500  clean: '6.29 6.80 7.06 7.97 8.10 8.13 8.71 9.02'
  avgclean: 7.760000
## set variables
dfn="${1:-dat/outlier.dat}"     # datafile (default dat/outlier.dat)
ofn="${2:-dat/outlier.out}"     # output file (default dat/outlier.out)
declare -a tmp=()               # temporary array holding data for given time
ptime=""                        # previous time; empty until the first data line

## validate input filename
test -r "$dfn" || usage "Error: invalid input. File '$dfn' not found" 1

## truncate the output file only after the input is known to be readable,
## so a bad input name does not destroy earlier results
:> "$ofn" || usage "Error: cannot write output file '$ofn'" 1

## read all lines; the "|| [ -n ... ]" part keeps a last line that has no
## trailing newline. Extra columns beyond the second are ignored.
while read -r time data _ || [ -n "$time" ]; do

    # skip blank or incomplete lines instead of starting a bogus group
    if [ -z "$time" ] || [ -z "$data" ]; then
        continue
    fi

    # time changed: the collected group is complete, process it
    if [ -n "$ptime" ] && [ "$ptime" != "$time" ]; then
        echo "  time: $ptime  data : '${tmp[*]}'" >&2       # output array to stderr

        ## process data
        read -r -a clean <<< "$(rmoutlier "${tmp[@]}")"     # remove outlier
        echo  "time: $ptime  clean: '${clean[*]}'" >&2      # output clean array
        avgclean=$( average "${clean[@]}" )                 # average clean array
        printf "  avgclean: %s\n\n" "$avgclean" >&2         # output avg of clean array

        # time and average go to the file as one complete line
        printf "  time: %s    avgclean: %s\n" "$ptime" "$avgclean" >>"$ofn"

        tmp=()              # reset variables for next time
        unset clean
        unset avgclean
    fi

    tmp+=( "$data" )        # collect value for the current time
    ptime="$time"           # save previous time for comparison

done <"$dfn"
time: 0.01667    avgclean: 7.848750
time: 0.02500    avgclean: 7.838750
time: 0.03333    avgclean: 7.825000
time: 0.04167    avgclean: 7.807500
time: 0.05000    avgclean: 7.795000
time: 0.05833    avgclean: 7.783750
time: 0.06667    avgclean: 7.770000