Linux 在UNIX中创建透视表

Linux 在UNIX中创建透视表,linux,shell,unix,awk,Linux,Shell,Unix,Awk,下面是我的输入数据,我正在尝试创建一个透视表 input.txt ID,CreateDate,Category,Region,PublishDate,Code,Listing,Type,ModifiedDate FRU426131598,22-Aug-16,SELLING,COUNTRY,22-Aug-16,1,SAMPLE,GRAPE,22-Aug-16 FRU426175576,23-Aug-16,SELLING,COUNTRY,23-Aug-16,1,SAMPLE,APPLE,23-Aug

下面是我的输入数据,我正在尝试创建一个透视表

input.txt

ID,CreateDate,Category,Region,PublishDate,Code,Listing,Type,ModifiedDate
FRU426131598,22-Aug-16,SELLING,COUNTRY,22-Aug-16,1,SAMPLE,GRAPE,22-Aug-16
FRU426175576,23-Aug-16,SELLING,COUNTRY,23-Aug-16,1,SAMPLE,APPLE,23-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,APPLE,26-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,APPLE,26-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,GRAPE,26-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,GRAPE,26-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,APPLE,26-Aug-16
FRU427163049,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,APPLE,26-Aug-16
FRU426972836,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,ORANGE,26-Aug-16
FRU427322180,28-Aug-16,SELLING,COUNTRY,28-Aug-16,1,SAMPLE,GRAPE,28-Aug-16
FRU427032658,26-Aug-16,SELLING,COUNTRY,26-Aug-16,1,SAMPLE,APPLE,26-Aug-16
FRU427373494,29-Aug-16,SELLING,COUNTRY,29-Aug-16,1,SAMPLE,GRAPE,29-Aug-16
FRU427373069,29-Aug-16,SELLING,COUNTRY,29-Aug-16,1,SAMPLE,GRAPE,29-Aug-16
FRU425669484,19-Aug-16,SELLING,COUNTRY,19-Aug-16,1,SAMPLE,APPLE,19-Aug-16
FRU425616815,18-Aug-16,SELLING,COUNTRY,18-Aug-16,1,SAMPLE,APPLE,18-Aug-16
FRU420018273,25-Sep-16,SELLING,COUNTRY,25-Sep-16,1,SAMPLE,ORANGE,25-Sep-16
FRU435018589,25-Sep-16,SELLING,COUNTRY,25-Sep-16,1,SAMPLE,ORANGE,25-Sep-16
FRU421375128,26-Sep-16,SELLING,COUNTRY,26-Sep-16,1,SAMPLE,APPLE,26-Sep-16
FRU434911933,21-Sep-16,SELLING,COUNTRY,21-Sep-16,1,SAMPLE,ORANGE,21-Sep-16
FRU434594125,21-Sep-16,SELLING,COUNTRY,21-Sep-16,1,SAMPLE,ORANGE,21-Sep-16
以 Type 作为行字段,CreateDate 作为列字段,单元格的值为对应 ID 字段的计数。

期望输出:

Row Labels  18-Aug-16   19-Aug-16   22-Aug-16   23-Aug-16   26-Aug-16   28-Aug-16   29-Aug-16   21-Sep-16   25-Sep-16   26-Sep-16   Grand Total
APPLE   1   1       1   5                   1   9
GRAPE           1       2   1   2               6
ORANGE                  1           2   2       5
Grand Total 1   1   1   1   8   1   2   2   2   1   20

有什么办法吗?我可以使用awk获取createdDate的计数。但无法创建包含行和列的数据透视表。

awk 来帮忙!

这可以让你开始

$ awk -F, -v OFS='\t' '
    # Pivot counts: rows are the next-to-last CSV field, columns are
    # field 2. Record 1 (the header) is skipped.
    NR > 1 {
        type = $(NF-1)
        day  = $2
        types[type]          # remember every row label seen
        days[day]            # remember every column label seen
        cell[type, day]++    # one more record for this row/column pair
    }
    END {
        # Header row. The for-in order is whatever awk yields, so the
        # columns are not sorted.
        header = "Row Labels"
        for (day in days)
            header = header OFS day
        print header

        # One output row per label; pairs never seen print as empty.
        for (type in types) {
            row = type
            for (day in days)
                row = row OFS cell[type, day]
            print row
        }
    }' file

Row Labels      19-Aug-16       29-Aug-16       23-Aug-16       18-Aug-16       28-Aug-16       22-Aug-16       26-Aug-16       26-Sep-16  21-Sep-16       25-Sep-16
APPLE   1               1       1                       5       1
ORANGE                                                  1               2       2
GRAPE           2                       1       1       2

您可能希望对日期进行排序(不是那么容易),并可以添加总计(很容易)。

awk 来帮忙!

这可以让你开始

$ awk -F, -v OFS='\t' '
    # Pivot counts: rows are the next-to-last CSV field, columns are
    # field 2. Record 1 (the header) is skipped.
    NR > 1 {
        type = $(NF-1)
        day  = $2
        types[type]          # remember every row label seen
        days[day]            # remember every column label seen
        cell[type, day]++    # one more record for this row/column pair
    }
    END {
        # Header row. The for-in order is whatever awk yields, so the
        # columns are not sorted.
        header = "Row Labels"
        for (day in days)
            header = header OFS day
        print header

        # One output row per label; pairs never seen print as empty.
        for (type in types) {
            row = type
            for (day in days)
                row = row OFS cell[type, day]
            print row
        }
    }' file

Row Labels      19-Aug-16       29-Aug-16       23-Aug-16       18-Aug-16       28-Aug-16       22-Aug-16       26-Aug-16       26-Sep-16  21-Sep-16       25-Sep-16
APPLE   1               1       1                       5       1
ORANGE                                                  1               2       2
GRAPE           2                       1       1       2

您可能希望对日期进行排序(不是那么容易),并可以添加总计(很容易)。

这里有一种对日期进行排序的方法。需要GNU awk

awk -F, '
    # Convert a DD-Mon-YY date (e.g. 22-Aug-16) to epoch seconds so the
    # columns can be put in chronological order. Requires GNU awk.
    #   date     - the date string to convert
    #   arr, mon - local scratch variables, not passed by callers
    function date2epoch(date,    arr,mon) {
        split(date, arr, /-/)
        # Month abbreviations sit at offsets 1, 4, 7, ... of the lookup string.
        mon = (index("JanFebMarAprMayJunJulAugSepOctNovDec", arr[2]) - 1) / 3 + 1
        # Two-digit years are taken to be 20YY; time of day is midnight.
        return mktime("20" arr[3] " " mon " " arr[1] " 0 0 0")
    }
    # Data rows only; record 1 is the CSV header.
    NR > 1 {
        # Columns are keyed on CreateDate (field 2), as the pivot asks for.
        # The last field is ModifiedDate, and on CRLF input it would also
        # carry a trailing carriage return.
        d = date2epoch($2)
        dates[d]
        count[$(NF-1)][d]++
        total[d]++
    }
    END {
        # Epoch keys are numbers, so order them numerically. A string
        # ordering misplaces timestamps whose digit counts differ.
        ndates = asorti(dates, order, "@ind_num_asc")

        # Row labels are text: walk them in ascending string order.
        PROCINFO["sorted_in"] = "@ind_str_asc"

        printf "Row Label"
        for (i = 1; i <= ndates; i++)
            printf "\t%s", strftime("%d-%b-%y", order[i])
        print ""

        for (type in count) {
            printf "%s", type
            # A pair that never occurred prints as an empty cell.
            for (i = 1; i <= ndates; i++)
                printf "\t%s", count[type][order[i]]
            print ""
        }

        printf "Total"
        for (i = 1; i <= ndates; i++)
            printf "\t%s", total[order[i]]
        print ""
    }
' file

这里有一个排序日期的方法。需要GNU awk

awk -F, '
    # Convert a DD-Mon-YY date (e.g. 22-Aug-16) to epoch seconds so the
    # columns can be put in chronological order. Requires GNU awk.
    #   date     - the date string to convert
    #   arr, mon - local scratch variables, not passed by callers
    function date2epoch(date,    arr,mon) {
        split(date, arr, /-/)
        # Month abbreviations sit at offsets 1, 4, 7, ... of the lookup string.
        mon = (index("JanFebMarAprMayJunJulAugSepOctNovDec", arr[2]) - 1) / 3 + 1
        # Two-digit years are taken to be 20YY; time of day is midnight.
        return mktime("20" arr[3] " " mon " " arr[1] " 0 0 0")
    }
    # Data rows only; record 1 is the CSV header.
    NR > 1 {
        # Columns are keyed on CreateDate (field 2), as the pivot asks for.
        # The last field is ModifiedDate, and on CRLF input it would also
        # carry a trailing carriage return.
        d = date2epoch($2)
        dates[d]
        count[$(NF-1)][d]++
        total[d]++
    }
    END {
        # Epoch keys are numbers, so order them numerically. A string
        # ordering misplaces timestamps whose digit counts differ.
        ndates = asorti(dates, order, "@ind_num_asc")

        # Row labels are text: walk them in ascending string order.
        PROCINFO["sorted_in"] = "@ind_str_asc"

        printf "Row Label"
        for (i = 1; i <= ndates; i++)
            printf "\t%s", strftime("%d-%b-%y", order[i])
        print ""

        for (type in count) {
            printf "%s", type
            # A pair that never occurred prints as an empty cell.
            for (i = 1; i <= ndates; i++)
                printf "\t%s", count[type][order[i]]
            print ""
        }

        printf "Total"
        for (i = 1; i <= ndates; i++)
            printf "\t%s", total[order[i]]
        print ""
    }
' file

使用 GNU awk 4.*(需要它提供的真正多维数组和 sorted_in 有序遍历功能):

$ cat tst.awk
# Pivot a CSV: one row per value of field 8, one column per value of
# field 2 (a DD-Mon-YY date), each cell holding the number of matching
# records, plus row and column grand totals.
# Needs GNU awk 4+ (arrays of arrays, PROCINFO["sorted_in"]).
BEGIN {
    FS = ","
    OFS = "\t"
}

# Data rows only; record 1 is the header.
NR > 1 {
    split($2, part, /-/)
    # Position of the month abbreviation in the lookup string -> 1..12.
    monthNum = (match("JanFebMarAprMayJunJulAugSepOctNovDec", part[2]) + 2) / 3
    # YYMMDD key: sorting it as a string sorts the dates chronologically.
    key = sprintf("%02d%02d%02d", part[3], monthNum, part[1])
    label[key] = $2
    tally[$8][key]++
}

END {
    # Every for-in below walks its indices in ascending string order.
    PROCINFO["sorted_in"] = "@ind_str_asc"

    # Header row.
    line = "Row Labels" OFS
    for (key in label)
        line = line label[key] OFS
    print line "Grand Total"

    # One row per field-8 value, accumulating column totals on the way.
    for (name in tally) {
        rowSum = 0
        line = name OFS
        for (key in label) {
            if (key in tally[name])
                n = tally[name][key]
            else
                n = ""       # leave never-seen combinations blank
            line = line n OFS
            colSum[key] += n
            rowSum += n
        }
        print line rowSum
    }

    # Footer row: per-date totals followed by the overall total.
    line = "Grand Total" OFS
    for (key in label) {
        line = line colSum[key] OFS
        total += colSum[key]
    }
    print line total
}


使用 GNU awk 4.*(需要它提供的真正多维数组和 sorted_in 有序遍历功能):

$ cat tst.awk
# Pivot a CSV: one row per value of field 8, one column per value of
# field 2 (a DD-Mon-YY date), each cell holding the number of matching
# records, plus row and column grand totals.
# Needs GNU awk 4+ (arrays of arrays, PROCINFO["sorted_in"]).
BEGIN {
    FS = ","
    OFS = "\t"
}

# Data rows only; record 1 is the header.
NR > 1 {
    split($2, part, /-/)
    # Position of the month abbreviation in the lookup string -> 1..12.
    monthNum = (match("JanFebMarAprMayJunJulAugSepOctNovDec", part[2]) + 2) / 3
    # YYMMDD key: sorting it as a string sorts the dates chronologically.
    key = sprintf("%02d%02d%02d", part[3], monthNum, part[1])
    label[key] = $2
    tally[$8][key]++
}

END {
    # Every for-in below walks its indices in ascending string order.
    PROCINFO["sorted_in"] = "@ind_str_asc"

    # Header row.
    line = "Row Labels" OFS
    for (key in label)
        line = line label[key] OFS
    print line "Grand Total"

    # One row per field-8 value, accumulating column totals on the way.
    for (name in tally) {
        rowSum = 0
        line = name OFS
        for (key in label) {
            if (key in tally[name])
                n = tally[name][key]
            else
                n = ""       # leave never-seen combinations blank
            line = line n OFS
            colSum[key] += n
            rowSum += n
        }
        print line rowSum
    }

    # Footer row: per-date totals followed by the overall total.
    line = "Grand Total" OFS
    for (key in label) {
        line = line colSum[key] OFS
        total += colSum[key]
    }
    print line total
}