tidyverse 概念(melt 与 spread,即宽表/长表转换)的 awk 等价写法

tidyverse概念的awk等价物(熔化和扩散),awk,text-processing,Awk,Text Processing,我有一些文本日志需要解析并格式化为CSV。 我有一个可以工作的R脚本,但一旦文件大小增加,它就会变慢,据我所知,这个问题似乎是使用awk(或其他命令行工具?)加速的一个很好的候选 我在awk方面做得不多,我面临的问题是如何将我对R中处理的想法转换为awk脚本编写的方式 示例截断的输入数据(Scrap.log): 预期输出(截断): 我在R脚本中的一般步骤: 在文件顶部添加具有新名称的单个标题行 每行的最上面一行(以!!G开头) 标题列(_START)的格式从宽到长 到目前为止,我在awk工作的作

我有一些文本日志需要解析并格式化为CSV。 我有一个可以工作的R脚本,但一旦文件大小增加,它就会变慢,据我所知,这个问题似乎是使用awk(或其他命令行工具?)加速的一个很好的候选

我对 awk 用得不多,我面临的问题是如何把我在 R 中的处理思路转换成 awk 脚本的写法。

示例截断的输入数据(Scrap.log): 预期输出(截断): 我在R脚本中的一般步骤:

  • 在文件顶部添加具有新名称的单个标题行
  • 每行的最上面一行(以!!G开头)
  • 标题列(_START)的格式从宽到长
  • 到目前为止,我用 awk 已经实现的部分包括:
  • 如何抓取和打印标题行
  • awk '/_START/{header=$0; print header}' Scrap.log

  • 如何使用新的标题值写入单行
  • awk 'BEGIN{ORS=""; for(counter=1; counter…(原文的命令在此处被截断)

我认为:

    awk '
    # Reshape a log from wide to long CSV.
    #
    # Input is read record by record:
    #   - lines containing ";" are skipped,
    #   - a line containing "!!G" supplies the 14 leading values of every
    #     following output row (fields 2, 11-15, then 3-10, in that order),
    #   - a line containing "_START " supplies the column names (fields 2..NF),
    #   - any other line containing a digit is a data row: one CSV row is
    #     printed per field 2..NF, made of the 14 saved values, field 1,
    #     the column name for that field and the field value.
    BEGIN {
        # output the header line
        print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
    }

    # Strip a trailing carriage return so that files with DOS (CRLF) line
    # endings do not leak "\r" into the last field. A "\r" there would be
    # printed in front of the "\n" and break the CSV rows apart.
    # Assigning to $0 through sub() makes awk split the fields again.
    { sub(/\r$/, "") }

    # ignore comment lines (every line that contains a semicolon)
    /;/ { next }

    /!!G/ {
        valcnt = 1
        # save and shuffle the values: field 2 first, then 11-15, then 3-10
        val[valcnt++] = $2
        val[valcnt++] = $11
        val[valcnt++] = $12
        val[valcnt++] = $13
        val[valcnt++] = $14
        val[valcnt++] = $15
        val[valcnt++] = $3
        val[valcnt++] = $4
        val[valcnt++] = $5
        val[valcnt++] = $6
        val[valcnt++] = $7
        val[valcnt++] = $8
        val[valcnt++] = $9
        val[valcnt++] = $10
        # valcnt now points one past the last saved value
        next
    }

    /_START / {
        # these are headers - save them to head, to be reused later
        # head is indexed by field number, so head[i] names data field $i
        for (i = 2; i <= NF; ++i) {
            head[i] = $i
        }
        next
    }

    # Print one CSV row: the values saved from the last "!!G" line followed
    # by firstval, header and value.
    #   firstval - first field of the data row
    #   header   - column name taken from the "_START " line
    #   value    - the field value itself
    # cur and i are local variables (extra parameters, awk idiom).
    function output(firstval, header, value, \
            cur, i) {
        # append after the saved values without moving valcnt, so the
        # next call overwrites the same three slots
        cur = valcnt
        val[cur++] = firstval
        val[cur++] = header
        val[cur++] = value
        # output val as csv: comma between items, newline after the last one
        for (i = 1; i < cur; ++i) {
            printf "%s%s", val[i], i != cur - 1 ? "," : "\n"
        }
    }

    # data rows: every remaining line that contains a digit
    /[0-9]+/ {
        for (i = 2; i <= NF; ++i) {
            # add first column, the matching header and the value
            # to the saved values and print them as one row
            output($1, head[i], $i)
        }
    }

    '
    
    awk '
    BEGIN{
    #输出标题行
    print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
    }
    #忽略注释行
    /;/{next}
    /!!G/{
    valcnt=1
    #保存并重新排列这些值
    val[valcnt++]=$2
    val[valcnt++]=$11
    val[valcnt++]=$12
    val[valcnt++]=$13
    val[valcnt++]=$14
    val[valcnt++]=$15
    val[valcnt++]=$3
    val[valcnt++]=$4
    val[valcnt++]=$5
    val[valcnt++]=$6
    val[valcnt++]=$7
    val[valcnt++]=$8
    val[valcnt++]=$9
    val[valcnt++]=$10
    next
    }
    /_START /{
    #这些是标题-将它们保存到 head,以便以后重用
    
    for (i=2; …(原文的代码在此处被截断)
    如上所述,它在 output 函数的循环末尾过早换行,例如:
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,%,10,Header12,44.28
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,STATE1,Header13,16.57
    99999,CURRENT,XYZ,ABC,STATE1,STATE1,STATE1,%,%,%,%,Header1,STATE2,%,%,%,%,Header1,%,%4
    ,11.52
    我可以通过修改最后一个代码块中的循环条件来消除多余的换行:
    /[0-9]+/{for(i=2;i…(原文的代码在此处被截断)
    但这样会少输出一行,您知道原因吗?
    评论:请确保文件中没有 DOS(CRLF)行结尾。
    回复:啊,您是正确的。该更改会删除两个块中的 Header14 行。我必须找到另一种方法来防止过早换行。
    HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header1,12.23
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header2,1.91
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header3,6.63
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header4,1.68
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header5,50.03
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header6,0.5
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,10,Header7,13.97
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header1,11.32
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header2,1.94
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header3,6.64
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header4,1.94
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header5,50.12
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header6,0.58
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,11,Header7,15.1
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header1,12.96
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header2,2.15
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header3,6.57
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header4,2.12
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header5,55.6
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header6,0.62
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,12,Header7,16.24
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header1,11.43
    99999,CURRENT,XYZ,ABC,STATE1,STATE2,%,%,%,%,%,%,%,%,13,Header2,2.18
    ...
    
    
    awk '
    # Reshape a log from wide to long CSV.
    #
    # Input is read record by record:
    #   - lines containing ";" are skipped,
    #   - a line containing "!!G" supplies the 14 leading values of every
    #     following output row (fields 2, 11-15, then 3-10, in that order),
    #   - a line containing "_START " supplies the column names (fields 2..NF),
    #   - any other line containing a digit is a data row: one CSV row is
    #     printed per field 2..NF, made of the 14 saved values, field 1,
    #     the column name for that field and the field value.
    BEGIN {
        # output the header line
        print "HH1,HH2,HH3,HH4,HH5,HH6,HH7,HH8,HH9,HH10,HH11,HH12,HH13,HH14,START,HeaderName,Value"
    }

    # Strip a trailing carriage return so that files with DOS (CRLF) line
    # endings do not leak "\r" into the last field. A "\r" there would be
    # printed in front of the "\n" and break the CSV rows apart.
    # Assigning to $0 through sub() makes awk split the fields again.
    { sub(/\r$/, "") }

    # ignore comment lines (every line that contains a semicolon)
    /;/ { next }

    /!!G/ {
        valcnt = 1
        # save and shuffle the values: field 2 first, then 11-15, then 3-10
        val[valcnt++] = $2
        val[valcnt++] = $11
        val[valcnt++] = $12
        val[valcnt++] = $13
        val[valcnt++] = $14
        val[valcnt++] = $15
        val[valcnt++] = $3
        val[valcnt++] = $4
        val[valcnt++] = $5
        val[valcnt++] = $6
        val[valcnt++] = $7
        val[valcnt++] = $8
        val[valcnt++] = $9
        val[valcnt++] = $10
        # valcnt now points one past the last saved value
        next
    }

    /_START / {
        # these are headers - save them to head, to be reused later
        # head is indexed by field number, so head[i] names data field $i
        for (i = 2; i <= NF; ++i) {
            head[i] = $i
        }
        next
    }

    # Print one CSV row: the values saved from the last "!!G" line followed
    # by firstval, header and value.
    #   firstval - first field of the data row
    #   header   - column name taken from the "_START " line
    #   value    - the field value itself
    # cur and i are local variables (extra parameters, awk idiom).
    function output(firstval, header, value, \
            cur, i) {
        # append after the saved values without moving valcnt, so the
        # next call overwrites the same three slots
        cur = valcnt
        val[cur++] = firstval
        val[cur++] = header
        val[cur++] = value
        # output val as csv: comma between items, newline after the last one
        for (i = 1; i < cur; ++i) {
            printf "%s%s", val[i], i != cur - 1 ? "," : "\n"
        }
    }

    # data rows: every remaining line that contains a digit
    /[0-9]+/ {
        for (i = 2; i <= NF; ++i) {
            # add first column, the matching header and the value
            # to the saved values and print them as one row
            output($1, head[i], $i)
        }
    }

    '