awk:使用多列数据的操作

awk:使用多列数据的操作,awk,Awk,以下AWK脚本(bash代码的一部分)从input.csv的选定列中提取数字,并对这些数字执行一些简单的统计操作,最终将结果保存为output.csv中的一行: awk -F ", *" ' # set field separator to comma, followed by 0 or more whitespaces FNR==1 { if (n) { # calculate the resu

以下AWK脚本(bash代码的一部分)从input.csv的选定列中提取数字,并对这些数字执行一些简单的统计操作,最终将结果保存为output.csv中的一行:

awk -F ", *" '                  # field separator: a comma followed by 0 or more spaces
# Summarise the dG column ($3) of every input file and print one line per file.
# Column $2 holds the cluster population (pop) and column $3 the dG value.
# The first line of each file is treated as a header and is not accumulated.

# Store the statistics of the file read so far under its suffix.
# Files without data rows (n == 0) are skipped, which also avoids a division by zero.
function store_results(   m, var) {
   if (!n) return
   m = s / n                              # mean
   var = s2 / n - m * m                   # variance
   if (var < 0) var = 0                   # round-off can push it slightly below zero
   if (!(suffix in mean)) order[++nkeys] = suffix   # remember first-seen order
   mean[suffix] = m
   rmsd[suffix] = sqrt(var)               # stored for later use; not printed below
   lowest[suffix] = min                   # lowest dG
   highest[suffix] = fourth               # dG in the cluster with the highest pop
}
FNR == 1 {                                # header line: a new input file starts here
   store_results()                        # finish the previous file, if any
   suffix = FILENAME
   sub(/\/[^\/]+$/, "", suffix)           # drop the trailing /name component
   sub(/^.*_/, "", suffix)                # keep what follows the last underscore
   s = 0                                  # sum of $3
   s2 = 0                                 # sum of $3 ** 2
   n = 0                                  # count of samples
}
FNR > 1 {
   dg = $3 + 0                            # force numeric comparisons
   pop = $2 + 0
   s += dg
   s2 += dg * dg
   ++n
   # The first data row of each file seeds min/max/fourth, so no sentinel value
   # is needed and nothing leaks over from the previous file.
   if (n == 1 || dg < min) min = dg       # update the lowest dG
   if (n == 1 || pop > max) {
      max = pop                           # update popMAX
      fourth = dg                         # dG that belongs to the top pop
   }
}
END {
   store_results()                        # the last file has no following header line
   print "Lig(CNE)", "dG(min)", "dG(popMAX)", "dG(mean)"
   for (k = 1; k <= nkeys; k++) {         # print in the order the files were read
      i = order[k]
      printf "%s %.2f %.2f %.2f\n", i, lowest[i], highest[i], mean[i]
   }
}'  input.csv > output.csv
最后,它将结果保存在另一个multi-column output.csv文件中,该文件包含每个已处理csv名称的一部分(对应的前缀用作行的ID),以及关于其dG(min)、dG(popMAX)以及为第三(dG)列中的所有数字计算的平均值的信息:

因此,dG(min)是input.csv中ID=1的那一行(dG最低的行)里$3(dG)的数值,而dG(popMAX)是$2(POP)数值最高的那一行中的dG值。

我需要修改脚本的AWK部分,在output.csv中增加两列,给出与每个dG值相对应的input.csv第二列(POP)的数值(dG值本身取自同一文件的第三列)。因此,对于同一份日志,输出应该是这样的:

# output.csv
Lig(CNE).   dG(min) POP(min)    dG(popMAX) POP(max) dG(mean)
lig12       -5.65   (142)       -4.12      (150)    -5.055

换句话说,除了对第三列执行的操作之外,我还需要读取第二列中的数字,并在output.csv中把它们与对应的dG值配对:POP(min)应取自dG(min)所在行的$2,POP(max)应取自dG(popMAX)所在行的$2。我尝试使用

'{print $2}'
但是结果output.csv与原始input.csv的行顺序不匹配(例如,它从不属于dG(min)等的行中取出第二列)

请尝试:

awk -F ", *" '                  # field separator: a comma followed by 0 or more spaces
# Summarise the dG column ($3) of every input file together with the matching
# population (pop, $2) and print one line per file.
# The first line of each file is treated as a header and is not accumulated.

# Store the statistics of the file read so far under its suffix.
# Files without data rows (n == 0) are skipped, which also avoids a division by zero.
function store_results(   m, var) {
   if (!n) return
   m = s / n                              # mean
   var = s2 / n - m * m                   # variance
   if (var < 0) var = 0                   # round-off can push it slightly below zero
   if (!(suffix in mean)) order[++nkeys] = suffix   # remember first-seen order
   mean[suffix] = m
   rmsd[suffix] = sqrt(var)               # stored for later use; not printed below
   lowest[suffix] = min                   # lowest dG
   highest[suffix] = fourth               # dG in the cluster with the highest pop
   pop_min[suffix] = popmin               # pop in the cluster with the lowest dG
   pop_max[suffix] = max                  # highest pop
}
FNR == 1 {                                # header line: a new input file starts here
   store_results()                        # finish the previous file, if any
   suffix = FILENAME
   sub(/\/[^\/]+$/, "", suffix)           # drop the trailing /name component
   sub(/^.*_/, "", suffix)                # keep what follows the last underscore
   s = 0                                  # sum of $3
   s2 = 0                                 # sum of $3 ** 2
   n = 0                                  # count of samples
}
FNR > 1 {
   dg = $3 + 0                            # force numeric comparisons
   pop = $2 + 0
   s += dg
   s2 += dg * dg
   ++n
   # The first data row of each file seeds min/popmin/max/fourth, so no sentinel
   # value is needed and nothing leaks over from the previous file.
   if (n == 1 || dg < min) {
      min = dg                            # update the lowest dG
      popmin = pop                        # pop on the same row as the lowest dG
   }
   if (n == 1 || pop > max) {
      max = pop                           # update popMAX
      fourth = dg                         # dG that belongs to the top pop
   }
}
END {
   store_results()                        # the last file has no following header line
   print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
   for (k = 1; k <= nkeys; k++) {         # print in the order the files were read
      i = order[k]
      printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
   }
}' input.csv
awk -F ", *" '                  # field separator: a comma followed by 0 or more spaces
# Summarise the dG column ($3) of every input file together with the matching
# population (pop, $2) and print one line per file.
# The first line of each file is treated as a header and is not accumulated.

# Store the statistics of the file read so far under its suffix.
# Files without data rows (n == 0) are skipped, which also avoids a division by zero.
function store_results(   m, var) {
   if (!n) return
   m = s / n                              # mean
   var = s2 / n - m * m                   # variance
   if (var < 0) var = 0                   # round-off can push it slightly below zero
   if (!(suffix in mean)) order[++nkeys] = suffix   # remember first-seen order
   mean[suffix] = m
   rmsd[suffix] = sqrt(var)               # stored for later use; not printed below
   lowest[suffix] = min                   # lowest dG
   highest[suffix] = fourth               # dG in the cluster with the highest pop
   pop_min[suffix] = popmin               # pop in the cluster with the lowest dG
   pop_max[suffix] = max                  # highest pop
}
FNR == 1 {                                # header line: a new input file starts here
   store_results()                        # finish the previous file, if any
   suffix = FILENAME
   sub(/\/[^\/]+$/, "", suffix)           # drop the trailing /name component
   sub(/^.*_/, "", suffix)                # keep what follows the last underscore
   s = 0                                  # sum of $3
   s2 = 0                                 # sum of $3 ** 2
   n = 0                                  # count of samples
}
FNR > 1 {
   dg = $3 + 0                            # force numeric comparisons
   pop = $2 + 0
   s += dg
   s2 += dg * dg
   ++n
   # The first data row of each file seeds min/popmin/max/fourth, so no sentinel
   # value is needed and nothing leaks over from the previous file.
   if (n == 1 || dg < min) {
      min = dg                            # update the lowest dG
      popmin = pop                        # pop on the same row as the lowest dG
   }
   if (n == 1 || pop > max) {
      max = pop                           # update popMAX
      fourth = dg                         # dG that belongs to the top pop
   }
}
END {
   store_results()                        # the last file has no following header line
   print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
   for (k = 1; k <= nkeys; k++) {         # print in the order the files were read
      i = order[k]
      printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
   }
}' input.csv
  • 与dG(popMAX)对应的最高pop值已经保存在现有变量 max 中。
  • 新引入了变量 popmin,用来保存与最低dG对应的pop值;每当 min(最低dG)被更新时,popmin 也随之更新。
顺便说一句,随着脚本被反复扩展,min、max 这类变量名已经越来越不能做到不言自明。
最好对它们重命名,和/或重构代码,以便将来维护。

您确实意识到,for (i in mean) 循环遍历关联数组的顺序与元素创建时的顺序不同吗?
您是对的!非常感谢您的评论,我刚刚编辑了AWK代码,实际上我忘了在那里添加一部分AWK脚本,它计算dG列的平均值并将值存储在数组中!它工作得很好!是的,这个问题确实是由变量以及最后遍历mean数组的循环引起的……非常感谢!!还有一个问题,这样我就可以用数千个CSV来测试脚本。假设我们现在已经有了pop_max[suffix]及其对应的dG,即highest[suffix]。如何把这一行的ID也作为一个单独的值取出来(并在输出中打印)?该ID通常位于input.csv的第一列(在该示例中为ID=4)。非常感谢!!我想你已经知道怎么做了。否则,每次需要修改脚本时您都得重新提问。请自己试试,祝你好运!如果您在尝试后遇到问题,请随时再次询问。例如,通过引入一个新数组id[suffix]=id(其中id=$1)?还是说它应该每次与其他变量一起更新?
'{print $2}'
awk -F ", *" '                  # field separator: a comma followed by 0 or more spaces
# Summarise the dG column ($3) of every input file together with the matching
# population (pop, $2) and print one line per file.
# The first line of each file is treated as a header and is not accumulated.

# Store the statistics of the file read so far under its suffix.
# Files without data rows (n == 0) are skipped, which also avoids a division by zero.
function store_results(   m, var) {
   if (!n) return
   m = s / n                              # mean
   var = s2 / n - m * m                   # variance
   if (var < 0) var = 0                   # round-off can push it slightly below zero
   if (!(suffix in mean)) order[++nkeys] = suffix   # remember first-seen order
   mean[suffix] = m
   rmsd[suffix] = sqrt(var)               # stored for later use; not printed below
   lowest[suffix] = min                   # lowest dG
   highest[suffix] = fourth               # dG in the cluster with the highest pop
   pop_min[suffix] = popmin               # pop in the cluster with the lowest dG
   pop_max[suffix] = max                  # highest pop
}
FNR == 1 {                                # header line: a new input file starts here
   store_results()                        # finish the previous file, if any
   suffix = FILENAME
   sub(/\/[^\/]+$/, "", suffix)           # drop the trailing /name component
   sub(/^.*_/, "", suffix)                # keep what follows the last underscore
   s = 0                                  # sum of $3
   s2 = 0                                 # sum of $3 ** 2
   n = 0                                  # count of samples
}
FNR > 1 {
   dg = $3 + 0                            # force numeric comparisons
   pop = $2 + 0
   s += dg
   s2 += dg * dg
   ++n
   # The first data row of each file seeds min/popmin/max/fourth, so no sentinel
   # value is needed and nothing leaks over from the previous file.
   if (n == 1 || dg < min) {
      min = dg                            # update the lowest dG
      popmin = pop                        # pop on the same row as the lowest dG
   }
   if (n == 1 || pop > max) {
      max = pop                           # update popMAX
      fourth = dg                         # dG that belongs to the top pop
   }
}
END {
   store_results()                        # the last file has no following header line
   print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
   for (k = 1; k <= nkeys; k++) {         # print in the order the files were read
      i = order[k]
      printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
   }
}' input.csv