awk:使用多列数据的操作
以下 AWK 脚本(bash 代码的一部分)从 input.csv 的选定列中提取数字,对这些数字执行一些简单的统计操作,最终将结果保存为 output.csv 中的一行:
# Summarise the dG column ($3) of every CSV named on the command line and
# write one report row per file label to output.csv.
awk -F ", *" ' # fields are split on a comma followed by zero or more spaces
# Record the statistics gathered for the file that has just been read.
# Called when the next file starts and once more from END, so the logic
# lives in one place instead of being duplicated.
function flush_file(    m, var) {
    if (!n) return                           # no data rows: nothing to store
    if (!(suffix in mean))                   # remember first-seen order of labels
        order[++nlabels] = suffix
    m   = s / n                              # mean of $3
    var = s2 / n - m * m                     # variance of $3
    if (var < 0) var = 0                     # round-off can push it below zero
    mean[suffix]    = m
    rmsd[suffix]    = sqrt(var)              # computed but not printed here
    lowest[suffix]  = min                    # lowest dG
    highest[suffix] = fourth                 # dG of the row with the highest $2
}
FNR == 1 {                                   # header line of each input file
    flush_file()
    # Label: FILENAME without its last path component, then without
    # everything up to and including the last underscore.
    suffix = FILENAME
    sub(/\/[^\/]+$/, "", suffix)
    sub(/^.*_/, "", suffix)
    s = s2 = n = 0                           # sum, sum of squares, sample count
    min = max = fourth = ""                  # reset so nothing leaks across files
}
FNR > 1 && NF >= 3 {                         # data rows; blank/short lines skipped
    s  += $3
    s2 += $3 * $3
    ++n
    # Seed both extremes from the first data row instead of assuming
    # that every dG is negative and every POP is positive.
    if (n == 1 || $3 + 0 < min)
        min = $3 + 0
    if (n == 1 || $2 + 0 > max) {
        max    = $2 + 0                      # highest POP so far
        fourth = $3 + 0                      # dG found on that same row
    }
}
END {
    flush_file()                             # the last file has no following FNR==1
    print "Lig(CNE)", "dG(min)", "dG(popMAX)", "dG(mean)"
    # Walk the labels in the order they were first seen; a plain
    # for-in loop visits array keys in an unspecified order.
    for (k = 1; k <= nlabels; k++) {
        i = order[k]
        printf "%s %.2f %.2f %.2f\n", i, lowest[i], highest[i], mean[i]
    }
}' input.csv > output.csv
最后,它将结果保存在另一个multi-column output.csv文件中,该文件包含每个已处理csv名称的一部分(对应的前缀用作行的ID),以及关于其dG(min)、dG(popMAX)以及为第三(dG)列中的所有数字计算的平均值的信息:
因此,dG(min) 是 input.csv 中 ID=1 的那一行(dG 最低的行)里 $3(dG)的数值,而 dG(popMAX) 是 $2(POP)最高的那一行中的 dG 值。
我需要修改脚本的 AWK 部分,在 output.csv 中额外添加两列,给出与每个 dG 值相对应的 input.csv 第二列(POP)的数值(dG 值本身取自同一日志的第三列)。因此,同一日志的输出应如下所示:
# output.csv
Lig(CNE). dG(min) POP(min) dG(popMAX) POP(max) dG(mean)
lig12 -5.65 (142) -4.12 (150) −5.055
换句话说,除了对第三列执行的操作之外,我还需要考虑第二列中的数字,并在 output.csv 中把它们对应起来:POP(min) 应取自 dG(min) 所在行的 $2,POP(max) 应取自 dG(popMAX) 所在行的 $2。我尝试使用
'{print $2}'
但是得到的 output.csv 与原始 input.csv 的行不对应(例如,它取出的第二列并不属于 dG(min) 等所在的行)。请尝试:
# Summarise the dG column ($3) of every CSV named on the command line and
# print one report row per file label, including the POP ($2) value found
# on the row of the lowest dG and the highest POP itself.
awk -F ", *" ' # fields are split on a comma followed by zero or more spaces
# Record the statistics gathered for the file that has just been read.
# Called when the next file starts and once more from END, so the logic
# lives in one place instead of being duplicated.
function flush_file(    m, var) {
    if (!n) return                           # no data rows: nothing to store
    if (!(suffix in mean))                   # remember first-seen order of labels
        order[++nlabels] = suffix
    m   = s / n                              # mean of $3
    var = s2 / n - m * m                     # variance of $3
    if (var < 0) var = 0                     # round-off can push it below zero
    mean[suffix]    = m
    rmsd[suffix]    = sqrt(var)              # computed but not printed here
    lowest[suffix]  = min                    # lowest dG
    highest[suffix] = fourth                 # dG of the row with the highest $2
    pop_min[suffix] = popmin                 # $2 of the row with the lowest dG
    pop_max[suffix] = max                    # highest $2
}
FNR == 1 {                                   # header line of each input file
    flush_file()
    # Label: FILENAME without its last path component, then without
    # everything up to and including the last underscore.
    suffix = FILENAME
    sub(/\/[^\/]+$/, "", suffix)
    sub(/^.*_/, "", suffix)
    s = s2 = n = 0                           # sum, sum of squares, sample count
    min = max = fourth = popmin = ""         # reset so nothing leaks across files
}
FNR > 1 && NF >= 3 {                         # data rows; blank/short lines skipped
    s  += $3
    s2 += $3 * $3
    ++n
    # Seed both extremes from the first data row instead of assuming
    # that every dG is negative and every POP is positive.
    if (n == 1 || $3 + 0 < min) {
        min    = $3 + 0                      # lowest dG so far
        popmin = $2 + 0                      # POP found on that same row
    }
    if (n == 1 || $2 + 0 > max) {
        max    = $2 + 0                      # highest POP so far
        fourth = $3 + 0                      # dG found on that same row
    }
}
END {
    flush_file()                             # the last file has no following FNR==1
    print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
    # Walk the labels in the order they were first seen; a plain
    # for-in loop visits array keys in an unspecified order.
    for (k = 1; k <= nlabels; k++) {
        i = order[k]
        printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
    }
}' input.csv
# Summarise the dG column ($3) of every CSV named on the command line and
# print one report row per file label, including the POP ($2) value found
# on the row of the lowest dG and the highest POP itself.
# (This copy had been corrupted by machine translation of its keywords,
# identifiers and quotes; it is restored here as runnable awk.)
awk -F ", *" ' # fields are split on a comma followed by zero or more spaces
# Record the statistics gathered for the file that has just been read.
# Called when the next file starts and once more from END, so the logic
# lives in one place instead of being duplicated.
function flush_file(    m, var) {
    if (!n) return                           # no data rows: nothing to store
    if (!(suffix in mean))                   # remember first-seen order of labels
        order[++nlabels] = suffix
    m   = s / n                              # mean of $3
    var = s2 / n - m * m                     # variance of $3
    if (var < 0) var = 0                     # round-off can push it below zero
    mean[suffix]    = m
    rmsd[suffix]    = sqrt(var)              # computed but not printed here
    lowest[suffix]  = min                    # lowest dG
    highest[suffix] = fourth                 # dG of the row with the highest $2
    pop_min[suffix] = popmin                 # $2 of the row with the lowest dG
    pop_max[suffix] = max                    # highest $2
}
FNR == 1 {                                   # header line of each input file
    flush_file()
    # Label: FILENAME without its last path component, then without
    # everything up to and including the last underscore.
    suffix = FILENAME
    sub(/\/[^\/]+$/, "", suffix)
    sub(/^.*_/, "", suffix)
    s = s2 = n = 0                           # sum, sum of squares, sample count
    min = max = fourth = popmin = ""         # reset so nothing leaks across files
}
FNR > 1 && NF >= 3 {                         # data rows; blank/short lines skipped
    s  += $3
    s2 += $3 * $3
    ++n
    # Seed both extremes from the first data row instead of assuming
    # that every dG is negative and every POP is positive.
    if (n == 1 || $3 + 0 < min) {
        min    = $3 + 0                      # lowest dG so far
        popmin = $2 + 0                      # POP found on that same row
    }
    if (n == 1 || $2 + 0 > max) {
        max    = $2 + 0                      # highest POP so far
        fourth = $3 + 0                      # dG found on that same row
    }
}
END {
    flush_file()                             # the last file has no following FNR==1
    print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
    # Walk the labels in the order they were first seen; a plain
    # for-in loop visits array keys in an unspecified order.
    for (k = 1; k <= nlabels; k++) {
        i = order[k]
        printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
    }
}' input.csv
- 与 dG(popMAX) 关联的最高 pop 已保存在现有变量 max 中。
- 已引入变量 popmin 来保存与最低 dG 相关联的 pop 值;只要 min(最低 dG)被更新,popmin 就随之更新。
- 由于脚本被反复扩展,min、max 这类变量名越来越不能不言自明。也许最好重命名它们和/或重构代码,以便将来维护。
您是否意识到,for (i in mean) 循环遍历关联数组的顺序可能与元素的创建顺序不同?
您是对的!非常感谢您的评论,我刚刚编辑了 AWK 代码,实际上我忘了在那里添加一部分 AWK 脚本,它计算 dG 列的平均值并将值存储在数组中!它工作得很好!是的,这个问题确实是由变量以及最后遍历 mean 数组的循环造成的……非常感谢!!
还有一个问题,以便我可以用数千个 CSV 测试脚本:假设我们现在已经有了 pop_max[suffix] 及其对应的 dG(即 highest[suffix])。如何把这一行的 ID 也作为一个单独的值取出来(并在输出中打印)?该 ID 通常位于 input.csv 的第一列(在该示例中为 ID=4)。非常感谢!!
我想你已经知道怎么做了。否则,每次需要修改脚本时,您都得重新提问。请自己试试,祝你好运!如果您在尝试后遇到问题,请随时再次询问。
例如,通过引入一个新变量 id[suffix]=id,其中 id=$1,还是说它应该和其他变量一样每次更新?
'{print $2}'
# Summarise the dG column ($3) of every CSV named on the command line and
# print one report row per file label, including the POP ($2) value found
# on the row of the lowest dG and the highest POP itself.
awk -F ", *" ' # fields are split on a comma followed by zero or more spaces
# Record the statistics gathered for the file that has just been read.
# Called when the next file starts and once more from END, so the logic
# lives in one place instead of being duplicated.
function flush_file(    m, var) {
    if (!n) return                           # no data rows: nothing to store
    if (!(suffix in mean))                   # remember first-seen order of labels
        order[++nlabels] = suffix
    m   = s / n                              # mean of $3
    var = s2 / n - m * m                     # variance of $3
    if (var < 0) var = 0                     # round-off can push it below zero
    mean[suffix]    = m
    rmsd[suffix]    = sqrt(var)              # computed but not printed here
    lowest[suffix]  = min                    # lowest dG
    highest[suffix] = fourth                 # dG of the row with the highest $2
    pop_min[suffix] = popmin                 # $2 of the row with the lowest dG
    pop_max[suffix] = max                    # highest $2
}
FNR == 1 {                                   # header line of each input file
    flush_file()
    # Label: FILENAME without its last path component, then without
    # everything up to and including the last underscore.
    suffix = FILENAME
    sub(/\/[^\/]+$/, "", suffix)
    sub(/^.*_/, "", suffix)
    s = s2 = n = 0                           # sum, sum of squares, sample count
    min = max = fourth = popmin = ""         # reset so nothing leaks across files
}
FNR > 1 && NF >= 3 {                         # data rows; blank/short lines skipped
    s  += $3
    s2 += $3 * $3
    ++n
    # Seed both extremes from the first data row instead of assuming
    # that every dG is negative and every POP is positive.
    if (n == 1 || $3 + 0 < min) {
        min    = $3 + 0                      # lowest dG so far
        popmin = $2 + 0                      # POP found on that same row
    }
    if (n == 1 || $2 + 0 > max) {
        max    = $2 + 0                      # highest POP so far
        fourth = $3 + 0                      # dG found on that same row
    }
}
END {
    flush_file()                             # the last file has no following FNR==1
    print "Lig(CNE)", "dG(min)", "POP(dGmin)", "dG(popMAX)", "POP(max)", "dG(mean)"
    # Walk the labels in the order they were first seen; a plain
    # for-in loop visits array keys in an unspecified order.
    for (k = 1; k <= nlabels; k++) {
        i = order[k]
        printf "%s %.2f (%d) %.2f (%d) %.2f\n", i, lowest[i], pop_min[i], highest[i], pop_max[i], mean[i]
    }
}' input.csv