Awk 通过搜索每行的所有列并将其写入输出文件中的指定列来提取某些模式
我是一名编程初学者,接到了一项任务:使用 awk 从文本文件的“INFO”列中提取某些字符串。代码如下:
awk -F '\t' '
  # Only lines whose first character exists and is not "#" are processed,
  # so header lines and empty lines are skipped.
  /^[^#]/ {
    # Break column 8 into its semicolon-separated entries.
    count = split($8, info, /[;]/)
    for (k = 1; k <= count; k++) {
      # Wanted entries are written in the order they occur in the input,
      # each one followed by a tab (so the row ends with a trailing tab).
      if (info[k] ~ /^CLNDN=/ ||
          info[k] ~ /^CLNREVSTAT=/ ||
          info[k] ~ /^CLNSIG=/ ||
          info[k] ~ /^CLNSIGCONF=/ ||
          info[k] ~ /^ORIGIN=/)
        printf("%s\t", info[k])
    }
    # One output row per processed input line, even when nothing matched.
    printf("\n")
  }
' test.vcf > trial.vcf
下面,您可以看到我想要得到的示例输出
CLNDN=not_provided CLNREVSTAT=criteria_provided,_single_submitter CLNSIG=Pathogenic ORIGIN=1
CLNDN=Myasthenic_syndrome,_congenital,_8|not_specified|not_provided CLNREVSTAT=criteria_provided,_conflicting_interpretations CLNSIG=Conflicting_interpretations_of_pathogenicity CLNSIGCONF=Benign(1),Likely_benign(2),Uncertain_significance(1) ORIGIN=1
第一行中 CLNSIG 和 ORIGIN 之间有一个空列,因为该行不包含 CLNSIGCONF= 信息。
这里,我想提取以 CLNDN=、CLNREVSTAT=、CLNSIG=、CLNSIGCONF= 和 ORIGIN= 开头的字符串,并将它们分别打印到输出文件的第 1-5 列。
这段代码能够提取出我感兴趣的字符串,但我卡在了如何把它们打印到指定的列中。
如果您能帮助我,我将不胜感激(并愿意接受任何建议)
非常感谢您。
编辑:如果某些行中缺少部分元素,请尝试以下代码。如果某一行中找不到任何匹配项,它还会打印一条提示语句(如果不需要该提示,请从该解决方案中删除 if(cldn=="" && ...) 块)。
awk '
BEGIN { OFS = "\t" }

# Records with no fields (blank lines) are echoed unchanged.
!NF { print; next }

{
  # For each tag keep the text from "TAG=" up to the next semicolon or the
  # end of the record; a tag absent from this record yields an empty string.
  cldn       = match($0, /CLNDN=[^;]*/)      ? substr($0, RSTART, RLENGTH) : ""
  clnrevstat = match($0, /CLNREVSTAT=[^;]*/) ? substr($0, RSTART, RLENGTH) : ""
  clnsig     = match($0, /CLNSIG=[^;]*/)     ? substr($0, RSTART, RLENGTH) : ""
  clnsisconf = match($0, /CLNSIGCONF=[^;]*/) ? substr($0, RSTART, RLENGTH) : ""
  origin     = match($0, /ORIGIN=[^;]*/)     ? substr($0, RSTART, RLENGTH) : ""

  # Remove this if block to stop reporting records that hold none of the tags.
  if (cldn == "" && clnrevstat == "" && clnsig == "" && clnsisconf == "" && origin == "") {
    print "NO matched value found in this line."
    next
  }

  # Always five tab-separated columns; a missing tag leaves its column empty.
  print cldn, clnrevstat, clnsig, clnsisconf, origin
}
' Input_file
请尝试以下代码,它是根据您给出的示例、针对 GNU awk 编写的:
awk '
# Print CLNDN, CLNREVSTAT, CLNSIG, CLNSIGCONF and ORIGIN as five tab-separated
# columns. A tag missing from a record leaves its column empty; records that
# contain none of the tags produce no output.
BEGIN{
  OFS="\t"
}
# Forget the values captured from the previous record.
{ cldn=clnrevstat=clnsig=clnsisconf=origin="" }
# Each rule below keeps the text from "TAG=" up to the next semicolon
# (or the end of the record).
match($0,/CLNDN=[^;]*/){
  cldn=substr($0,RSTART,RLENGTH)
}
match($0,/CLNREVSTAT=[^;]*/){
  clnrevstat=substr($0,RSTART,RLENGTH)
}
match($0,/CLNSIG=[^;]*/){
  clnsig=substr($0,RSTART,RLENGTH)
}
match($0,/CLNSIGCONF=[^;]*/){
  clnsisconf=substr($0,RSTART,RLENGTH)
}
match($0,/ORIGIN=[^;]*/){
  origin=substr($0,RSTART,RLENGTH)
}
# Print whenever at least one tag was found, so that a record without ORIGIN=
# is still reported instead of being silently dropped.
(cldn clnrevstat clnsig clnsisconf origin)!=""{
  print cldn,clnrevstat,clnsig,clnsisconf,origin
}
' Input_file
说明:下面是对上述代码的逐行详细说明。
awk '                                             ##Starting awk program from here.
BEGIN{                                            ##Starting BEGIN section of this program from here.
  OFS="\t"                                        ##Setting OFS as tab here.
}
{ cldn=clnrevstat=clnsig=clnsisconf=origin="" }   ##Clearing the values kept from the previous line here.
match($0,/CLNDN=[^;]*/){                          ##Using match function to match from string CLNDN= till semi colon here.
  cldn=substr($0,RSTART,RLENGTH)                  ##Creating cldn which has matched regex sub string.
}
match($0,/CLNREVSTAT=[^;]*/){                     ##Using match function to match from string CLNREVSTAT= till semi colon here.
  clnrevstat=substr($0,RSTART,RLENGTH)            ##Creating clnrevstat which has matched regex sub string here.
}
match($0,/CLNSIG=[^;]*/){                         ##Using match function to match from string CLNSIG= till semi colon here.
  clnsig=substr($0,RSTART,RLENGTH)                ##Creating clnsig which has matched regex sub string here.
}
match($0,/CLNSIGCONF=[^;]*/){                     ##Using match function to match from string CLNSIGCONF= till semi colon here.
  clnsisconf=substr($0,RSTART,RLENGTH)            ##Creating clnsisconf which has matched regex sub string here.
}
match($0,/ORIGIN=[^;]*/){                         ##Using match function to match from string ORIGIN= till semi colon here.
  origin=substr($0,RSTART,RLENGTH)                ##Creating origin which has matched regex sub string here.
}
(cldn clnrevstat clnsig clnsisconf origin)!=""{   ##Checking that at least one of the 5 values was found in this line.
  print cldn,clnrevstat,clnsig,clnsisconf,origin  ##Printing all 5 values; a value which was not found stays an empty column.
                                                  ##A line without ORIGIN= is printed too, it is no longer skipped.
}
' Input_file                                      ##Mentioning Input_file name here.
每当数据中包含 tag=value 形式的键值对时,最好先构建一个从标记到值的映射数组(见下文的 f[]),然后按标记(名称)打印所需的值:
顺便一提(FWIW),下面是我认为真正应该采用的做法:先输出一行标记名作为表头,其下每一行只输出对应的值,而不是让每一行的每个字段都同时包含标记和值:
$ cat tst.awk
# Print a header row of tag names, then one row per input record holding the
# value of each tag (tab-separated; empty when the record lacks that tag).
BEGIN {
    OFS = "\t"
    # The tag list fixes both the header text and the column order.
    n = split("CLNDN CLNREVSTAT CLNSIG CLNSIGCONF ORIGIN",tags)
    for (i=1; i<=n; i++) {
        tag = tags[i]
        # OFS after every column except the last, which is followed by ORS.
        printf "%s%s", tag, (i<n ? OFS : ORS)
    }
}
# Skip blank lines; without the NF guard each one would produce a row that
# contains nothing but separators.
NF {
    delete tag2val
    # The last field is split into its semicolon-separated tag=value entries.
    split($NF,tagVals,/;/)
    for (i in tagVals) {
        tag = val = tagVals[i]
        sub(/=.*/,"",tag)       # keep the text before the first "="
        sub(/[^=]+=/,"",val)    # drop the leading "tag="
        tag2val[tag] = val
    }
    for (i=1; i<=n; i++) {
        tag = tags[i]
        val = tag2val[tag]
        printf "%s%s", val, (i<n ? OFS : ORS)
    }
}
很好,你已经在问题中展示了自己的尝试。你能否贴出输入文件的样本,以便更好地理解问题?请更新你的问题,并把样本放在代码标记中。
我更新了帖子。
以 CLNDN、CLNREVSTAT 等字段名作为表头行、并在其下方列出对应的值,这样的输出格式难道不比每一行都同时包含字段名及其值更好吗?
非常感谢 Ed 的解决方案!但有一点我没看懂:(i<n ? OFS : ORS)
这只是一个常见的三元表达式:在每个字段后面打印 OFS,但行上最后一个字段之后打印的是 ORS。
$ cat tst.awk
BEGIN {
    OFS = "\t"
}

# For every non-blank record, index the semicolon-separated entries of the
# last field by tag name, then print the five wanted entries in a fixed
# column order. An entry missing from the record prints as an empty column.
NF {
    split("", f)                    # empty the map left from the previous record
    split($NF, entries, /;/)
    for (k in entries) {
        name = entries[k]
        sub(/=.*/, "", name)        # tag name = text before the first "="
        f[name] = entries[k]        # the whole "tag=value" text is stored
    }
    print f["CLNDN"], f["CLNREVSTAT"], f["CLNSIG"], f["CLNSIGCONF"], f["ORIGIN"]
}
$ awk -f tst.awk file
CLNDN=not_provided CLNREVSTAT=criteria_provided,_single_submitter CLNSIG=Pathogenic ORIGIN=1
CLNDN=Myasthenic_syndrome,_congenital,_8|not_specified|not_provided CLNREVSTAT=criteria_provided,_conflicting_interpretations CLNSIG=Conflicting_interpretations_of_pathogenicity CLNSIGCONF=Benign(1),Likely_benign(2),Uncertain_significance(1) ORIGIN=1
$ cat tst.awk
# Print a header row of tag names, then one row per input record holding the
# value of each tag (tab-separated; empty when the record lacks that tag).
BEGIN {
    OFS = "\t"
    # The tag list fixes both the header text and the column order.
    n = split("CLNDN CLNREVSTAT CLNSIG CLNSIGCONF ORIGIN",tags)
    for (i=1; i<=n; i++) {
        tag = tags[i]
        # OFS after every column except the last, which is followed by ORS.
        printf "%s%s", tag, (i<n ? OFS : ORS)
    }
}
# Skip blank lines; without the NF guard each one would produce a row that
# contains nothing but separators.
NF {
    delete tag2val
    # The last field is split into its semicolon-separated tag=value entries.
    split($NF,tagVals,/;/)
    for (i in tagVals) {
        tag = val = tagVals[i]
        sub(/=.*/,"",tag)       # keep the text before the first "="
        sub(/[^=]+=/,"",val)    # drop the leading "tag="
        tag2val[tag] = val
    }
    for (i=1; i<=n; i++) {
        tag = tags[i]
        val = tag2val[tag]
        printf "%s%s", val, (i<n ? OFS : ORS)
    }
}
$ awk -f tst.awk file
CLNDN CLNREVSTAT CLNSIG CLNSIGCONF ORIGIN
not_provided criteria_provided,_single_submitter Pathogenic 1
Myasthenic_syndrome,_congenital,_8|not_specified|not_provided criteria_provided,_conflicting_interpretations Conflicting_interpretations_of_pathogenicity Benign(1),Likely_benign(2),Uncertain_significance(1) 1