使用gawk处理带可选引号的CSV
我需要使用“FPAT”或gawk的等效函数“patsplit”,但CentOS服务器上安装的gawk版本似乎是3.1.5。我尝试使用以下命令更新gawk:
yum update gawk;
服务器显示:“没有标记为更新的包”
我还尝试使用以下工具重新安装gawk:
yum install gawk;
服务器输出:“包gawk-3.1.5-15.el5.x86_64已安装且为最新版本”
我需要gawk 4.0或更高版本才能使用这些FPAT或patsplit。为什么我需要使用它们?我正在尝试处理一个CSV文件,似乎CSV文件有可选的引号和嵌入的逗号
例如:
从如下所示的csv行:
this,is,a,"csv,with,embedded coma"
this
is
a
"csv,with,embedded comma"
我需要像这样拆分字段:
this,is,a,"csv,with,embedded coma"
this
is
a
"csv,with,embedded comma"
下面是gawk代码:
# Split a sample CSV record into fields with patsplit() (needs gawk >= 4.0).
# patsplit() returns the number of fields and stores them in a[1..n], so the
# loop runs from 1 to n; starting at 0 would print a spurious empty a[0].
awk '{n=patsplit("this,is,a,\"csv,with,embedded comma\"",a,"([^,]*)|(\"([^\"]|\"\")+\"[^,]*)",seps); for(i=1;i<=n;i++) print a[i];}';
我想我们可以使用match()来获取字段。
代码如下:
# Print every CSV field of each line of "file" on its own line.
# A field is either a double-quoted string (which may contain commas and
# doubled "" escaped quotes, optionally padded with spaces) or a run of
# non-comma characters. Quotes are kept in the output.
awk '{
  $0 = $0 ","                 # terminate the last field so every field ends in a comma
  while ($0 != "") {
    # Longest match at the start of the record: quoted field or plain field.
    match($0, / *("[^"]*")+ *,|[^,]*,/)
    field = substr($0, RSTART, RLENGTH)
    sub(/,$/, "", field)      # drop the terminating comma
    print field
    $0 = substr($0, RLENGTH + 1)   # continue with the rest of the record
  }
}' file
使用输入示例进行测试:
kent$ echo 'this,is,a,"csv,with,embedded coma"'|awk '{
$0=$0","
while($0) {
match($0,/ *"[^"]*" *,|[^,]*,/)
field=substr($0,RSTART,RLENGTH)
gsub(/,$/,"",field)
print field
$0=substr($0,RLENGTH+1)
}}'
this
is
a
"csv,with,embedded coma"
我想我们可以使用match()来获取字段
代码如下:
# Print every CSV field of each line of "file" on its own line.
# A field is either a double-quoted string (which may contain commas and
# doubled "" escaped quotes, optionally padded with spaces) or a run of
# non-comma characters. Quotes are kept in the output.
awk '{
  $0 = $0 ","                 # terminate the last field so every field ends in a comma
  while ($0 != "") {
    # Longest match at the start of the record: quoted field or plain field.
    match($0, / *("[^"]*")+ *,|[^,]*,/)
    field = substr($0, RSTART, RLENGTH)
    sub(/,$/, "", field)      # drop the terminating comma
    print field
    $0 = substr($0, RLENGTH + 1)   # continue with the rest of the record
  }
}' file
使用输入示例进行测试:
kent$ echo 'this,is,a,"csv,with,embedded coma"'|awk '{
$0=$0","
while($0) {
match($0,/ *"[^"]*" *,|[^,]*,/)
field=substr($0,RSTART,RLENGTH)
gsub(/,$/,"",field)
print field
$0=substr($0,RLENGTH+1)
}}'
this
is
a
"csv,with,embedded coma"
最简单的方法是在进行实际处理之前,将引号外的逗号转换为其他字符。例如:
$ cat file
this,is,a,"csv,with,embedded coma",and,here,"is,another",one
and,here,"is,another,line"
$
$ awk 'BEGIN{FS=OFS="\""}{for (i=1;i<=NF;i+=2) gsub(/,/,";",$i)}1' file
this;is;a;"csv,with,embedded coma";and;here;"is,another";one
and;here;"is,another,line"
$ cat file
this,is,a,"csv,with,embedded coma",and,here,"is,another",one
and,here,"is,another,line"
$
最简单的方法是在进行实际处理之前,将引号外的逗号转换为其他字符。例如:
$ cat file
this,is,a,"csv,with,embedded coma",and,here,"is,another",one
and,here,"is,another,line"
$
$ awk 'BEGIN{FS=OFS="\""}{for (i=1;i<=NF;i+=2) gsub(/,/,";",$i)}1' file
this;is;a;"csv,with,embedded coma";and;here;"is,another";one
and;here;"is,another,line"
$ cat file
this,is,a,"csv,with,embedded coma",and,here,"is,another",one
and,here,"is,another,line"
$
尝试在管道中使用csvquote,使数据易于awk解释。这是我编写的脚本,它用非打印字符替换带引号字段中的逗号,处理完成后再将其还原。
因此,如果您的awk命令最初看起来像这样:
awk -F, '{print $3 "," $5}' inputfile.csv
…可以改成如下形式,使其能够正确处理CSV中带引号字段内的分隔符:
csvquote inputfile.csv | awk -F, '{print $3 "," $5}' | csvquote -u
有关代码和更多文档,请参见csvquote的项目页面。
尝试在管道中使用csvquote,以使awk易于解释数据。这是我编写的脚本,它用非打印字符替换带引号字段中的逗号,处理完成后再将其还原。
因此,如果您的awk命令最初看起来像这样:
awk -F, '{print $3 "," $5}' inputfile.csv
…可以改成如下形式,使其能够正确处理CSV中带引号字段内的分隔符:
csvquote inputfile.csv | awk -F, '{print $3 "," $5}' | csvquote -u
有关代码和更多文档,请参见csvquote的项目页面。
以下是一个纯GAWK解决方案:
# For every input line, print the line and then its comma-separated fields,
# numbered from 1. Double-quoted fields may contain commas and are printed
# with their quotes. Uses gawk's and(). Escaped quotes ("") inside a quoted
# field are not handled: the line is simply split on every double quote.
{ # Split on double quotes to handle lines like "this, this, or this".
    printf("LINE: '%s'\nFIELDS:", $0)
    n = split($0,q,/"/)
    f = 0   # running field number for lines that contain quotes
}
n == 1 { # If n is 1, there are no double quotes on the line.
    n = split($0,c,/,/)
    for (i = 1; i <= n; i++) {
        printf(" %d='%s'", i, c[i])
    }
    printf("\n")
    next
}
{ # There are "strings"; the EVEN entries in q are the quoted strings.
    for (i = 1; i <= n; i++) {
        if (0 == and(i,1)) { # i is EVEN: This is a double-quoted string.
            printf(" %d='\"%s\"'", ++f, q[i])
            continue
        }
        if (0 == length(q[i])) { # Line starts/ends with a quoted string.
            continue
        }
        if (q[i] == "," && i > 1 && i < n) {
            # A lone comma between two quoted strings holds no field.
            continue
        }
        # Remove only the commas that border a quoted string: the leading
        # comma when a quoted string precedes this piece, the trailing comma
        # when one follows it. Other commas delimit (possibly empty) fields.
        if (i > 1) {
            sub(/^,/,"",q[i])
        }
        if (i < n) {
            sub(/,$/,"",q[i])
        }
        if (0 == length(q[i])) {
            # Nothing left: exactly one empty field. split() would return 0
            # for an empty string and silently drop it.
            printf(" %d=''", ++f)
            continue
        }
        m = split(q[i],cq,/,/)
        for (j = 1; j <= m; j++) {
            printf(" %d='%s'", ++f, cq[j])
        }
    }
    printf("\n")
}
该输出是:
LINE: 'This is one,23,$9.32,Another string.'
FIELDS: 1='This is one' 2='23' 3='$9.32' 4='Another string.'
LINE: 'Line 2,234,$88.34,Blah blah'
FIELDS: 1='Line 2' 2='234' 3='$88.34' 4='Blah blah'
LINE: '"This is another",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is another"' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '"This is, perhaps, trouble too...",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is, perhaps, trouble too..."' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '2,"This is, perhaps, trouble too...",763,"trouble, or not?"'
FIELDS: 1='2' 2='"This is, perhaps, trouble too..."' 3='763' 4='"trouble, or not?"'
LINE: '3,,"number, number","well?"'
FIELDS: 1='3' 2='' 3='"number, number"' 4='"well?"'
LINE: ',,,'
FIELDS: 1='' 2='' 3='' 4=''
LINE: '"1,one","2,two","3,three","4,four"'
FIELDS: 1='"1,one"' 2='"2,two"' 3='"3,three"' 4='"4,four"'
LINE: '",commas,","no commas",",,,,,",'
FIELDS: 1='",commas,"' 2='"no commas"' 3='",,,,,"' 4=''
LINE: ',"Fields 1 and 4 are empty","But 2 and 3 are not",'
FIELDS: 1='' 2='"Fields 1 and 4 are empty"' 3='"But 2 and 3 are not"' 4=''
下面是一个纯粹的GAWK解决方案:
# For every input line, print the line and then its comma-separated fields,
# numbered from 1. Double-quoted fields may contain commas and are printed
# with their quotes. Uses gawk's and(). Escaped quotes ("") inside a quoted
# field are not handled: the line is simply split on every double quote.
{ # Split on double quotes to handle lines like "this, this, or this".
    printf("LINE: '%s'\nFIELDS:", $0)
    n = split($0,q,/"/)
    f = 0   # running field number for lines that contain quotes
}
n == 1 { # If n is 1, there are no double quotes on the line.
    n = split($0,c,/,/)
    for (i = 1; i <= n; i++) {
        printf(" %d='%s'", i, c[i])
    }
    printf("\n")
    next
}
{ # There are "strings"; the EVEN entries in q are the quoted strings.
    for (i = 1; i <= n; i++) {
        if (0 == and(i,1)) { # i is EVEN: This is a double-quoted string.
            printf(" %d='\"%s\"'", ++f, q[i])
            continue
        }
        if (0 == length(q[i])) { # Line starts/ends with a quoted string.
            continue
        }
        if (q[i] == "," && i > 1 && i < n) {
            # A lone comma between two quoted strings holds no field.
            continue
        }
        # Remove only the commas that border a quoted string: the leading
        # comma when a quoted string precedes this piece, the trailing comma
        # when one follows it. Other commas delimit (possibly empty) fields.
        if (i > 1) {
            sub(/^,/,"",q[i])
        }
        if (i < n) {
            sub(/,$/,"",q[i])
        }
        if (0 == length(q[i])) {
            # Nothing left: exactly one empty field. split() would return 0
            # for an empty string and silently drop it.
            printf(" %d=''", ++f)
            continue
        }
        m = split(q[i],cq,/,/)
        for (j = 1; j <= m; j++) {
            printf(" %d='%s'", ++f, cq[j])
        }
    }
    printf("\n")
}
该输出是:
LINE: 'This is one,23,$9.32,Another string.'
FIELDS: 1='This is one' 2='23' 3='$9.32' 4='Another string.'
LINE: 'Line 2,234,$88.34,Blah blah'
FIELDS: 1='Line 2' 2='234' 3='$88.34' 4='Blah blah'
LINE: '"This is another",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is another"' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '"This is, perhaps, trouble too...",763,$0.00,"trouble, or not?"'
FIELDS: 1='"This is, perhaps, trouble too..."' 2='763' 3='$0.00' 4='"trouble, or not?"'
LINE: '2,"This is, perhaps, trouble too...",763,"trouble, or not?"'
FIELDS: 1='2' 2='"This is, perhaps, trouble too..."' 3='763' 4='"trouble, or not?"'
LINE: '3,,"number, number","well?"'
FIELDS: 1='3' 2='' 3='"number, number"' 4='"well?"'
LINE: ',,,'
FIELDS: 1='' 2='' 3='' 4=''
LINE: '"1,one","2,two","3,three","4,four"'
FIELDS: 1='"1,one"' 2='"2,two"' 3='"3,three"' 4='"4,four"'
LINE: '",commas,","no commas",",,,,,",'
FIELDS: 1='",commas,"' 2='"no commas"' 3='",,,,,"' 4=''
LINE: ',"Fields 1 and 4 are empty","But 2 and 3 are not",'
FIELDS: 1='' 2='"Fields 1 and 4 are empty"' 3='"But 2 and 3 are not"' 4=''
如果您有权访问yum(现在应称为dnf),请参阅
然后您可以作为普通用户运行
# Fetch, build and install gawk from its git repository.
# The steps are chained with && so a failed clone or cd does not run
# configure/make/install in the wrong directory.
git clone https://git.savannah.gnu.org/git/gawk.git &&
  cd gawk &&
  ./configure &&
  make &&
  sudo make install
这样,对于内容如下的输入文件tmp.txt:
this,is,a,"csv,with,embedded coma"
你的问题用下面的命令就可以轻松解决:
# Split every line of tmp.txt into CSV fields with patsplit() (gawk >= 4.0).
# The record itself ($0) is split rather than a hard-coded sample string, and
# the loop runs over a[1..n] because patsplit() numbers the fields from 1.
gawk '{n=patsplit($0,a,"([^,]*)|(\"([^\"]|\"\")+\"[^,]*)",seps); for(i=1;i<=n;i++) print a[i];}' tmp.txt
如果您有权访问yum(现在应称为dnf),请参阅
然后您可以作为普通用户运行
# Fetch, build and install gawk from its git repository.
# The steps are chained with && so a failed clone or cd does not run
# configure/make/install in the wrong directory.
git clone https://git.savannah.gnu.org/git/gawk.git &&
  cd gawk &&
  ./configure &&
  make &&
  sudo make install
这样,对于内容如下的输入文件tmp.txt:
this,is,a,"csv,with,embedded coma"
你的问题用下面的命令就可以轻松解决:
# Split every line of tmp.txt into CSV fields with patsplit() (gawk >= 4.0).
# The record itself ($0) is split rather than a hard-coded sample string, and
# the loop runs over a[1..n] because patsplit() numbers the fields from 1.
gawk '{n=patsplit($0,a,"([^,]*)|(\"([^\"]|\"\")+\"[^,]*)",seps); for(i=1;i<=n;i++) print a[i];}' tmp.txt
哦,+1!在以引号分隔的字段中每隔一个字段做替换,这个办法真妙!我要把它收藏起来。:)