C 数据提取-这个正则表达式可以做得更好吗?
我有一个C程序,用于解码来自APRS-IS服务器的数据。它在GNU/Linux机器上运行良好。我创建了下面这个正则表达式来提取天气数据,但它很长。下面是示例数据记录和正则表达式:(标签:c, regex, gnu)
数据记录
KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,N6LXX-10:!3410.50N/11828.90W_182/009g012t070P000h30b10220V126OTW1
KM6AHX-12>APOTU0,N6EX-5,qAR,N6LXX-10:!3411.20N/11813.02W_264/002g010t062p001h61T2WX
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_189/010g008t061p001h59T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,K6LOT-10:!3410.50N/11828.90W_127/008g014t070P000h30b10220V127OTW1
K6OUA-11>APOTW1,WA6ZSN,WIDE2,qAR,N6LXX-10:!3417.39N/11849.36W_225/003g005t066V133P000h45b10138OTW1
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_234/005g008t060p001h59T2WX
AD6NH>APJYC1,TCPIP*,qAC,T2CAWEST:=3352.28N/11749.75W_000/000t065h48b10206 /A=259 https://www.ka2ddo.org/ka2ddo/YAAC.html
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_170/004g013t060p001h60T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,N6LXX-10:!3410.50N/11828.90W_120/005g012t069P000h30b10220V127OTW1
K9COE-11>APOTW1,W6SCE-10,qAR,N6LXX-10:!3414.63N/11846.70W_105/007g007t065P035h51b10191OTW1
KM6AHX-12>APOTU0,N6EX-5*,qAR,K6LOT-10:!3411.20N/11813.02W_002/001g013t060p001h60T2WX
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_358/003g013t060p001h60T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,K6LOT-10:!3410.50N/11828.90W_115/004g013t069P000h30b10220V126OTW1
Regex
":[!=][0-9.NS]*/[0-9.EW]*_([0-9]{3})/([0-9]{3})([tphbcsLls#grPV][0-9 .]{2,5})?"
"([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
"([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
"([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
"([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
"([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?.*$"
给定的数据记录不一定包含所有可能的数据类型([tphbcsLls#grPV]),各字段出现的顺序也没有保证
有更好的方法吗?现在这种写法看起来有点像蛮力解法
Chuck Bland
您可以将解析分为两个步骤:
1. 用一个正则表达式把所有x123类型的模式归入单个捕获组;
2. 再把这些x123类型的模式拆分为单独的项。
第一步使用的正则表达式:
":[!=][0-9.NS]*\/[0-9.EW]*_([0-9]{3})\/([0-9]{3})((?:[tphbcsLls#grPV][0-9 .]{2,5})+)"
正则表达式的解释:
-期望模式,用于选择正确的记录类型 :[!=][0-9.NS]*\/[0-9.EW]*_
-捕获组1([0-9]{3})
-斜杠\/
-捕获组2([0-9]{3})
-捕获组3开始(
-非捕获组启动(?:
-预期模式(重复)[tphbcsLls#grPV][0-9 .]{2,5}
-非捕获组结束,重复此操作1+次)+
-捕获组3结束)
对于输入记录KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k,匹配结果为:
-捕获组1“217”
-捕获组2“010”
-捕获组3“g015t047r000p000P025h76b10078”
"(?=[tphbcsLls#grPV])"
拆分正则表达式的解释:
-正向前瞻:(?=
-这些字符之一[tphbcsLls#grPV]
-结束正向前瞻)
["g015", "t047", "r000", "p000", "P025", "h76", "b10078"]
您也可以使用match而不是split来获取项目数组:
/[tphbcsLls#grPV][^tphbcsLls#grPV]*/g
match regex解释:
-扫描起始字母[tphbcsLls#grPV]
-抓取所有不是起始字母的字母[^tphbcsLls#grPV]*
-使用全局标志g,重复上述步骤直到扫描完整个字符串
["g015", "t047", "r000", "p000", "P025", "h76", "b10078"]
- 我的答案与彼得的答案相似。
首先将“数据”提取为一个长字符串,然后查找其中的所有子数据
我已经用java实现了它
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Two-step extraction of APRS weather fields.
 *
 * PATTERN1 captures the two three-digit numbers after '_' (groups 1 and 2) and
 * the whole run of weather fields (group 3); PATTERN2 then walks group 3 and
 * yields one field per find().
 *
 * NOTE(review): this listing was restored from a machine-translated copy.
 * The leading/trailing ".*" and the '_' separator in PATTERN1, the space in
 * the "[0-9 .]" classes, and the exact "doesn't match" wording were
 * reconstructed so that matches() reproduces the printed output -- confirm
 * against the original answer.
 */
public class So66537099 {
    public static void main(String[] args) {
        final String[] lines = (
            "KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k\n" +
            "..."
        ).split("\n");
        final Pattern PATTERN1 = Pattern.compile(".*:[!=][0-9.NS]*/[0-9.EW]*_([0-9]{3})/([0-9]{3})((?:[tphbcsLl#grPV][0-9 .]{2,5})*).*");
        final Pattern PATTERN2 = Pattern.compile("[tphbcsLl#grPV][0-9 .]{2,5}");
        for (final String line : lines) {
            System.out.println("line = " + line);
            // matches() requires the pattern to cover the entire line.
            final Matcher m1 = PATTERN1.matcher(line);
            if (m1.matches()) {
                System.out.println("matches");
                System.out.println("m1.group(1) = " + m1.group(1));
                System.out.println("m1.group(2) = " + m1.group(2));
                final String data = m1.group(3);
                System.out.println("m1.group(3) = " + data);
                if (!data.isEmpty()) {
                    // Second pass: pull the individual fields out of group 3.
                    final Matcher m2 = PATTERN2.matcher(data);
                    while (m2.find()) {
                        System.out.println("... m2.group() = " + m2.group());
                    }
                }
            } else {
                System.out.println("doesn't match");
            }
        }
        System.out.println();
    }
}
输出:
line = KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k
matches
m1.group(1) = 217
m1.group(2) = 010
m1.group(3) = g015t047r000p000P025h76b10078
... m2.group() = g015
... m2.group() = t047
... m2.group() = r000
... m2.group() = p000
... m2.group() = P025
... m2.group() = h76
... m2.group() = b10078
...
以下是我根据Peter Thoeney的意见得出的结论
// gcc -Wall -std=c99 -o RME RME.c && ./RME
// Source: https://gist.github.com/ianmackinnon/3294587
#include <stdio.h>
#include <string.h>
#include <regex.h>
// Length of groupArray as passed to regexec(): one slot for the whole match
// plus one per capture group (regexString1 has three capture groups).
#define NUMBER_OF_GROUPS 4 //groups in your regex + 1
// Number of rows in WXDataArray, i.e. the most weather fields main() stores.
#define NUMBER_OF_WX_GROUPS 14
// Holds a NUL-terminated copy of the text captured by group 3 of regexString1.
char WXsource[64];
// Sample APRS weather packet used as the input record.
char source[] = "KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE2-1,qAO,WEBER"
":!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k";
// Pattern 1: matches from the ':' through the end of the line. Groups 1 and 2
// are the two three-digit numbers on either side of the '/' that follows '_';
// group 3 is the run of field letters, digits, spaces and dots; the final
// ".{4}$" requires exactly four more characters before the end of the line.
char *regexString1 = ":[!=][0-9.NS]*.[0-9.EW]*_([0-9]{3})/([0-9]{3})([tphbcsLls#grPV0-9 .]+).{4}$";
// Pattern 2: one field -- a field-identifier letter followed by as many
// characters as possible that are not field-identifier letters.
char *regexString2 = "([tphbcsLls#grPV][^tphbcsLls#grPV]*)";
// One NUL-terminated field string per row, filled in by main().
char WXDataArray[NUMBER_OF_WX_GROUPS][16];
// Number of rows of WXDataArray that main() has filled.
int numberDecodedGroups=0;
// Offset into WXsource at which the next pattern-2 search begins.
int sourceOffset=0;
// Scratch: start offset of the most recent match.
int start;
// Scratch: byte length of the most recent match.
int count;
// Compiled forms of regexString1 and regexString2.
regex_t regexCompiled1;
regex_t regexCompiled2;
// Match offsets written by regexec(); shared by both patterns.
regmatch_t groupArray[NUMBER_OF_GROUPS];
int main ()
{
//the first regex matches the coords, wind data,
//the entire string of weather data, and weather station ID.
//It captures the wind data and weather data.
if (regcomp(®exCompiled1, regexString1, REG_EXTENDED|REG_NEWLINE))
{
printf("Could not compile regular expression 1.\n");
return(1);
}
//The second regex parses the weather data into the individual
//items. It requires multiple calls to accomplish the process.
if (regcomp(®exCompiled2, regexString2, REG_EXTENDED|REG_NEWLINE))
{
printf("Could not compile regular expression 2.\n");
return(1);
}
//first extraction. The weather data is in group 3.
regexec(®exCompiled1, source, NUMBER_OF_GROUPS, groupArray, 0);
start = groupArray[3].rm_so; //start of weather data
count = groupArray[3].rm_eo-start; //bytes of weather data
//create a null terminated string of the weather data
memcpy(&WXsource[0], &source[start], count);
WXsource[count]=0;
//this loop iterates for each entry in the weather data. With each loop
//the starting point is incremented by the length of the data just
//extracted. Each string is null terminated in an array.
//the regex looks for a character of one the field identifiers followed
//by as many characters it can grab that are NOT field identifiers.
for(int matchIndex=0; matchIndex < NUMBER_OF_WX_GROUPS; matchIndex++)
{
//find the data item. sourceOffset moves the beginning of the source
//string by the size of the previous extracted data item.
if (regexec(®exCompiled2, &WXsource[sourceOffset], NUMBER_OF_GROUPS, groupArray, 0))
break;
//start of entry. SHould always be 0
start = groupArray[1].rm_so;
//eo ends up being the count.
count = groupArray[1].rm_eo;
//copy the sub-string to the output array
memcpy(&WXDataArray[matchIndex][0], &WXsource[sourceOffset], count);
//add the null termination
WXDataArray[matchIndex][count]=0;
//increment sourceOffset
sourceOffset += groupArray[1].rm_eo;
//increment the number of fields extracted
numberDecodedGroups++;
}
for(int Index = 0; Index < numberDecodedGroups; Index++)
printf("%s\n", &WXDataArray[Index][0]);
return(0);
}
// gcc -Wall -std=c99 -o RME RME.c && ./RME
// Source: https://gist.github.com/ianmackinnon/3294587
#include <stdio.h>
#include <string.h>
#include <regex.h>
// Length of groupArray: one slot for the whole match plus one per capture
// group (regexString1 has three capture groups).
#define NUMBER_OF_GROUPS 4
// Number of rows in WXDataArray.
#define NUMBER_OF_WX_GROUPS 14
// Holds a NUL-terminated copy of the text captured by group 3 of regexString1.
char WXsource[64];
// Sample APRS weather packet used as the input record.
char source[] = "KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE2-1,qAO,WEBER"
":!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k";
// Pattern 1: groups 1 and 2 are the two three-digit numbers after '_';
// group 3 is the run of weather fields; ".{4}$" requires exactly four more
// characters before the end of the line.
char *regexString1 = ":[!=][0-9.NS]*.[0-9.EW]*_([0-9]{3})/([0-9]{3})([tphbcsLls#grPV0-9 .]+).{4}$";
// Pattern 2: a field-identifier letter followed by any characters that are
// not field-identifier letters.
char *regexString2 = "([tphbcsLls#grPV][^tphbcsLls#grPV]*)";
// One NUL-terminated field string per row.
char WXDataArray[NUMBER_OF_WX_GROUPS][16];
int numberDecodedGroups=0;
int sourceOffset=0;
int start;
int count;
regex_t regexCompiled1;
regex_t regexCompiled2;
// Match offsets written by regexec().
regmatch_t groupArray[NUMBER_OF_GROUPS];
int main()
{
//第一个正则表达式匹配坐标、风数据、,
//整个天气数据字符串和气象站ID。
//它捕获风数据和天气数据。
if(regcomp(®exCompiled1、regexString1、REG_EXTENDED、REG_NEWLINE))
{
printf(“无法编译正则表达式1。\n”);
申报表(1);
}
//第二个正则表达式将天气数据解析为单个
//需要多次调用才能完成此过程。
if(regcomp(®exCompiled2、regexString2、REG_EXTENDED | REG_NEWLINE))
{
printf(“无法编译正则表达式2。\n”);
申报表(1);
}
//第一次提取。天气数据在第3组中。
regexec(®exCompiled1,源,组数,组