C 数据提取-这个正则表达式可以做得更好吗？_C_Regex_Gnu

C 数据提取-这个正则表达式可以做得更好吗？

c regex

C 数据提取-这个正则表达式可以做得更好吗？,c,regex,gnu,C,Regex,Gnu,我有一个C程序，正在解码来自APRSIS服务器的数据。它在GNU/LINUX机器上运行良好我创建了这个正则表达式来提取天气数据。很长。下面是一个示例数据记录和正则表达式：数据记录 KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,N6LXX-10

我有一个C程序，正在解码来自APRSIS服务器的数据。它在GNU/LINUX机器上运行良好

我创建了这个正则表达式来提取天气数据。很长。下面是一个示例数据记录和正则表达式：

数据记录

KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,N6LXX-10:!3410.50N/11828.90W_182/009g012t070P000h30b10220V126OTW1
KM6AHX-12>APOTU0,N6EX-5,qAR,N6LXX-10:!3411.20N/11813.02W_264/002g010t062p001h61T2WX
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_189/010g008t061p001h59T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,K6LOT-10:!3410.50N/11828.90W_127/008g014t070P000h30b10220V127OTW1
K6OUA-11>APOTW1,WA6ZSN,WIDE2,qAR,N6LXX-10:!3417.39N/11849.36W_225/003g005t066V133P000h45b10138OTW1
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_234/005g008t060p001h59T2WX
AD6NH>APJYC1,TCPIP*,qAC,T2CAWEST:=3352.28N/11749.75W_000/000t065h48b10206 /A=259 https://www.ka2ddo.org/ka2ddo/YAAC.html
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_170/004g013t060p001h60T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,N6LXX-10:!3410.50N/11828.90W_120/005g012t069P000h30b10220V127OTW1
K9COE-11>APOTW1,W6SCE-10,qAR,N6LXX-10:!3414.63N/11846.70W_105/007g007t065P035h51b10191OTW1
KM6AHX-12>APOTU0,N6EX-5*,qAR,K6LOT-10:!3411.20N/11813.02W_002/001g013t060p001h60T2WX
KM6AHX-12>APOTU0,N6EX-1*,qAR,VINCNT:!3411.20N/11813.02W_358/003g013t060p001h60T2WX
WA6MHA-11>APOTW1,WIDE1-1,WIDE2-1,qAR,K6LOT-10:!3410.50N/11828.90W_115/004g013t069P000h30b10220V126OTW1

Regex

":[!=][0-9.NS]*/[0-9.EW]*_([0-9]{3})/([0-9]{3})([tphbcsLls#grPV][0-9 .]{2,5})?"
    "([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
    "([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
    "([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
    "([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?"
    "([tphbcsLls#grPV][0-9 .]{2,5})?([tphbcsLls#grPV][0-9 .]{2,5})?.*$"

给定的数据记录不一定包含所有可能的数据类型（[tphbcsLls#grPV]），各字段出现的顺序也没有保证

有更好的方法吗？这似乎有点野蛮

Chuck Bland

您可以将解析分为两个步骤：

验证字符串，并将所有

x123

类型模式分组到单个捕获组中

将所有

x123

类型模式拆分为单独的捕获组

第1步：

":[!=][0-9.NS]*\/[0-9.EW]*_([0-9]{3})\/([0-9]{3})((?:[tphbcsLls#grPV][0-9 .]{2,5})+)"

正则表达式的解释：

```
:[!=][0-9.NS]*\/[0-9.EW]*_
```
-期望模式选择正确的记录类型
```
（[0-9]{3}）
```
-捕获组1
```
\/
```
-斜杠
```
（[0-9]{3}）
```
-捕获组2
```
（
```
-捕获组3开始
- ```
(?:
```
  -非捕获组启动
  - ```
  [tphbcsLls#grPV][0-9 .]{2,5}
```
  -预期模式（重复）
- ```
）+
```
  -非捕获组结束，重复此操作1+次
```
）
```
-捕获组3结束

输入的结果捕获组

KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k

：

```
“217”
```
-捕获组1
```
“010”
```
-捕获组2
```
“g015t047r000p000P025h76b10078”
```
-捕获组3

步骤2:现在获取捕获组3的结果并将其拆分：

"(?=[tphbcsLls#grPV])"

拆分正则表达式的解释：

```
(?=
```
-正向前瞻：
```
[tphbcsLls#grPV]
```
-这些字符之一
```
）
```
-结束正向前瞻

拆分结果：

[“g015”、“t047”、“r000”、“p000”、“P025”、“h76”、“b10078”]

编辑：得知该正则引擎不支持正向前瞻（lookahead）之后：您可以使用带有全局标志的

match

来获取项目数组，而不是

split

：

/[tphbcsLls#grPV][^tphbcsLls#grPV]*/g

match regex解释：

```
[tphbcsLls#grPV]
```
-扫描起始字母
```
[^tphbcsLls#grPV]*
```
-抓取所有不是起始字母的字母
用
```
g
```
（global，全局）标志反复执行上述匹配，直到字符串结束

匹配结果：

[“g015”、“t047”、“r000”、“p000”、“P025”、“h76”、“b10078”]

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Two-step extraction of APRS weather fields.
 *
 * Step 1 validates the record and captures wind direction (group 1),
 * wind speed (group 2) and the whole run of weather fields (group 3).
 * Step 2 walks group 3 and yields one field ("g015", "t047", ...) per find().
 */
public class So66537099 {
    public static void main(String[] args) {
        final String[] lines = (
            "KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k\n" +
            "..."
        ).split("\n");

        // Leading and trailing ".*" are required because Matcher.matches()
        // must consume the entire line, not just the weather portion.
        final Pattern PATTERN1 = Pattern.compile(
            ".*:[!=][0-9.NS]*/[0-9.EW]*_([0-9]{3})/([0-9]{3})((?:[tphbcsLls#grPV][0-9 .]{2,5})*).*");
        // One weather field: an identifier letter followed by 2 to 5 value characters.
        final Pattern PATTERN2 = Pattern.compile("[tphbcsLls#grPV][0-9 .]{2,5}");

        for (final String line : lines) {
            System.out.println("line = " + line);
            final Matcher m1 = PATTERN1.matcher(line);
            if (m1.matches()) {
                System.out.println("matches");
                System.out.println("m1.group(1) = " + m1.group(1));
                System.out.println("m1.group(2) = " + m1.group(2));
                final String data = m1.group(3);
                System.out.println("m1.group(3) = " + data);
                if (!data.isEmpty()) {
                    final Matcher m2 = PATTERN2.matcher(data);
                    while (m2.find()) {
                        System.out.println("... m2.group() = " + m2.group());
                    }
                }
            } else {
                System.out.println("no match");
            }
        }
        System.out.println();
    }
}

line = KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE21,qAO,WEBER:!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k
matches
m1.group(1) = 217
m1.group(2) = 010
m1.group(3) = g015t047r000p000P025h76b10078
... m2.group() = g015
... m2.group() = t047
... m2.group() = r000
... m2.group() = p000
... m2.group() = P025
... m2.group() = h76
... m2.group() = b10078
...

// gcc -Wall -std=c99 -o RME RME.c && ./RME
// Source: https://gist.github.com/ianmackinnon/3294587

#include <stdio.h>
#include <string.h>
#include <regex.h>

// Size of the regmatch_t array passed to regexec(): slot 0 is the whole match,
// followed by one slot per capture group (regexString1 has three groups).
#define    NUMBER_OF_GROUPS     4 //groups in your regex + 1
// Maximum number of individual weather fields kept in WXDataArray.
#define    NUMBER_OF_WX_GROUPS 14

// Scratch buffer holding the weather-data substring (group 3 of regexString1)
// as a NUL-terminated string.
char    WXsource[64];
// Sample APRS record that main() decodes.
char    source[] = "KG7FOQ-13>APTT4,HARIN,WIDE1*,WIDE2-1,qAO,WEBER"
                  ":!4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k";

// Regex 1: matches the position, captures the two 3-digit numbers after '_'
// (groups 1 and 2) and the run of weather fields (group 3), then requires
// exactly four trailing characters before end of line.
// Declared const because the pointers refer to string literals.
const char    *regexString1 = ":[!=][0-9.NS]*.[0-9.EW]*_([0-9]{3})/([0-9]{3})([tphbcsLls#grPV0-9 .]+).{4}$";
// Regex 2: one field identifier followed by everything up to the next identifier.
const char    *regexString2 = "([tphbcsLls#grPV][^tphbcsLls#grPV]*)";
// Output: one NUL-terminated field string per row.
char    WXDataArray[NUMBER_OF_WX_GROUPS][16];
// Number of rows of WXDataArray that have been filled.
int     numberDecodedGroups=0;
// Offset into WXsource at which the next field search starts.
int     sourceOffset=0;
// Start offset and byte count of the most recent match.
int     start;
int     count;

regex_t     regexCompiled1;
regex_t     regexCompiled2;
// Match offsets filled in by regexec(); reused for both regexes.
regmatch_t  groupArray[NUMBER_OF_GROUPS];

// Decodes the weather fields of the record in `source`.
//
// Step 1 runs regexString1 to isolate the weather-data substring (group 3).
// Step 2 runs regexString2 repeatedly over that substring, storing each field
// as a NUL-terminated string in WXDataArray, then prints the fields one per
// line. Returns 0 on success, 1 if a regex fails to compile, the record does
// not match, or the weather data does not fit in WXsource.
int main ()
    {
    int status = 1;

    //the first regex matches the coords, wind data,
    //the entire string of weather data, and weather station ID.
    //It captures the wind data and weather data.
    if (regcomp(&regexCompiled1, regexString1, REG_EXTENDED|REG_NEWLINE))
        {
        printf("Could not compile regular expression 1.\n");
        return(1);
        }

    //The second regex parses the weather data into the individual
    //items. It requires multiple calls to accomplish the process.
    if (regcomp(&regexCompiled2, regexString2, REG_EXTENDED|REG_NEWLINE))
        {
        printf("Could not compile regular expression 2.\n");
        regfree(&regexCompiled1);
        return(1);
        }

    //first extraction. The weather data is in group 3.
    //groupArray is only meaningful when regexec() reports a match.
    if (regexec(&regexCompiled1, source, NUMBER_OF_GROUPS, groupArray, 0)
        || groupArray[3].rm_so < 0)
        {
        printf("Record does not contain weather data.\n");
        goto cleanup;
        }

    start = groupArray[3].rm_so;                //start of weather data
    count = groupArray[3].rm_eo-start;          //bytes of weather data

    //refuse data that would not fit (with its terminator) in WXsource
    if (count < 0 || (size_t)count >= sizeof WXsource)
        {
        printf("Weather data too long.\n");
        goto cleanup;
        }

    //create a null terminated string of the weather data
    memcpy(&WXsource[0], &source[start], count);
    WXsource[count]=0;

    //this loop iterates for each entry in the weather data. With each loop
    //the starting point is advanced past the data just extracted.
    //the regex looks for one of the field identifiers followed
    //by as many characters it can grab that are NOT field identifiers.
    for(int matchIndex=0; matchIndex < NUMBER_OF_WX_GROUPS; matchIndex++)
        {
        //find the next data item, searching from sourceOffset onward
        if (regexec(&regexCompiled2, &WXsource[sourceOffset], NUMBER_OF_GROUPS, groupArray, 0))
            break;

        //start of entry relative to sourceOffset. Non-zero when the data
        //begins with characters that are not a field identifier; those
        //characters are skipped rather than copied into the field.
        start = groupArray[1].rm_so;
        count = groupArray[1].rm_eo - start;

        //truncate an over-long field so it fits the row with its terminator
        if ((size_t)count >= sizeof WXDataArray[matchIndex])
            count = (int)(sizeof WXDataArray[matchIndex] - 1);

        //copy the sub-string to the output array
        memcpy(&WXDataArray[matchIndex][0], &WXsource[sourceOffset + start], count);

        //add the null termination
        WXDataArray[matchIndex][count]=0;

        //advance past the whole match, including any skipped prefix
        sourceOffset += groupArray[1].rm_eo;

        //increment the number of fields extracted
        numberDecodedGroups++;
        }

    for(int Index = 0; Index < numberDecodedGroups; Index++)
        printf("%s\n", &WXDataArray[Index][0]);

    status = 0;

cleanup:
    //release the memory owned by the compiled patterns
    regfree(&regexCompiled1);
    regfree(&regexCompiled2);

    return(status);
    }

//gcc-Wall-std=c99-o RME RME.c&&./RME
//资料来源：https://gist.github.com/ianmackinnon/3294587
#包括
#包括
#包括
#在正则表达式+1中定义\u组的数量\u 4//组
#定义组的数量14
char WXsource[64]；
char source[]=“KG7FOQ-13>APTT4，HARIN，WIDE1*，WIDE2-1，qAO，WEBER”
“：！4227.10N/11422.32W_217/010g015t047r000p000P025h76b10078TU2k”；
char*regexString1=“：[！=][0-9.NS].[0-9.EW].[0-9]{3}/（[0-9]{3}）（[tphbcsLls}grPV0-9.]+）.{4}$；
char*regexString2=“（[tphbcsLls#grPV][^tphbcsLls#grPV]*）”；
char WXDataArray[组的数量][16]；
int numberDecodedGroups=0；
int sourceOffset=0；
int启动；
整数计数；
regex_t regexCompiled1；
regex_t regexCompiled2；
regmatch_t groupArray[组数]；
int main（）
{
//第一个正则表达式匹配坐标、风数据、，
//整个天气数据字符串和气象站ID。
//它捕获风数据和天气数据。
if（regcomp（®exCompiled1、regexString1、REG_EXTENDED、REG_NEWLINE））
{
printf（“无法编译正则表达式1。\n”）；
申报表（1）；
}
//第二个正则表达式将天气数据解析为单个
//需要多次调用才能完成此过程。
if（regcomp（®exCompiled2、regexString2、REG_EXTENDED | REG_NEWLINE））
{
printf（“无法编译正则表达式2。\n”）；
申报表（1）；
}
//第一次提取。天气数据在第3组中。
regexec（®exCompiled1，源，组数，组