C# 在 C# 中使用正则表达式解析文本文件的多个部分

C# 在C中使用正则表达式解析文本文件的多个部分#,c#,regex,C#,Regex,我想解析一个包含如下内容的文本文件: START-OF-DATA #100846105 START SECURITY|US912810DZ85|CBBT| ## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)] ## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)] 04/30|15:00:00|B|118.640625||| |A|118.70

我想解析一个包含如下内容的文本文件:

START-OF-DATA
#100846105
START SECURITY|US912810DZ85|CBBT|
## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
04/30|15:00:00|B|118.640625||| |A|118.703125||| ||
04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||
04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||
END SECURITY|US912810DZ85|0|
#100846111
START SECURITY|US912810EA26|CBBT|
## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]
04/30|15:00:00|B|124.75||| |A|124.828125||| ||
04/30|14:59:55|B|124.75||| |A|124.8203125||| ||
04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||
04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||
04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||
04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||
END SECURITY|US912810EA26|0|
END-OF-DATA
使用下面的代码

// Pattern with a named group "InstrumentsSection" that sits between the
// START-OF-DATA line and the END-OF-DATA marker.
// NOTE(review): `regex`, `filePath` and `instruments` are not declared in this
// snippet; presumably `regex` is built from `pattern` elsewhere - confirm.
string pattern = @"^(START-OF-DATA\r\n)(?<InstrumentsSection>[^\\]*?)(?:(^END-OF-DATA))";
var fileText = File.ReadAllText(filePath);
var lineBreak = new[] { Environment.NewLine };
foreach (Match sectionMatch in regex.Matches(fileText))
{
    // Split the captured section into its non-empty rows; each match
    // replaces whatever `instruments` held before.
    var sectionText = sectionMatch.Groups["InstrumentsSection"].Value;
    instruments = sectionText
        .Split(lineBreak, StringSplitOptions.RemoveEmptyEntries)
        .ToList();
}
string pattern=@“^(数据开始\r\n)(?[^\\]*?)(?:(^END-OF-DATA))”;
var expressionMatchColl=regex.Matches(File.ReadAllText(filePath));
foreach(在表达式MatchColl中匹配)
{
string[]instrumentRows=match.Groups[“InstrumentsSection”].Value.Split(新字符串[]{Environment.NewLine},StringSplitOptions.RemoveEmptyEntries);
instruments=instrumentRows.ToList();
}
我能够检索到 START-OF-DATA 和 END-OF-DATA 之间的每一行。但是，我想忽略以 START SECURITY、## 和 END SECURITY 开头的行。此外，我还希望将 tick（报价）值与对应的标识符（例如 100846105、100846111）分组。


有人能给些建议吗？

你可以逐行读取文件，过滤掉不需要的行。此外，可以把 tick 值和 ID 收集到字符串列表中。

示例代码:

// Reads the file line by line and, for every START-OF-DATA ... END-OF-DATA
// block, collects:
//   res - every line of the block except those starting with "START SECURITY",
//         "##" or "END SECURITY" (the START-OF-DATA marker and the "#id" lines
//         are kept), followed by the END-OF-DATA line itself;
//   ids - the lines starting with a single '#' (the identifiers).
var res = string.Empty;
var ids = new List<string>();

// Accumulate in a builder: repeated string concatenation in a loop is
// quadratic in the size of the file.
var collected = new System.Text.StringBuilder();

using (var sr = new StreamReader(filepath, true))
{
    string s;
    while ((s = sr.ReadLine()) != null)
    {
        if (!s.StartsWith("START-OF-DATA", System.StringComparison.Ordinal))
        {
            continue;
        }

        // ReadLine returns null at end of stream, so the null check keeps a
        // file that lacks its END-OF-DATA line from throwing a
        // NullReferenceException.
        while (s != null && !s.StartsWith("END-OF-DATA", System.StringComparison.Ordinal))
        {
            bool isComment = s.StartsWith("##", System.StringComparison.Ordinal);

            if (!isComment &&
                !s.StartsWith("START SECURITY", System.StringComparison.Ordinal) &&
                !s.StartsWith("END SECURITY", System.StringComparison.Ordinal))
            {
                collected.Append(s).Append(System.Environment.NewLine);
            }

            if (!isComment && s.StartsWith("#", System.StringComparison.Ordinal))
            {
                ids.Add(s);
            }

            s = sr.ReadLine();
        }

        if (s == null)
        {
            // The stream ended inside a block; nothing more to read.
            break;
        }

        // Keep the END-OF-DATA line (without a trailing newline).
        collected.Append(s);
    }
}

res = collected.ToString();

然后，如果要读取多个块，只需创建一个字符串列表来存储各个块的 `res`，并在每个块结束处（即追加 END-OF-DATA 行之后）把它添加到该列表中。

这里是一个简单的解析器

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;


namespace ConsoleApplication1
{
    /// <summary>
    /// Simple line-oriented parser that groups the data rows of the sample
    /// text by the "#id" line that precedes them.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Parses the embedded sample text into a list of sections.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string input =
               "START-OF-DATA\n" +
               "#100846105\n" +
               "START SECURITY|US912810DZ85|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|118.640625||| |A|118.703125||| ||\n" +
               "04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||\n" +
               "END SECURITY|US912810DZ85|0|\n" +
               "#100846111\n" +
               "START SECURITY|US912810EA26|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|124.75||| |A|124.828125||| ||\n" +
               "04/30|14:59:55|B|124.75||| |A|124.8203125||| ||\n" +
               "04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||\n" +
               "04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||\n" +
               "END SECURITY|US912810EA26|0|\n" +
               "END-OF-DATA\n";

            List<Section> sections;
            using (StringReader reader = new StringReader(input))
            {
                sections = ParseSections(reader);
            }
        }

        /// <summary>
        /// Reads <paramref name="reader"/> line by line and groups the data
        /// rows under the "#id" line that precedes them.
        /// </summary>
        /// <param name="reader">Source of the text to parse.</param>
        /// <returns>One <see cref="Section"/> per "#id" line, in input order.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="reader"/> is null.
        /// </exception>
        /// <exception cref="FormatException">
        /// A data row appears before the first "#id" line.
        /// </exception>
        /// <remarks>
        /// Blank lines, "##" comment lines and lines starting with "START" or
        /// "END" (which covers the START-OF-DATA / END-OF-DATA markers as well
        /// as START SECURITY / END SECURITY) are skipped. Internal rather than
        /// private so the parsing can be exercised without going through Main.
        /// </remarks>
        internal static List<Section> ParseSections(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            List<Section> sections = new List<Section>();
            Section currentSection = null;
            string inputLine;

            while ((inputLine = reader.ReadLine()) != null)
            {
                inputLine = inputLine.Trim();

                // Blank lines carry no data; StartsWith (unlike Substring)
                // is also safe on lines shorter than the prefix.
                if (inputLine.Length == 0)
                {
                    continue;
                }

                // "##" lines are comments such as "## in:" / "## out:".
                if (inputLine.StartsWith("##", StringComparison.Ordinal))
                {
                    continue;
                }

                // A single '#' introduces a new section; the rest is its id.
                if (inputLine.StartsWith("#", StringComparison.Ordinal))
                {
                    currentSection = new Section();
                    currentSection.iD = inputLine.Substring(1);
                    currentSection.data = new List<string>();
                    sections.Add(currentSection);
                    continue;
                }

                if (inputLine.StartsWith("END", StringComparison.Ordinal) ||
                    inputLine.StartsWith("START", StringComparison.Ordinal))
                {
                    continue;
                }

                if (currentSection == null)
                {
                    // Fail with a descriptive error instead of a
                    // NullReferenceException on malformed input.
                    throw new FormatException(
                        "Data row found before the first '#id' line: " + inputLine);
                }

                currentSection.data.Add(inputLine);
            }

            return sections;
        }

        /// <summary>
        /// The data rows that belong to one "#id" line.
        /// </summary>
        public class Section
        {
            /// <summary>Text of the "#id" line without the leading '#'.</summary>
            public string iD { get; set; }

            /// <summary>Trimmed data rows that follow the id, in input order.</summary>
            public List<string> data { get; set; }
        }
    }

}
使用系统;
使用System.Collections.Generic;
使用System.Linq;
使用系统文本;
使用System.IO;
命名空间控制台应用程序1
{
班级计划
{
静态void Main(字符串[]参数)
{
列表部分=新列表();
字符串输入=
“起始数据\n”+
“#100846105\n”+
“启动安全性| US912810DZ85 | CBBT | \n”+
“##in:20150430_14:59:00至20150430_15:00:00[13(纽约夏令时)]\n”+
“##out:20150430_14:59:00至20150430_15:00:00[13(纽约夏令时)]\n”+
“04/30 | 15:00:00 | B | 118.640625 | | | | | | | | A | 118.703125 | | | | \n”+
“04/30 | 14:59:54 | B | 118.6328125 | | | | | | A | 118.6953125 | | | | | \n”+
“04/30 | 14:59:52 | B | 118.6328125 | | | | | | | A | 118.6953125 | | | | | \n”+
“04/30 | 14:59:23 | B | 118.6328125 | | | | | | A | 118.6953125 | | | | | \n”+
“04/30 | 14:59:20 | B | 118.6328125 | | | | | | A | 118.6953125 | | | | | \n”+
“终端安全性| US912810DZ85 | 0 | \n”+
“#100846111\n”+
“启动安全性| US912810EA26 | CBBT | \n”+
“##in:20150430_14:59:00至20150430_15:00:00[13(纽约夏令时)]\n”+
“##out:20150430_14:59:00至20150430_15:00:00[13(纽约夏令时)]\n”+
“04/30 | 15:00:00 | B | 124.75 | | | | | A | 124.828125 | | | | | \n”+
“04/30 | 14:59:55 | B | 124.75 | | | | | | | | A | 124.8203125 | | | | \n”+
“04/30 | 14:59:53 | B | 124.7421875 | | | | | | A | 124.8203125 | | | | | \n”+
“04/30 | 14:59:45 | B | 124.7421875 | | | | | A | 124.8125 | | | | \n”+
“04/30 | 14:59:43 | B | 124.7421875 | | | | | | | A | 124.828125 | | | | | \n”+
“04/30 | 14:59:27 | B | 124.7421875 | | | | | A | 124.8125 | | | | \n”+
“04/30 14:59:24 | B | 124.7421875 | | | | | | | | A | 124.828125 | | | | | \n”+
“04/30 | 14:59:22 | B | 124.7421875 | | | | | A | 124.8125 | | | | \n”+
“04/30 14:59:20 | B | 124.7421875 | | | | | | | | A | 124.828125 | | | | | \n”+
“04/30 | 14:59:13 | B | 124.7421875 | | | | | A | 124.8125 | | | | \n”+
“终端安全性| US912810EA26 | 0 | \n”+
“数据结束\n”;
StringReader=新的StringReader(输入);
字符串inputLine=“”;
节newSection=null;
而((inputLine=reader.ReadLine())!=null)
{
inputLine=inputLine.Trim();
if(inputLine.StartsWith(“#”)
{
如果(inputLine.Contains(“in:”)继续;
如果(inputLine.Contains(“out:”)继续;
newSection=新节();
节。添加(新闻节);
newSection.iD=inputLine.Substring(1);
newSection.data=新列表();
}
其他的
{
如果(inputLine.Substring(0,3)=“END”)继续;
如果(inputLine.Substring(0,5)=“START”)继续;
newSection.data.Add(inputLine);
}
}
}
公共课组
{
公共字符串iD{get;set;}
公共列表数据{get;set;}
}
}
}

正则表达式的方案效率不高。你是否考虑过逐行读取？即使能够写出一个解析整个文件的正则表达式，它也会非常复杂，因为每个部分都需要各自的解析规则。你可以自己编写解析器，逐行解析。不过，更好的方案是使用 ANTLR 这样的解析器生成器。例如，可以参考用于解析带标题和字段的文件的语法（syntax）示例。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;


namespace ConsoleApplication1
{
    /// <summary>
    /// Simple line-oriented parser that groups the data rows of the sample
    /// text by the "#id" line that precedes them.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Parses the embedded sample text into a list of sections.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string input =
               "START-OF-DATA\n" +
               "#100846105\n" +
               "START SECURITY|US912810DZ85|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|118.640625||| |A|118.703125||| ||\n" +
               "04/30|14:59:54|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:52|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:23|B|118.6328125||| |A|118.6953125||| ||\n" +
               "04/30|14:59:20|B|118.6328125||| |A|118.6953125||| ||\n" +
               "END SECURITY|US912810DZ85|0|\n" +
               "#100846111\n" +
               "START SECURITY|US912810EA26|CBBT|\n" +
               "## in: 20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "## out:20150430_14:59:00 to 20150430_15:00:00 [13 (New York-DST)]\n" +
               "04/30|15:00:00|B|124.75||| |A|124.828125||| ||\n" +
               "04/30|14:59:55|B|124.75||| |A|124.8203125||| ||\n" +
               "04/30|14:59:53|B|124.7421875||| |A|124.8203125||| ||\n" +
               "04/30|14:59:45|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:43|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:27|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:24|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:22|B|124.7421875||| |A|124.8125||| ||\n" +
               "04/30|14:59:20|B|124.7421875||| |A|124.828125||| ||\n" +
               "04/30|14:59:13|B|124.7421875||| |A|124.8125||| ||\n" +
               "END SECURITY|US912810EA26|0|\n" +
               "END-OF-DATA\n";

            List<Section> sections;
            using (StringReader reader = new StringReader(input))
            {
                sections = ParseSections(reader);
            }
        }

        /// <summary>
        /// Reads <paramref name="reader"/> line by line and groups the data
        /// rows under the "#id" line that precedes them.
        /// </summary>
        /// <param name="reader">Source of the text to parse.</param>
        /// <returns>One <see cref="Section"/> per "#id" line, in input order.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="reader"/> is null.
        /// </exception>
        /// <exception cref="FormatException">
        /// A data row appears before the first "#id" line.
        /// </exception>
        /// <remarks>
        /// Blank lines, "##" comment lines and lines starting with "START" or
        /// "END" (which covers the START-OF-DATA / END-OF-DATA markers as well
        /// as START SECURITY / END SECURITY) are skipped. Internal rather than
        /// private so the parsing can be exercised without going through Main.
        /// </remarks>
        internal static List<Section> ParseSections(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            List<Section> sections = new List<Section>();
            Section currentSection = null;
            string inputLine;

            while ((inputLine = reader.ReadLine()) != null)
            {
                inputLine = inputLine.Trim();

                // Blank lines carry no data; StartsWith (unlike Substring)
                // is also safe on lines shorter than the prefix.
                if (inputLine.Length == 0)
                {
                    continue;
                }

                // "##" lines are comments such as "## in:" / "## out:".
                if (inputLine.StartsWith("##", StringComparison.Ordinal))
                {
                    continue;
                }

                // A single '#' introduces a new section; the rest is its id.
                if (inputLine.StartsWith("#", StringComparison.Ordinal))
                {
                    currentSection = new Section();
                    currentSection.iD = inputLine.Substring(1);
                    currentSection.data = new List<string>();
                    sections.Add(currentSection);
                    continue;
                }

                if (inputLine.StartsWith("END", StringComparison.Ordinal) ||
                    inputLine.StartsWith("START", StringComparison.Ordinal))
                {
                    continue;
                }

                if (currentSection == null)
                {
                    // Fail with a descriptive error instead of a
                    // NullReferenceException on malformed input.
                    throw new FormatException(
                        "Data row found before the first '#id' line: " + inputLine);
                }

                currentSection.data.Add(inputLine);
            }

            return sections;
        }

        /// <summary>
        /// The data rows that belong to one "#id" line.
        /// </summary>
        public class Section
        {
            /// <summary>Text of the "#id" line without the leading '#'.</summary>
            public string iD { get; set; }

            /// <summary>Trimmed data rows that follow the id, in input order.</summary>
            public List<string> data { get; set; }
        }
    }

}