C# 在CSharp中查找与正则表达式匹配的文本的更好方法?
我希望显示与文本字符串匹配的正则表达式列表 我以日期为例。空格代表其他文本 FindMatchRegex遍历正则表达式列表。 因为我不知道文本在正则表达式中的匹配位置,所以我匹配正则表达式的每个子字符串。 因此,从整个字符串开始,我通过从前面切掉一个字符来逐渐减少正则表达式 我检查它是否是有效的正则表达式,然后使用PCRE正则表达式检查部分匹配或完全匹配。 如果是部分匹配或完全匹配,请将其添加到可能匹配的正则表达式列表中 在.NETFiddle上,对于200个长度的长字符串和200个正则表达式,这将在大约1秒内执行。 在我的桌面上,16GB,i5-3570K 3.4GHz,大约需要6秒钟 我正在寻找大约0.5秒的响应时间。如何使速度提高10倍或100倍 我缺少什么命令或技巧C# 在CSharp中查找与正则表达式匹配的文本的更好方法?,c#,regex,C#,Regex,我希望显示与文本字符串匹配的正则表达式列表 我以日期为例。空格代表其他文本 FindMatchRegex遍历正则表达式列表。 因为我不知道文本在正则表达式中的匹配位置,所以我匹配正则表达式的每个子字符串。 因此,从整个字符串开始,我通过从前面切掉一个字符来逐渐减少正则表达式 我检查它是否是有效的正则表达式,然后使用PCRE正则表达式检查部分匹配或完全匹配。 如果是部分匹配或完全匹配,请将其添加到可能匹配的正则表达式列表中 在.NETFiddle上,对于200个长度的长字符串和200个正则表达式,
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using PCRE;
using System.Diagnostics;
/// <summary>
/// Timing harness from the question: builds a list of regex patterns, asks which of
/// them could match a short piece of typed text, prints the hits and the elapsed time.
/// </summary>
public class Program
{
    /// <summary>
    /// Builds 204 sample patterns, runs <see cref="FindMatchRegex"/> once against a
    /// sample input and writes the matching patterns plus the elapsed time to the console.
    /// </summary>
    public static void Main()
    {
        List<string> regexList = new();

        // 200 leading spaces stand in for "other text" that precedes the date in a real term.
        string longString = new string(' ', 200);
        regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 1066");
        regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 1999");
        regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 2000");
        regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 2020");
        for (int i = 0; i < 200; i++)
        {
            regexList.Add(longString + @"(\d|\d\d) April (\d\d\d\d)");
        }

        // Exactly one input may be active at a time: declaring checkString twice in the
        // same scope is a compile error (CS0128). Swap the comment markers to try another.
        // string checkString = "1 December 1234 10";
        // string checkString = "1 December 4567 1";
        // string checkString = "1 December 1234 20";
        string checkString = "1 December 1234";

        Stopwatch stopwatch = new();
        stopwatch.Start();
        List<string> result = FindMatchRegex(checkString, regexList);
        stopwatch.Stop();

        foreach (var item in result)
        {
            Console.WriteLine(checkString + " found to match " + item);
        }

        Console.WriteLine("Time elapsed: " + stopwatch.Elapsed);
    }

    /// <summary>
    /// Returns every pattern in <paramref name="regexList"/> for which some suffix of the
    /// pattern, anchored with "^", partially or fully matches <paramref name="filter"/>.
    /// </summary>
    /// <param name="filter">The text typed by the user.</param>
    /// <param name="regexList">The candidate regex patterns.</param>
    /// <returns>The original (unshortened) patterns that produced a hit, in list order.</returns>
    private static List<string> FindMatchRegex(string filter, List<string> regexList)
    {
        List<string> matchingRegexes = new();
        foreach (string currentRegex in regexList)
        {
            // The typed text may correspond to any position inside the pattern, so the
            // pattern is shortened from the front one character at a time.
            for (int start = 0; start < currentRegex.Length; start++)
            {
                string currentRegexSubstring = currentRegex.Substring(start);

                // Cutting at an arbitrary character can leave an unparsable pattern
                // (for example an unbalanced parenthesis); skip those suffixes.
                if (!IsValidRegex(currentRegexSubstring))
                {
                    continue;
                }

                var regex = new PcreRegex("^" + currentRegexSubstring);
                var match = regex.Match(filter, PcreMatchOptions.PartialSoft);
                if (match.IsPartialMatch || match.Success)
                {
                    // One hit is enough for this pattern; stop shortening it.
                    matchingRegexes.Add(currentRegex);
                    break;
                }
            }
        }

        return matchingRegexes;
    }

    /// <summary>
    /// Reports whether <paramref name="pattern"/> is non-blank and can be parsed by
    /// <see cref="Regex"/>.
    /// </summary>
    /// <param name="pattern">The candidate pattern; null, empty or whitespace-only yields false.</param>
    /// <returns>true when the pattern parses; false when it is blank or parsing throws.</returns>
    private static bool IsValidRegex(string pattern)
    {
        if (string.IsNullOrWhiteSpace(pattern))
        {
            return false;
        }

        try
        {
            // The match result is irrelevant; the call is made only to trigger parsing.
            Regex.Match("", pattern);
        }
        catch (ArgumentException)
        {
            return false;
        }

        return true;
    }
}
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using PCRE;
using System.Diagnostics;
public class Program
{
public static void Main()
{
List<string> regexList = new();
string longString = new string (' ', 200);
regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 1066");
regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 1999");
regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 2000");
regexList.Add(longString + @"(\d|\d\d) December (\d\d\d\d) 2020");
for (int i = 0; i < 200; i++)
regexList.Add(longString + @"(\d|\d\d) April (\d\d\d\d)");
string checkString = "1 December 1234 10";
// string checkString = "1 December 4567 1";
// string checkString = "1 December 1234 20";
string checkString = "1 December 1234";
Stopwatch stopwatch = new();
stopwatch.Start();
List<string> result = FindMatchRegex(checkString, regexList);
stopwatch.Stop();
foreach (var item in result)
{
Console.WriteLine(checkString + " found to match " + item);
}
Console.WriteLine("Time elapsed: " + stopwatch.Elapsed);
}
private static List<string> FindMatchRegex(string filter, List<string> regexList)
{
List<string> matchingRegexes = new();
for (int i = 0; i
编辑
节目目的
我正在写一个使用内部翻译的翻译程序。独特的句子匹配正确且容易,但为日期或产品项目描述的微小变化添加新的翻译会让人厌烦。因此,字典包含正则表达式,以匹配要翻译成语言的英语。非常适合在翻译过程中不发生变化的日期和产品项目
当用户想要更新翻译,而不是整个英语的类型时,他们可以只键入英语的一部分来隔离要更新的翻译。因此,我想从字典中筛选术语列表,为用户提供匹配术语的下拉列表
例如,如果我键入“31 December 2020”,我需要一个与 31 December 2020 匹配的所有英语术语的列表,但如果词典使用正则表达式“…(\d|\d\d) December (\d\d\d\d)…”,则按纯文本比较不会匹配。我想扫描字典,使所有包含正则表达式“(\d|\d\d) December (\d\d\d\d)”的英语术语也能匹配
我是不是用错误的方法来解决这个问题
编辑
要翻译的字符串示例
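Part ABC has been replaced by part XYZ on 21 July 2010 because of defect notice section 18c
Part DEF has been replaced by part RST on 15 July 2009 because of defect notice section 17b
Part DEF has been replaced by part RST on 15 July 2008 because of defect notice section 15a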
正则表达式来翻译字符串,我现在有大约200个,预计会增加
Part ([A-Z][A-Z][A-Z]) has been replaced by part ([A-Z][A-Z][A-Z]) on (\d\d) July (\d\d\d\d) because of defect notice section (\d\d[a-z])
翻译
由于缺陷注释第5节,语言部分$1语言已在$3语言7月$4语言被第2部分替换为$3语言
匹配字符串并翻译它们很好。如果在校对过程中,我们收到通知,“由于缺陷通知第18c节,ABC部分已于2010年7月21日被XYZ部分替换”是错误的,则用户可以键入“由于缺陷通知第18c节,ABC部分已被XYZ部分替换”,程序可以显示“Part ([A-Z][A-Z][A-Z]) has been replaced by part ([A-Z][A-Z][A-Z]) on (\d\d) July (\d\d\d\d) because of defect notice section (\d\d[a-z])”作为可能匹配的英语术语,然后用户去编辑翻译
有些时候,我们看到的文本有轻微的错误,或拼写错误或额外的标点符号
/// <summary>
/// Demo driver: builds sample "part replaced" patterns, drops the ones that fail
/// validation, searches them for a short input and prints the hits and the total time.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The timer deliberately covers list construction and cleaning as well as the search.
    Stopwatch timer = Stopwatch.StartNew();

    // Sample patterns; in a real program these would be loaded from a file or a database.
    List<string> patterns = new List<string>
    {
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 1066 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 1999 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 2000 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 2020 because of defect notice section (\d{1,2}[a-z])",
    };

    // Ten identical "April" entries pad the list.
    for (int remaining = 10; remaining > 0; remaining--)
    {
        patterns.Add(@"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) April (\d{4}) 2020 because of defect notice section (\d{1,2}[a-z])");
    }

    // Validate up front unless the stored list is already known to be clean.
    List<string> usablePatterns = CleanRegexList(patterns);

    // Alternative inputs and the number of results the author expects for each:
    //   "1 July 2000 19"  -> 1 result
    //   "1 July 2000 20"  -> 2 results
    //   "1 July 2000 202" -> 1 result
    //   "1 July 2000"     -> 4 results
    string checkString = "1 July 2000 1"; // Expect 2 results

    List<string> hits = FindMatchRegex(checkString, usablePatterns);
    timer.Stop();

    foreach (string hit in hits)
    {
        Console.WriteLine($"{checkString} found to match {hit}");
    }

    Console.WriteLine($"Time elapsed: {timer.Elapsed}");
}
// Calls Process.ExtractTop with a sample sentence, the pattern list and limit: 10.
// NOTE(review): Process.ExtractTop is not defined in this file; it looks like the
// fuzzy-matching API of the FuzzySharp library (closest choices for a query) — confirm.
// NOTE(review): the return value is discarded in this fragment — presumably the caller
// is meant to capture it; verify against the full example.
Process.ExtractTop(
"Part ABC has been replaced by part XYZ on 21 July 2010 because of defect notice section 18c",
regexList,
limit: 10);
(\d|\d\d) December (\d\d\d\d) 2020
(\d|\d\d) December (\d\d\d\d) 1066
(\d{1,2}) December (\d{4}) 1066
1 December 2020 found to match (\d{1,2}) December (\d{4})
Time elapsed: 00:00:00.0083255
/// <summary>
/// Demo driver: one "December" pattern plus 200 identical "April" patterns are validated
/// and searched for a sample date; matching patterns and the total time are printed.
/// </summary>
public static void Main()
{
    // The timer deliberately covers list construction and cleaning as well as the search.
    Stopwatch timer = Stopwatch.StartNew();

    // Sample patterns; in a real program these would be loaded from a file or a database.
    List<string> patterns = new List<string> { @"(\d{1,2}) December (\d{4})" };
    for (int remaining = 200; remaining > 0; remaining--)
    {
        patterns.Add(@"(\d{1,2}) April (\d{4})");
    }

    // Validate up front unless the stored list is already known to be clean.
    List<string> usablePatterns = CleanRegexList(patterns);

    // Alternative input: "15 April 2020"
    string checkString = "1 December 2020";

    List<string> hits = FindMatchRegex(checkString, usablePatterns);
    timer.Stop();

    foreach (string hit in hits)
    {
        Console.WriteLine($"{checkString} found to match {hit}");
    }

    Console.WriteLine($"Time elapsed: {timer.Elapsed}");
}
/// <summary>
/// Selects the patterns from <paramref name="regexList"/> that match anywhere in
/// <paramref name="checkString"/>.
/// </summary>
/// <param name="checkString">The text to test each pattern against.</param>
/// <param name="regexList">Regex pattern strings; each one is compiled as it is tested.</param>
/// <returns>A new list holding the matching pattern strings in their original order.</returns>
private static List<string> FindMatchRegex(string checkString, List<string> regexList)
{
    // FindAll walks the list in order and collects the entries the predicate accepts.
    return regexList.FindAll(pattern => new Regex(pattern).IsMatch(checkString));
}
/// <summary>
/// Filters <paramref name="regexList"/> down to the entries that IsValidRegex accepts.
/// </summary>
/// <param name="regexList">Candidate regex pattern strings.</param>
/// <returns>A new list with the accepted patterns in their original order.</returns>
private static List<string> CleanRegexList(List<string> regexList)
{
    // The validity check is used directly as the filter predicate.
    return regexList.FindAll(IsValidRegex);
}
/// <summary>
/// Demo driver for the precomputed variant: the setup phase (building the patterns and
/// the dictionary returned by FindAllRegexStrings) and the search phase are timed and
/// reported separately, then the matching patterns are printed.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Stopwatch timer = Stopwatch.StartNew();

    // Sample patterns; in a real program these would be loaded from a file or a database.
    List<string> patterns = new List<string>
    {
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 1066 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 1999 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 2000 because of defect notice section (\d{1,2}[a-z])",
        @"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) July (\d{4}) 2020 because of defect notice section (\d{1,2}[a-z])",
    };

    for (int n = 0; n < 10; n++)
    {
        // The counter is embedded so every entry is distinct: the patterns become dictionary keys.
        patterns.Add(@"Part ([A-Z]{3}) has been replaced by part ([A-Z]{3}) on (\d{1,2}) April (\d{4}) " + n + @" because of defect notice section (\d{1,2}[a-z])");
    }

    // Built once up front; the author intends to store this somewhere for reuse.
    Dictionary<string, List<string>> suffixesByPattern = FindAllRegexStrings(patterns);

    // Alternative inputs and the number of results the author expects for each:
    //   "1 July 2000 19"  -> 1 result
    //   "1 July 2000 20"  -> 2 results
    //   "1 July 2000 202" -> 1 result
    //   "1 July 2000"     -> 4 results
    string checkString = "1 July 2000 1"; // Expect 2 results

    timer.Stop();
    Console.WriteLine($"Setting up time elapsed: {timer.Elapsed}");

    // Second measurement: the lookup only.
    timer.Restart();
    List<string> hits = FindMatchRegex(checkString, suffixesByPattern);
    timer.Stop();
    Console.WriteLine($"Finding regex time elapsed: {timer.Elapsed}");

    foreach (string hit in hits)
    {
        Console.WriteLine($"{checkString} found to match {hit}");
    }
}
/// <summary>
/// Returns the keys of <paramref name="regexDictionary"/> for which at least one of the
/// entry's stored patterns, anchored with "^", partially or fully matches
/// <paramref name="checkString"/>.
/// </summary>
/// <param name="checkString">The text typed by the user.</param>
/// <param name="regexDictionary">Maps each key pattern to the list of patterns tried on its behalf.</param>
/// <returns>The keys that produced a hit, in the dictionary's enumeration order.</returns>
private static List<string> FindMatchRegex(string checkString, Dictionary<string, List<string>> regexDictionary)
{
    List<string> hits = new();
    foreach (KeyValuePair<string, List<string>> entry in regexDictionary)
    {
        foreach (string candidate in entry.Value)
        {
            var anchored = new PcreRegex("^" + candidate);
            var outcome = anchored.Match(checkString, PcreMatchOptions.PartialSoft);
            if (outcome.IsPartialMatch || outcome.Success)
            {
                // First hit settles this entry; the remaining candidates are not tried.
                hits.Add(entry.Key);
                break;
            }
        }
    }

    return hits;
}
/// <summary>
/// Builds a lookup from each pattern in <paramref name="regexList"/> to the list that
/// ValidRegexes returns for it.
/// </summary>
/// <param name="regexList">Regex pattern strings; each one becomes a dictionary key.</param>
/// <returns>A new dictionary keyed by pattern.</returns>
private static Dictionary<string, List<string>> FindAllRegexStrings(List<string> regexList)
{
    var suffixesByPattern = new Dictionary<string, List<string>>();
    foreach (string pattern in regexList)
    {
        // Add (rather than the indexer) means a repeated pattern raises ArgumentException.
        suffixesByPattern.Add(pattern, ValidRegexes(pattern));
    }

    return suffixesByPattern;
}
/// <summary>
/// Collects every suffix of <paramref name="regexString"/> (the full string first, then
/// with one more leading character removed each time) that IsValidRegex accepts.
/// </summary>
/// <param name="regexString">The regex pattern to take suffixes of.</param>
/// <returns>The accepted suffixes, longest first.</returns>
private static List<string> ValidRegexes(string regexString)
{
    var accepted = new List<string>();
    int start = 0;
    while (start < regexString.Length)
    {
        string suffix = regexString.Substring(start);
        start++;

        if (!IsValidRegex(suffix))
        {
            continue;
        }

        accepted.Add(suffix);
    }

    return accepted;
}
/// <summary>
/// Reports whether <paramref name="pattern"/> is non-blank and can be parsed by
/// <see cref="Regex"/>.
/// </summary>
/// <param name="pattern">The candidate pattern; null, empty or whitespace-only yields false.</param>
/// <returns>true when the pattern parses; false when it is blank or parsing throws.</returns>
private static bool IsValidRegex(string pattern)
{
    if (string.IsNullOrWhiteSpace(pattern))
    {
        return false;
    }

    try
    {
        // The match result is irrelevant; the call is made only to trigger parsing.
        Regex.Match(string.Empty, pattern);
        return true;
    }
    catch (ArgumentException)
    {
        return false;
    }
}