C# 在多个文件中搜索文本的最快方法？_C#_Regex_String_Richtextbox

C# 在多个文件中搜索文本的最快方法？

c# regex string

C# 在多个文件中搜索文本的最快方法？,c#,regex,string,richtextbox,C#,Regex,String,Richtextbox,我需要在大约120个文本文件中找到一些文本，我想知道哪一个是搜索文本的最好和最快的方法。我应该读取RichTextBox中的每个文件，然后使用其方法搜索文本，还是应该将这些文件读入字符串变量，然后使用正则表达式搜索我认为表现背后的主要因素是找到一种方法，这样就没有必要在已经测试过的赛道上循环。有没有办法一次找到文件中的所有匹配项？有没有人像VisualStudio那样知道在文本文件中查找匹配项的方法？它在大约800-1000毫秒内搜索了200个文本文件以寻找匹配。我认为它使用多个线程来完成这一

我需要在大约120个文本文件中找到一些文本，我想知道哪一个是搜索文本的最好和最快的方法。我应该读取RichTextBox中的每个文件，然后使用其方法搜索文本，还是应该将这些文件读入字符串变量，然后使用正则表达式搜索

我认为影响性能的关键在于找到一种方法，避免对已经检查过的内容重复遍历。有没有办法一次找到文件中的所有匹配项？有没有人知道像 Visual Studio 那样在文本文件中查找匹配项的方法？它在大约800-1000毫秒内就搜索完了200个文本文件。我认为它使用了多个线程来完成这一任务。

如果我告诉您：

1-我将加载所有文件路径到字符串列表

2-我将创建一个新列表来存储与我的搜索词匹配的文件路径

3-我将在文件列表中循环foreach并搜索我的术语，然后将匹配的文件添加到新列表中

// Term to look for; it is matched as a plain substring of each file's text.
string searchTerm = "Some terms";
// Candidate files: the *.txt files returned for the given directory.
string[] MyFilesList = Directory.GetFiles(@"c:\txtDirPath\", "*.txt");
// Paths of the files whose contents contain the search term.
List<string> FoundedSearch = new List<string>();
for (int i = 0; i < MyFilesList.Length; i++)
{
    // The whole file is loaded into memory before it is searched.
    string contents = File.ReadAllText(MyFilesList[i]);
    if (!contents.Contains(searchTerm))
    {
        continue;
    }

    FoundedSearch.Add(MyFilesList[i]);
}

string searchTerm=“一些术语”；
字符串[]MyFilesList=Directory.GetFiles（@“c:\txtDirPath\”，“*.txt”）；
List FoundedSearch=新建列表（）；
foreach（MyFilesList中的字符串文件名）
{
string textFile=File.ReadAllText（文件名）；
if（textFile.Contains（searchTerm））
{
FoundedSearch.Add（文件名）；
}
}

然后，您可以按需要处理 FoundedSearch 列表。

顺便说一下：

我不知道这是不是最佳答案，但在大约800个文本文件、每个文件约1000个单词的规模以内，性能会非常好。

您可以使用已编译的正则表达式来获得非常好的性能。我假设您需要在每个文件中搜索相同的字符串，因此每次搜索都可以复用同一个已编译的正则表达式：
string searchTerm = "searchWord";
// The pattern must be a verbatim string: in a regular C# string literal "\b" is the
// backspace character (U+0008), so the regex would never match ordinary text.
// Regex.Escape keeps metacharacters in the search term from being read as pattern syntax.
Regex rx = new Regex(String.Format(@"\b{0}\b", Regex.Escape(searchTerm)), RegexOptions.Compiled);
// Placeholder: fill this list with the paths of the files to search.
List<string> filePaths = new List<string>();

foreach (string filePath in filePaths)
{
   // The same compiled Regex instance is reused for every file.
   string allText = File.ReadAllText(filePath);
   var matches = rx.Matches(allText);
   //rest of code
}

string searchTerm=“searchWord”；
Regex rx=newregex（String.Format（“\b{0}\b”，searchTerm），RegexOptions.Compiled）；
列表文件路径=新列表（）；
foreach（文件路径中的字符串文件路径）
{
string allText=File.ReadAllText（文件路径）；
var matches=rx.matches（所有文本）；
//代码的其余部分
}

您需要自行做性能基准测试，但我认为主要瓶颈在于从磁盘打开和读取文件，您可以验证一下是否如此。另外，视您的最终目的而定，像 Lucene.Net 这样的专用文本搜索库（正如 I4V 在评论中提到的）可能更合适。
根据您的描述（120个文件，70K-80K个单词，每个文件1-2MB），似乎最好的方法是读取一次文件并建立一个可以搜索的索引。我在下面举了一个例子来说明如何实现这一点，但是如果您需要更复杂的搜索词匹配，而不是查找精确的词或前缀词，那么这可能对您的用处有限
如果您需要更复杂的文本搜索匹配（同时获得良好的性能），我建议您查看专门为此目的构建的优秀Lucene库
/// <summary>
/// Immutable value identifying where a single word was found: which file,
/// which line of that file, and which position within that line.
/// </summary>
public struct WordLocation
{
    /// <summary>File containing the word.</summary>
    public readonly string FileName;

    /// <summary>Line within the file.</summary>
    public readonly int LineNumber;

    /// <summary>Index of the word within the line.</summary>
    public readonly int WordIndex;

    /// <summary>Creates a location from the values supplied by the caller.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    public WordLocation(string fileName, int lineNumber, int wordIndex)
    {
        this.WordIndex = wordIndex;
        this.LineNumber = lineNumber;
        this.FileName = fileName;
    }
}

/// <summary>
/// Immutable collection of every location at which one word was found.
/// Instances are obtained from <see cref="None"/>, <see cref="FirstOccurrence"/>
/// or <see cref="AddOccurrence"/>; the constructor is private.
/// </summary>
public struct WordOccurrences
{
    private WordOccurrences(int nOccurrences, WordLocation[] locations)
    {
        NumberOfOccurrences = nOccurrences;
        Locations = locations;
    }

    /// <summary>Value representing a word with no occurrences (empty location array).</summary>
    public static readonly WordOccurrences None = new WordOccurrences(0, new WordLocation[0]);

    /// <summary>Creates the value for a word that has been seen exactly once.</summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A value holding one location.</returns>
    public static WordOccurrences FirstOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        return new WordOccurrences(1, new [] { new WordLocation(fileName, lineNumber, wordIndex) });
    }

    /// <summary>
    /// Returns a new value containing all existing locations followed by the given one.
    /// This instance is not modified.
    /// </summary>
    /// <param name="fileName">File containing the word.</param>
    /// <param name="lineNumber">Line within the file.</param>
    /// <param name="wordIndex">Index of the word within the line.</param>
    /// <returns>A value with one more occurrence than this one.</returns>
    public WordOccurrences AddOccurrence(string fileName, int lineNumber, int wordIndex)
    {
        // default(WordOccurrences) bypasses the constructor and leaves Locations null;
        // treat that the same as an empty set instead of throwing.
        var existing = Locations ?? new WordLocation[0];

        // The array is copied on every call, so each add costs time proportional
        // to the number of locations already stored.
        var combined = new WordLocation[existing.Length + 1];
        existing.CopyTo(combined, 0);
        combined[existing.Length] = new WordLocation(fileName, lineNumber, wordIndex);

        return new WordOccurrences(NumberOfOccurrences + 1, combined);
    }

    /// <summary>Number of recorded occurrences.</summary>
    public readonly int NumberOfOccurrences;

    /// <summary>Recorded locations, in the order they were added.</summary>
    public readonly WordLocation[] Locations;
}

/// <summary>
/// Collects word occurrences and then produces an <see cref="IWordIndex"/> from them.
/// </summary>
public interface IWordIndexBuilder
{
    /// <summary>Records one occurrence of a word.</summary>
    /// <param name="word">The word that was encountered.</param>
    /// <param name="fileName">File in which the word was encountered.</param>
    /// <param name="lineNumber">Line of the file on which the word was encountered.</param>
    /// <param name="wordIndex">Position of the word within that line.</param>
    void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex);

    /// <summary>Produces the index from the occurrences recorded so far.</summary>
    /// <returns>The resulting index.</returns>
    IWordIndex Build();
}

/// <summary>
/// A word index that can be queried for the occurrences of a word.
/// </summary>
public interface IWordIndex
{
    /// <summary>Looks up the occurrences recorded for a word.</summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>The occurrences of the word; how a miss is reported is up to the implementation.</returns>
    WordOccurrences Find(string word);
}

/// <summary>
/// Extension methods for feeding text files into an <see cref="IWordIndexBuilder"/>.
/// </summary>
public static class BuilderExtensions
{
    /// <summary>
    /// Reads every file line by line, splits each line into words, records each word
    /// with the builder, and finally returns the built index.
    /// </summary>
    /// <param name="builder">Builder that receives one call per word found.</param>
    /// <param name="wordFiles">Files to read.</param>
    /// <returns>The index produced by <c>builder.Build()</c>.</returns>
    public static IWordIndex BuildIndexFromFiles(this IWordIndexBuilder builder, IEnumerable<FileInfo> wordFiles)
    {
        char[] separators = { ',', ' ', '\t', ';' /* etc */ };

        foreach (FileInfo source in wordFiles)
        {
            using (StreamReader input = source.OpenText())
            {
                // Line numbers reported to the builder start at 1.
                for (int currentLine = 1; !input.EndOfStream; currentLine++)
                {
                    string[] tokens = input.ReadLine()
                        .Split(separators, StringSplitOptions.RemoveEmptyEntries);

                    // Word positions reported to the builder also start at 1.
                    for (int position = 0; position < tokens.Length; position++)
                    {
                        builder.AddWordOccurrence(
                            tokens[position].Trim(),
                            source.FullName,
                            currentLine,
                            position + 1);
                    }
                }
            }
        }

        return builder.Build();
    }
}

public struct WordLocation
{
公共字位置（字符串文件名、整数行号、整数字索引）
{
FileName=文件名；
LineNumber=行号；
WordIndex=WordIndex；
}
public readonly string FileName；//包含单词的文件。
public readonly int LineNumber；//文件中的行。
public readonly int-WordIndex；//行内的索引。
}
公共结构
{
私有字引用（int-nOccurrences，WordLocation[]位置）
{
numberofoccurrencess=nocurrences；
地点=地点；
}
public static readonly wordoccurrencess None=新的wordoccurrencess（0，新的WordLocation[0]）；
公共静态字引用首次出现（字符串文件名、整数行号、整数字索引）
{
返回新的wordoccurrencess（1，new[]{newwordlocation（fileName，lineNumber，wordIndex）}）；
}
public wordoccurrences AddOccurrence（字符串文件名、整数行号、整数字索引）
{
返回新单词(
发生次数+1，
位置
康卡特先生(
new[]{new WordLocation（文件名、行号、wordIndex）}）
.ToArray（））；
}
公共只读整数；
公共只读WordLocation[]位置；
}
公共接口IWordIndexBuilder
{
void AddWordOccurrence（字符串字、字符串文件名、整数行号、整数字索引）；
IWordIndex Build（）；
}
公共接口IWordIndex
{
单词查找（字符串单词）；
}
公共静态类BuilderExtensions
{
公共静态IWordIndex BuildIndexFromFiles（此IWordIndexBuilder生成器，IEnumerable wordFiles）
{
var wordSeparators=new char[]{'，'，''''.''.\t'，'；'/*etc*/}；
foreach（wordFiles中的var文件）
{
变量lineNumber=1；
使用（var reader=file.OpenText（））
{
而（！reader.EndOfStream）
{
var字=读卡器
.ReadLine（）
.Split（字分隔符、StringSplitOptions.RemoveEmptyEntries）
.选择（f=>f.Trim（））；
var-wordIndex=1；
foreach（单词中的var单词）
builder.AddWordOccurrence（word，file.FullName，lineNumber，wordIndex++）；
lineNumber++；
}
}
}
返回builder.Build（）；
}
}

然后，最简单的索引实现（只能进行精确匹配查找）在内部使用字典：
/// <summary>
/// <see cref="IWordIndexBuilder"/> backed by a dictionary keyed on the word.
/// The resulting index supports exact-match lookups only, using the equality
/// comparer passed to the constructor.
/// </summary>
public class DictionaryIndexBuilder : IWordIndexBuilder
{
    // Set to null by Build(), which hands the dictionary over to the index.
    private Dictionary<string, WordOccurrences> _dict;

    /// <summary>Read-only index over the dictionary handed over by the builder.</summary>
    private class DictionaryIndex : IWordIndex
    {
        private readonly Dictionary<string, WordOccurrences> _dict;

        public DictionaryIndex(Dictionary<string, WordOccurrences> dict)
        {
            _dict = dict;
        }

        /// <summary>Looks up a word.</summary>
        /// <param name="word">The word to look up.</param>
        /// <returns>The recorded occurrences, or <see cref="WordOccurrences.None"/> when the word is not in the index.</returns>
        public WordOccurrences Find(string word)
        {
            WordOccurrences found;
            if (_dict.TryGetValue(word, out found))
            {
                return found;
            }

            return WordOccurrences.None;
        }
    }

    /// <summary>Creates an empty builder.</summary>
    /// <param name="comparer">Comparer used to decide whether two words are the same key.</param>
    public DictionaryIndexBuilder(IEqualityComparer<string> comparer)
    {
        _dict = new Dictionary<string, WordOccurrences>(comparer);
    }

    /// <summary>Records one occurrence of a word.</summary>
    /// <param name="word">The word that was encountered.</param>
    /// <param name="fileName">File in which the word was encountered.</param>
    /// <param name="lineNumber">Line of the file on which the word was encountered.</param>
    /// <param name="wordIndex">Position of the word within that line.</param>
    /// <exception cref="System.InvalidOperationException">The index has already been built.</exception>
    public void AddWordOccurrence(string word, string fileName, int lineNumber, int wordIndex)
    {
        EnsureNotBuilt();

        WordOccurrences current;
        if (!_dict.TryGetValue(word, out current))
        {
            _dict[word] = WordOccurrences.FirstOccurrence(fileName, lineNumber, wordIndex);
        }
        else
        {
            _dict[word] = current.AddOccurrence(fileName, lineNumber, wordIndex);
        }
    }

    /// <summary>
    /// Hands the accumulated dictionary over to a new index. The builder cannot be used afterwards.
    /// </summary>
    /// <returns>An index over the occurrences recorded so far.</returns>
    /// <exception cref="System.InvalidOperationException">The index has already been built.</exception>
    public IWordIndex Build()
    {
        EnsureNotBuilt();

        // Ownership of the dictionary moves to the index so later builder calls cannot mutate it.
        var dict = _dict;
        _dict = null;
        return new DictionaryIndex(dict);
    }

    // Fails with a descriptive error instead of a NullReferenceException once Build() has run.
    private void EnsureNotBuilt()
    {
        if (_dict == null)
        {
            throw new System.InvalidOperationException("The index has already been built; create a new builder.");
        }
    }
}

公共类字典索引生成器：IIndexBuilder
{
私人词典；
私有类字典索引：IWordIndex
{
私人只读词典；
公共字典索引（字典目录）
{
// Create a builder whose dictionary uses the default string equality comparer.
DictionaryIndexBuilder builder = new DictionaryIndexBuilder(EqualityComparer<string>.Default);
// Read the files once and build the index from them.
IWordIndex index = builder.BuildIndexFromFiles(myListOfFiles);
// Look up a word in the built index.
WordOccurrences matchSocks = index.Find("Socks");