C# 替换大文本文件中的长列表单词_C#_String_List_Replace_Text Processing

C# 替换大文本文件中的长列表单词

c# string list replace

C# 替换大文本文件中的长列表单词,c#,string,list,replace,text-processing,C#,String,List,Replace,Text Processing,我需要一个快速的方法来处理大文本文件我有两个文件，一个大的文本文件（~20Gb）和另一个包含1200万个组合词列表的文本文件我想在第一个文本文件中找到所有组合词，并用另一个组合词（带下划线的组合词）替换它示例“计算机信息”>替换为>“计算机信息” 我使用这段代码，但性能非常差（我在带有16Gb Ram和16核的Hp G7服务器上测试）公共部分类表单1:表单 { HashSet wordlist=新HashSet（）；私有void loadComboWords（） { 使用（Stre

我需要一个快速的方法来处理大文本文件

我有两个文件，一个大的文本文件（~20Gb）和另一个包含1200万个组合词列表的文本文件

我想在第一个文本文件中找到所有组合词，并用另一个组合词（带下划线的组合词）替换它

示例：“计算机 信息” > 替换为 > “计算机_信息”（即把组合词中的空格替换为下划线）

我使用这段代码，但性能非常差（我在带有16Gb Ram和16核的Hp G7服务器上测试）

public partial class Form1 : Form
{
    HashSet<string> wordlist = new HashSet<string>();

    private void loadComboWords()
    {
        using (StreamReader ff = new StreamReader(txtComboWords.Text))
        {
            string line;
            while ((line = ff.ReadLine()) != null)
            {
                wordlist.Add(line);
            }
        }
    }

    private void replaceWords(ref string str)
    {
        foreach (string wd in wordlist)
        {
            // replaceX(ref str, wd, wd.Replace(" ", "_"));
            if (str.IndexOf(wd) > -1)
                str.Replace(wd, wd.Replace(" ", "_"));
        }
    }

    private void button3_Click(object sender, EventArgs e)
    {
        string line;
        using (StreamReader fread = new StreamReader(txtFirstFile.Text))
        {
            string writefile = Path.GetFullPath(txtFirstFile.Text) + Path.GetFileNameWithoutExtension(txtFirstFile.Text) + "_ReplaceComboWords.txt";
            StreamWriter sw = new StreamWriter(writefile);
            long intPercent;
            label3.Text = "初始化";
            loadComboWords();
            while ((line = fread.ReadLine()) != null)
            {
                replaceWords(ref line);
                sw.WriteLine(line);
                intPercent = (fread.BaseStream.Position * 100) / fread.BaseStream.Length;
                Application.DoEvents();
                label3.Text = intPercent.ToString();
            }
            sw.Close();
            fread.Close();
            label3.Text = "完成";
        }
    }
}

在合理的时间内做这项工作有什么想法吗

谢谢

乍一看，您所采取的方法看起来很好-它应该可以正常工作，并且没有任何明显的原因会导致大量垃圾收集

我认为主要的一点是，您将只使用这16个内核中的一个：没有任何东西可以在其他15个内核之间共享负载

我认为最简单的方法是将20Gb的大文件分成16个块，并行分析每个块，然后再将这些块合并在一起。与并行扫描这16个数据块所获得的约16倍提速相比，拆分和重新组装文件所需的额外时间应该很小

概括地说，实现这一点的一种方法可能是：

    /// <summary>
    /// Placeholder for the splitting step: intended to break <paramref name="baseFile"/>
    /// into several chunk files.
    /// </summary>
    /// <param name="baseFile">Path of the large input file to split.</param>
    /// <returns>Intended to return the file names of the chunks that were created.</returns>
    private List<string> SplitFileIntoChunks(string baseFile)
    {
        // Split the file into chunks, and return a list of the filenames.
        // NOTE(review): outline only - there is no return statement yet, so this
        // stub does not compile until the splitting logic is filled in.
    }

    /// <summary>
    /// Placeholder for the per-chunk work: intended to scan one chunk file and
    /// perform the replacements on it.
    /// </summary>
    /// <param name="filename">Path of the chunk file to process.</param>
    private void AnalyseChunk(string filename)
    {
        // Analyses the file and performs replacements, 
        // perhaps writing to the same filename with a different
        // file extension
        // NOTE(review): outline only - no implementation is provided here.
    }

    /// <summary>
    /// Placeholder for the merge step: intended to concatenate the rewritten chunks
    /// back into a single output file.
    /// </summary>
    /// <param name="outputFile">Path of the combined file to produce.</param>
    /// <param name="splitFileNames">File names of the chunks, as returned by the split step.</param>
    private void CreateOutputFileFromChunks(string outputFile, List<string> splitFileNames)
    {
        // Combines the rewritten chunks created by AnalyseChunk back into
        // one large file, outputFile.
        // NOTE(review): outline only - no implementation is provided here.
    }

    /// <summary>
    /// Splits <paramref name="inputFile"/> into chunks, processes every chunk on its
    /// own task, waits for all of them, then merges the results into
    /// <paramref name="outputFile"/>.
    /// </summary>
    /// <param name="inputFile">Path of the large file to process.</param>
    /// <param name="outputFile">Path of the merged result file.</param>
    public void AnalyseFile(string inputFile, string outputFile)
    {
        List<string> splitFileNames = SplitFileIntoChunks(inputFile);

        // One task per chunk, so the chunks can be analysed concurrently.
        Task[] pending = new Task[splitFileNames.Count];
        for (int i = 0; i < pending.Length; i++)
        {
            // Each lambda gets its own local so it sees exactly one chunk name.
            string current = splitFileNames[i];
            pending[i] = Task.Factory.StartNew(() => AnalyseChunk(current));
        }

        // Block until every chunk has been processed before merging.
        Task.WaitAll(pending);

        CreateOutputFileFromChunks(outputFile, splitFileNames);
    }

私有列表将文件拆分为块（字符串基文件）
{
//将文件拆分为块，并返回文件名列表。
}
私有void分析chunk（字符串文件名）
{
//分析文件并执行替换，
//可能是用不同的文件名写入同一文件名
//文件扩展名
}
私有void CreateOutputFileFromChunks（字符串输出文件，列表拆分文件名）
{
//将AnalyseChunk创建的重写块合并回
//一个大文件，outputFile。
}
公共无效分析文件（字符串输入文件、字符串输出文件）
{
List splitfilename=splitfileintockunks（inputFile）；
var tasks=新列表（）；
foreach（SplitFileName中的字符串chunkName）
{
var task=task.Factory.StartNew（（）=>AnalyseChunk（chunkName））；
任务。添加（任务）；
}
Task.WaitAll（tasks.ToArray（））；
CreateOutputFileFromChunks（outputFile，SplitFileName）；
}

一个小问题：将流长度的计算移出循环，您只需要得到一次

编辑：此外，还包括@Pavel Gatilov的想法，即反转内部循环的逻辑，并在1200万列表中搜索行中的每个单词

有几个想法：

我认为把每一行分成几个单词，看看是否每个单词都出现在你的单词列表中会更有效。在一个哈希集中进行10次查找比在一个子字符串中进行数百万次搜索要好。如果您有复合关键字，请创建适当的索引：一个包含真实关键字中出现的所有单个单词，另一个包含所有真实关键字

也许，将字符串加载到 StringBuilder 中更适合做替换

在处理10000行之后更新进度，而不是在每行之后

在后台线程中进行处理。这不会让它更快，但应用程序将保持响应

正如Jeremy所建议的那样，并行化代码

更新

下面是一个示例代码，演示了按单词索引的思想：

/// <summary>
/// Sample of the "index by single word" idea: copies the input file to the output
/// file line by line, replacing keyphrases via GetTargetValue. Only keyphrases that
/// share at least one single word with the current line are tried.
/// </summary>
/// <remarks>
/// The file names and the word map are left as <c>null</c> placeholders in this
/// sample and must be supplied before the method can actually run.
/// </remarks>
static void ReplaceWords()
{
  string inputFileName = null;
  string outputFileName = null;

  // Maps every single word occurring in any keyphrase to the keyphrases containing it.
  IDictionary<string, IList<string>> singleWordMap = null;

  using (var source = new StreamReader(inputFileName))
  using (var target = new StreamWriter(outputFileName))
  {
    for (string line = source.ReadLine(); line != null; line = source.ReadLine())
    {
      // Break the line into single words - the unit used for the map lookup.
      var words = SplitIntoWords(line);

      var result = new StringBuilder(line);
      foreach (var word in words)
      {
        // Only words that appear in some keyphrase have an entry in the map.
        IList<string> candidates;
        if (singleWordMap.TryGetValue(word, out candidates))
        {
          Debug.Assert(candidates != null && candidates.Count > 0);

          // Try each related keyphrase; Replace leaves the text unchanged when it is absent.
          foreach (var phrase in candidates)
          {
            result.Replace(phrase, GetTargetValue(phrase));
          }
        }
      }

      // Emit the processed line.
      target.WriteLine(result);
    }
  }
}

/// <summary>
/// Placeholder for producing the replacement text of a keyphrase; ReplaceWords passes
/// the result as the replacement argument of <c>StringBuilder.Replace</c>.
/// </summary>
/// <param name="interestingKeyword">The keyphrase whose replacement is requested.</param>
/// <returns>Nothing yet - the sample leaves this unimplemented.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
private static string GetTargetValue(string interestingKeyword)
{
  throw new NotImplementedException();
}

/// <summary>
/// Placeholder for breaking a text into its single words; ReplaceWords calls it with
/// each line read from the input file.
/// </summary>
/// <param name="keyphrase">The text to split into single words.</param>
/// <returns>Nothing yet - the sample leaves this unimplemented.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
static IEnumerable<string> SplitIntoWords(string keyphrase)
{
  throw new NotImplementedException();
}

static void ReplaceWords（）
{
字符串inputFileName=null；
字符串outputFileName=null；
//这本词典把能找到的每一个单词都对应起来
//在任何关键短语中添加包含它的关键短语列表。
IDictionary singleWordMap=null；
使用（变量源=新的StreamReader（inputFileName））
{
使用（var target=newstreamwriter（outputFileName））
{
弦线；
而（（line=source.ReadLine（））！=null）
{
//首先，我们将每一行分割成一个单词——一个搜索单元
var singleWords=拆分为单词（行）；
var结果=新的StringBuilder（线）；
//对于行中的每个单词
foreach（singleWords中的var singleWord）
{
//检查该词是否存在于任何我们应该替换的关键字短语中
//如果是这样的话，请获取相关原始关键短语的列表
最有趣的关键短语；
if（！singleWordMap.TryGetValue（singleWord，out interestingkeyphases））
继续；
/// <summary>
/// Sample of the "index by single word" idea: copies the input file to the output
/// file line by line, replacing keyphrases via GetTargetValue. Only keyphrases that
/// share at least one single word with the current line are tried.
/// </summary>
/// <remarks>
/// The file names and the word map are left as <c>null</c> placeholders in this
/// sample and must be supplied before the method can actually run.
/// </remarks>
static void ReplaceWords()
{
  string inputFileName = null;
  string outputFileName = null;

  // Maps every single word occurring in any keyphrase to the keyphrases containing it.
  IDictionary<string, IList<string>> singleWordMap = null;

  using (var source = new StreamReader(inputFileName))
  using (var target = new StreamWriter(outputFileName))
  {
    for (string line = source.ReadLine(); line != null; line = source.ReadLine())
    {
      // Break the line into single words - the unit used for the map lookup.
      var words = SplitIntoWords(line);

      var result = new StringBuilder(line);
      foreach (var word in words)
      {
        // Only words that appear in some keyphrase have an entry in the map.
        IList<string> candidates;
        if (singleWordMap.TryGetValue(word, out candidates))
        {
          Debug.Assert(candidates != null && candidates.Count > 0);

          // Try each related keyphrase; Replace leaves the text unchanged when it is absent.
          foreach (var phrase in candidates)
          {
            result.Replace(phrase, GetTargetValue(phrase));
          }
        }
      }

      // Emit the processed line.
      target.WriteLine(result);
    }
  }
}

/// <summary>
/// Placeholder for producing the replacement text of a keyphrase; ReplaceWords passes
/// the result as the replacement argument of <c>StringBuilder.Replace</c>.
/// </summary>
/// <param name="interestingKeyword">The keyphrase whose replacement is requested.</param>
/// <returns>Nothing yet - the sample leaves this unimplemented.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
private static string GetTargetValue(string interestingKeyword)
{
  throw new NotImplementedException();
}

/// <summary>
/// Placeholder for breaking a text into its single words; ReplaceWords calls it with
/// each line read from the input file.
/// </summary>
/// <param name="keyphrase">The text to split into single words.</param>
/// <returns>Nothing yet - the sample leaves this unimplemented.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
static IEnumerable<string> SplitIntoWords(string keyphrase)
{
  throw new NotImplementedException();
}