C# 如何加速这段代码？_C#_Bioinformatics

C# 如何加速这段代码？

C# 如何加速这段代码？,c#,bioinformatics,C#,Bioinformatics,我得到了以下方法，用于读取txt文件并返回字典。读取约5MB的文件需要约7分钟（67000行，每行70个字符）。公共静态字典FASTAFileReadIn（字符串文件） { Dictionary seq=新字典（）； Regex-re；匹配m；分组收集组； string currentName=string.Empty；尝试 { 使用（StreamReader sr=新StreamReader（文件）） { string line=string.Empty；而（（line=sr.Re

我得到了以下方法，用于读取txt文件并返回字典。读取约5MB的文件需要约7分钟（67000行，每行70个字符）。

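public static Dictionary<string, string> FASTAFileReadIn(string file)
{
    Dictionary<string, string> seq = new Dictionary<string, string>();
    Regex re;
    Match m;
    GroupCollection group;
    string currentName = string.Empty;
    try
    {
        using (StreamReader sr = new StreamReader(file))
        {
            string line = string.Empty;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.StartsWith(">"))
                {// Match Sequence
                    re = new Regex(@"^>(\S+)");
                    m = re.Match(line);
                    if (m.Success)
                    {
                        group = m.Groups;
                        if (!seq.ContainsKey(group[1].Value))
                        {
                            seq.Add(group[1].Value, string.Empty);
                            currentName = group[1].Value;
                        }
                    }
                }
                else if (Regex.Match(line.Trim(), @"\S+").Success &&
                         currentName != string.Empty)
                {
                    seq[currentName] += line.Trim();
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    finally { }
    return seq;
}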

代码的哪一部分最耗时，如何加速

谢谢

我希望编译器会自动执行此操作，但我注意到的第一件事是，您正在每一行上编译正则表达式：

            while ((line = sr.ReadLine()) != null)
            {
                if (line.StartsWith(">"))
                {// Match Sequence
                    re = new Regex(@"^>(\S+)");

如果可以完全去掉正则表达式，那就更好了；大多数语言都提供了某种

split

函数，这类函数通常比正则表达式快得多……

您可以通过使用 BufferedStream 大幅提高读取速度。

不过，如果处理时间长达5分钟左右，那么 @sarnold 提到的每一行都重新编译

Regex

的问题，可能才是您最大的性能杀手。

缓存和编译正则表达式、重新排序条件、减少修剪次数等等

/// <summary>
/// Reads a FASTA file into a dictionary that maps each record name to its
/// concatenated sequence text.
/// </summary>
/// <remarks>
/// A header is a line whose first character is '>' followed immediately by at
/// least one non-whitespace character; the name is that run of non-whitespace
/// characters. Every other non-header line is trimmed and appended to the
/// record that is currently active. Blank lines are skipped.
/// NOTE(review): when a header repeats an already-seen name, the active record
/// is left unchanged, so the following data lines are appended to the
/// previously active record. This matches the earlier behavior; confirm it is
/// what callers want.
/// An <see cref="IOException"/> is reported on the console and whatever was
/// read up to that point is returned.
/// </remarks>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>Record names mapped to their sequences.</returns>
public static Dictionary<string, string> FASTAFileReadIn(string file) {
    var seq = new Dictionary<string, string>();
    Regex re = new Regex(@"^>(\S+)", RegexOptions.Compiled);

    // Sequence text of the active record is accumulated here and stored once
    // per record, instead of re-copying the whole string on every line.
    // Fully qualified so the method needs no additional using directive.
    var current = new System.Text.StringBuilder();
    string currentName = string.Empty;

    try {
        foreach(string line in File.ReadLines(file)) {
            // Indexing line[0] on an empty line would throw.
            if(line.Length == 0) {
                continue;
            }

            if(line[0] == '>') {
                Match m = re.Match(line);

                if(m.Success) {
                    string name = m.Groups[1].Value;

                    if(!seq.ContainsKey(name)) {
                        // Store the finished record before switching to the new one.
                        if(currentName.Length != 0) {
                            seq[currentName] = current.ToString();
                        }

                        current.Clear();
                        seq.Add(name, string.Empty);
                        currentName = name;
                    }
                }
            } else if(currentName.Length != 0) {
                // A whitespace-only line trims to "" and appends nothing, so no
                // separate non-whitespace check is needed.
                current.Append(line.Trim());
            }
        }
    } catch(IOException e) {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }

    // Store the last (or, after an I/O failure, the partially read) record.
    if(currentName.Length != 0) {
        seq[currentName] = current.ToString();
    }

    return seq;
}

公共静态字典FASTAFileReadIn（字符串文件）{
var seq=新字典（）；
Regex re=new Regex（@“^>（\S+），RegexOptions.Compiled）；
正则表达式非空白=新正则表达式（@“\S”，RegexOptions.Compiled）；
匹配m；
string currentName=string.Empty；
试一试{
foreach（文件中的字符串行。ReadLines（文件））{
如果（第[0]行=='>'）{
m=重新匹配（线）；
如果（m.成功）{
如果（！seq.ContainsKey（m.Groups[1].Value））{
seq.Add（m.Groups[1]。Value，string.Empty）；
currentName=m.Groups[1]。值；
}
}
}else if（currentName！=string.Empty）{
if（非空白.IsMatch（行））{
seq[currentName]+=line.Trim（）；
}
}
}
}捕获（IOE异常）{
WriteLine（“已引发IO异常！”）；
Console.WriteLine（如ToString（））；
}
返回顺序；
}

不过，这只是一种比较初步的优化。在查阅了FASTA格式之后，我写了下面的代码：

/// <summary>
/// Reads a FASTA file into a dictionary that maps each record key to its
/// concatenated sequence text, without using regular expressions.
/// </summary>
/// <remarks>
/// A header is any line whose first character is '>'. The key is the text
/// after '>' up to the first space found at index 2 or later, or the rest of
/// the line when there is no such space. Non-header lines have trailing
/// whitespace removed and are appended to the current record; lines that
/// appear before the first header are ignored. Blank lines are skipped.
/// </remarks>
/// <param name="filename">Path of the FASTA file to read.</param>
/// <returns>Record keys mapped to their sequences.</returns>
/// <exception cref="System.ArgumentException">
/// Two headers produce the same key (raised by <c>Dictionary.Add</c>).
/// </exception>
public static Dictionary<string, string> ReadFasta(string filename) {
    var result = new Dictionary<string, string>();
    var current = new StringBuilder();
    string currentKey = null;

    foreach(string line in File.ReadLines(filename)) {
        // Indexing line[0] on an empty line would throw.
        if(line.Length == 0) {
            continue;
        }

        if(line[0] == '>') {
            // A new header closes the record collected so far.
            if(currentKey != null) {
                result.Add(currentKey, current.ToString());
                current.Clear();
            }

            // IndexOf(' ', 2) throws when the line is shorter than the start
            // index (a lone ">"), so only search lines that are long enough.
            int i = line.Length > 2 ? line.IndexOf(' ', 2) : -1;

            currentKey = i > -1 ? line.Substring(1, i - 1) : line.Substring(1);
        } else if(currentKey != null) {
            current.Append(line.TrimEnd());
        }
    }

    // Store the final record, which has no following header to close it.
    if(currentKey != null) {
        result.Add(currentKey, current.ToString());
    }

    return result;
}

publicstaticdictionary ReadFasta（字符串文件名）{
var result=新字典
var current=新的StringBuilder（）；
字符串currentKey=null；
foreach（文件中的字符串行。ReadLines（文件名））{
如果（第[0]行=='>'）{
如果（currentKey！=null）{
Add（currentKey，current.ToString（））；
current.Clear（）；
}
int i=第1行索引（“”，2）；
currentKey=i>-1？行子串（1，i-1）：行子串（1）；
}else if（currentKey！=null）{
current.Append（line.TrimEnd（））；
}
}
如果（currentKey！=null）
Add（currentKey，current.ToString（））；
返回结果；
}

告诉我它是否有效；它应该快得多。

以下是我会采用的写法。如果没有更多信息（例如字典条目的平均长度），我无法优化StringBuilder的容量。您还可以按照Eric J.的建议添加一个

BufferedStream

。理想情况下，如果您想提高性能，可以完全取消

正则表达式，但它们更易于编写和管理，因此我理解您为什么要使用它们
/// <summary>
/// Reads a FASTA file into a dictionary that maps each record name to a
/// <see cref="StringBuilder"/> holding its concatenated sequence text.
/// </summary>
/// <remarks>
/// A header is a line whose first character is '>'; its name is the run of
/// non-whitespace characters directly after '>'. A '>' line without such a
/// name is ignored rather than appended to the sequence. Every other line is
/// trimmed and appended in full to the active record, so text after an
/// internal space is kept.
/// NOTE(review): when a header repeats an already-seen name, the active record
/// is left unchanged, so the following data lines are appended to the
/// previously active record. This matches the earlier behavior; confirm it is
/// what callers want.
/// An <see cref="IOException"/> is reported on the console and whatever was
/// read up to that point is returned.
/// </remarks>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>Record names mapped to their sequence builders.</returns>
public static Dictionary<string, StringBuilder> FASTAFileReadIn(string file)
{
    var seq = new Dictionary<string, StringBuilder>();
    var regName = new Regex("^>(\\S+)", RegexOptions.Compiled);

    // Builder of the active record; kept in a local so data lines do not need
    // a dictionary lookup each time. Null until the first valid header.
    StringBuilder current = null;
    try
    {
        using (StreamReader sReader = new StreamReader(file))
        {
            string line;
            while ((line = sReader.ReadLine()) != null)
            {
                if (line.Length > 0 && line[0] == '>')
                {
                    Match header = regName.Match(line);
                    if (header.Success)
                    {
                        string name = header.Groups[1].Value;
                        if (!seq.ContainsKey(name))
                        {
                            current = new StringBuilder();
                            seq.Add(name, current);
                        }
                    }
                }
                else if (current != null)
                {
                    // Trim() yields "" for blank or whitespace-only lines, so
                    // they append nothing.
                    current.Append(line.Trim());
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }

    return seq;
}

公共静态字典FASTAFileReadIn（字符串文件）
{
var seq=新字典（）；
var regName=new Regex（“^>（\\S+”），RegexOptions.Compiled）；
var regAppend=new Regex（“\\S+”，RegexOptions.Compiled）；
Match tempMatch=null；
string currentName=string.Empty；
尝试
{
使用（StreamReader sReader=新StreamReader（文件））
{
string line=string.Empty；
而（（line=sReader.ReadLine（））！=null）
{
if（（tempMatch=regName.Match（line））.Success）
{
if（！seq.ContainsKey（tempMatch.Groups[1].Value））
{
currentName=tempMatch.Groups[1]。值；
seq.Add（currentName，new StringBuilder（））；
}
}
else if（（tempMatch=regAppend.Match（line））.Success&¤tName！=string.Empty）
{
seq[currentName].Append（tempMatch.Value）；
}
}
}
}
捕获（IOE异常）
{
WriteLine（“已引发IO异常！”）；
Console.WriteLine（如ToString（））；
}
返回顺序；
}

如您所见，我稍微更改了您的字典，使用优化的StringBuilder类来附加值。我还预编译了正则表达式。
/// <summary>
/// Reads a FASTA file into a dictionary that maps each record key to its
/// concatenated sequence text, without using regular expressions.
/// </summary>
/// <remarks>
/// A header is any line whose first character is '>'. The key is the text
/// after '>' up to the first space found at index 2 or later, or the rest of
/// the line when there is no such space. Non-header lines have trailing
/// whitespace removed and are appended to the current record; lines that
/// appear before the first header are ignored. Blank lines are skipped.
/// </remarks>
/// <param name="filename">Path of the FASTA file to read.</param>
/// <returns>Record keys mapped to their sequences.</returns>
/// <exception cref="System.ArgumentException">
/// Two headers produce the same key (raised by <c>Dictionary.Add</c>).
/// </exception>
public static Dictionary<string, string> ReadFasta(string filename) {
    var result = new Dictionary<string, string>();
    var current = new StringBuilder();
    string currentKey = null;

    foreach(string line in File.ReadLines(filename)) {
        // Indexing line[0] on an empty line would throw.
        if(line.Length == 0) {
            continue;
        }

        if(line[0] == '>') {
            // A new header closes the record collected so far.
            if(currentKey != null) {
                result.Add(currentKey, current.ToString());
                current.Clear();
            }

            // IndexOf(' ', 2) throws when the line is shorter than the start
            // index (a lone ">"), so only search lines that are long enough.
            int i = line.Length > 2 ? line.IndexOf(' ', 2) : -1;

            currentKey = i > -1 ? line.Substring(1, i - 1) : line.Substring(1);
        } else if(currentKey != null) {
            current.Append(line.TrimEnd());
        }
    }

    // Store the final record, which has no following header to close it.
    if(currentKey != null) {
        result.Add(currentKey, current.ToString());
    }

    return result;
}

/// <summary>
/// Reads a FASTA file into a dictionary that maps each record name to a
/// <see cref="StringBuilder"/> holding its concatenated sequence text.
/// </summary>
/// <remarks>
/// A header is a line whose first character is '>'; its name is the run of
/// non-whitespace characters directly after '>'. A '>' line without such a
/// name is ignored rather than appended to the sequence. Every other line is
/// trimmed and appended in full to the active record, so text after an
/// internal space is kept.
/// NOTE(review): when a header repeats an already-seen name, the active record
/// is left unchanged, so the following data lines are appended to the
/// previously active record. This matches the earlier behavior; confirm it is
/// what callers want.
/// An <see cref="IOException"/> is reported on the console and whatever was
/// read up to that point is returned.
/// </remarks>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>Record names mapped to their sequence builders.</returns>
public static Dictionary<string, StringBuilder> FASTAFileReadIn(string file)
{
    var seq = new Dictionary<string, StringBuilder>();
    var regName = new Regex("^>(\\S+)", RegexOptions.Compiled);

    // Builder of the active record; kept in a local so data lines do not need
    // a dictionary lookup each time. Null until the first valid header.
    StringBuilder current = null;
    try
    {
        using (StreamReader sReader = new StreamReader(file))
        {
            string line;
            while ((line = sReader.ReadLine()) != null)
            {
                if (line.Length > 0 && line[0] == '>')
                {
                    Match header = regName.Match(line);
                    if (header.Success)
                    {
                        string name = header.Groups[1].Value;
                        if (!seq.ContainsKey(name))
                        {
                            current = new StringBuilder();
                            seq.Add(name, current);
                        }
                    }
                }
                else if (current != null)
                {
                    // Trim() yields "" for blank or whitespace-only lines, so
                    // they append nothing.
                    current.Append(line.Trim());
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }

    return seq;
}