C# 如何加速这段代码?
我得到了以下方法,用于读取txt文件并返回字典。读取约5MB的文件需要约7分钟(67000行,每行70个字符)。C# 如何加速这段代码?,c#,bioinformatics,C#,Bioinformatics,我得到了以下方法,用于读取txt文件并返回字典。读取约5MB的文件需要约7分钟(67000行,每行70个字符)。 公共静态字典FASTAFileReadIn(字符串文件) { Dictionary seq=新字典(); Regex-re; 匹配m; 分组收集组; string currentName=string.Empty; 尝试 { 使用(StreamReader sr=新StreamReader(文件)) { string line=string.Empty; 而((line=sr.Re
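public static Dictionary<string, string> FASTAFileReadIn(string file)
{
    Dictionary<string, string> seq = new Dictionary<string, string>();
    Regex re;
    Match m;
    GroupCollection group;
    string currentName = string.Empty;
    try
    {
        using (StreamReader sr = new StreamReader(file))
        {
            string line = string.Empty;
            while ((line = sr.ReadLine()) != null)
            {
                if (line.StartsWith(">"))
                {// Match Sequence
                    re = new Regex(@"^>(\S+)");
                    m = re.Match(line);
                    if (m.Success)
                    {
                        group = m.Groups;
                        if (!seq.ContainsKey(group[1].Value))
                        {
                            seq.Add(group[1].Value, string.Empty);
                            currentName = group[1].Value;
                        }
                    }
                }
                else if (Regex.Match(line.Trim(), @"\S+").Success &&
                         currentName != string.Empty)
                {
                    seq[currentName] += line.Trim();
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    finally { }
    return seq;
}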
代码的哪一部分最耗时?如何加速?谢谢!
我原本希望编译器会自动优化这一点,但我注意到的第一件事是,您在循环中每次遇到标题行都会重新构造(编译)正则表达式:
while ((line = sr.ReadLine()) != null)
{
if (line.StartsWith(">"))
{// Match Sequence
re = new Regex(@"^>(\S+)");
如果可以完全删除正则表达式,那就更好了;大多数语言都提供了某种类型的
split
函数,这种函数通常比正则表达式快得多……您还可以通过使用 BufferedStream 大幅提高读取速度。
不过,如果处理时间长达 5 分钟以上,那么 @sarnold 提到的
Regex
重复编译很可能是您最大的性能杀手。可以缓存并预编译正则表达式、重新排列条件判断的顺序、减少 Trim 的调用次数等等:
/// <summary>
/// Reads a FASTA file and returns its records as a dictionary that maps each
/// sequence identifier to the concatenated sequence text.
/// </summary>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>
/// A dictionary keyed by the first whitespace-delimited token that follows '>'
/// on a header line. Each value is the record's sequence lines, trimmed and
/// joined. If an <see cref="System.IO.IOException"/> occurs, it is reported on the
/// console and the records that were read up to that point are returned.
/// </returns>
/// <remarks>
/// Only the first record for a given identifier is kept. Lines that follow a
/// duplicate header, or a header without an identifier, are skipped rather than
/// being appended to the preceding record.
/// </remarks>
public static Dictionary<string, string> FASTAFileReadIn(string file) {
    // Sequences are accumulated in builders: repeated string concatenation would
    // re-copy the whole sequence for every line of a long record.
    var builders = new Dictionary<string, System.Text.StringBuilder>();
    Regex re = new Regex(@"^>(\S+)", RegexOptions.Compiled);
    System.Text.StringBuilder current = null;
    try {
        foreach(string line in File.ReadLines(file)) {
            // The length check keeps blank lines from throwing on line[0].
            if(line.Length > 0 && line[0] == '>') {
                // A header always closes the previous record, even when the new
                // header turns out to be unusable.
                current = null;
                Match m = re.Match(line);
                if(m.Success) {
                    string name = m.Groups[1].Value;
                    if(!builders.ContainsKey(name)) {
                        current = new System.Text.StringBuilder();
                        builders.Add(name, current);
                    }
                }
            } else if(current != null) {
                // Trimming a whitespace-only line yields an empty string, so no
                // separate "contains non-whitespace" test is needed.
                current.Append(line.Trim());
            }
        }
    } catch(IOException e) {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }

    var seq = new Dictionary<string, string>(builders.Count);
    foreach(var entry in builders) {
        seq.Add(entry.Key, entry.Value.ToString());
    }
    return seq;
}
公共静态字典FASTAFileReadIn(字符串文件){
var seq=新字典();
Regex re=new Regex(@“^>(\S+),RegexOptions.Compiled);
正则表达式非空白=新正则表达式(@“\S”,RegexOptions.Compiled);
匹配m;
string currentName=string.Empty;
试一试{
foreach(文件中的字符串行。ReadLines(文件)){
如果(第[0]行=='>'){
m=重新匹配(线);
如果(m.成功){
如果(!seq.ContainsKey(m.Groups[1].Value)){
seq.Add(m.Groups[1]。Value,string.Empty);
currentName=m.Groups[1]。值;
}
}
}else if(currentName!=string.Empty){
if(非空白.IsMatch(行)){
seq[currentName]+=line.Trim();
}
}
}
}捕获(IOE异常){
WriteLine(“已引发IO异常!”);
Console.WriteLine(如ToString());
}
返回顺序;
}
然而,这只是一个幼稚的优化。在阅读FASTA格式时,我写道:
/// <summary>
/// Reads a FASTA file without regular expressions and returns a dictionary that
/// maps each record key to its concatenated sequence text.
/// </summary>
/// <param name="filename">Path of the FASTA file to read.</param>
/// <returns>
/// A dictionary whose keys are the header text after '>' up to the first space
/// found at or after index 2 (or the rest of the line when there is none), and
/// whose values are the record's lines with trailing whitespace removed, joined
/// together.
/// </returns>
/// <exception cref="System.ArgumentException">
/// Two headers produce the same key.
/// </exception>
public static Dictionary<string, string> ReadFasta(string filename) {
    var result = new Dictionary<string, string>();
    var current = new StringBuilder();
    string currentKey = null;
    foreach(string line in File.ReadLines(filename)) {
        // The length check keeps blank lines from throwing on line[0].
        if(line.Length > 0 && line[0] == '>') {
            if(currentKey != null) {
                // Flush the record that this header terminates.
                result.Add(currentKey, current.ToString());
                current.Clear();
            }
            // IndexOf(char, 2) throws when the line is shorter than two
            // characters (a bare ">"), so only search when there is text there.
            int i = line.Length > 2 ? line.IndexOf(' ', 2) : -1;
            currentKey = i > -1 ? line.Substring(1, i - 1) : line.Substring(1);
        } else if(currentKey != null) {
            current.Append(line.TrimEnd());
        }
    }
    // The last record has no following header to trigger its flush.
    if(currentKey != null) {
        result.Add(currentKey, current.ToString());
    }
    return result;
}
publicstaticdictionary ReadFasta(字符串文件名){
var result=新字典
var current=新的StringBuilder();
字符串currentKey=null;
foreach(文件中的字符串行。ReadLines(文件名)){
如果(第[0]行=='>'){
如果(currentKey!=null){
Add(currentKey,current.ToString());
current.Clear();
}
int i=第1行索引(“”,2);
currentKey=i>-1?行子串(1,i-1):行子串(1);
}else if(currentKey!=null){
current.Append(line.TrimEnd());
}
}
如果(currentKey!=null)
Add(currentKey,current.ToString());
返回结果;
}
告诉我它是否有效;它应该快得多。
以下是我的写法。如果没有更多信息(例如字典条目的平均长度),我无法优化 StringBuilder 的初始容量。您还可以按照 Eric J. 的建议添加一个
BufferedStream
。理想情况下,如果您想提高性能,可以完全取消正则表达式,但它们更易于编写和管理,因此我理解您为什么要使用它们
/// <summary>
/// Reads a FASTA file and returns its records as a dictionary that maps each
/// sequence identifier to a <see cref="StringBuilder"/> holding the sequence.
/// </summary>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>
/// A dictionary keyed by the first whitespace-delimited token that follows '>'
/// on a header line. Each value holds the record's sequence lines, trimmed and
/// joined. If an <see cref="IOException"/> occurs, it is reported on the console
/// and the records that were read up to that point are returned.
/// </returns>
/// <remarks>
/// Only the first record for a given identifier is kept. Lines that follow a
/// duplicate header, or a header without an identifier, are skipped rather than
/// being appended to the preceding record.
/// </remarks>
public static Dictionary<string, StringBuilder> FASTAFileReadIn(string file)
{
    var seq = new Dictionary<string, StringBuilder>();
    var regName = new Regex("^>(\\S+)", RegexOptions.Compiled);
    // Builder of the record being read; null while no usable record is open.
    StringBuilder current = null;
    try
    {
        using (StreamReader sReader = new StreamReader(file))
        {
            string line;
            while ((line = sReader.ReadLine()) != null)
            {
                // Decide "header or not" from the first character so that the
                // regex only runs on header lines and an unparsable header is
                // never mistaken for sequence data.
                if (line.Length > 0 && line[0] == '>')
                {
                    current = null;
                    Match nameMatch = regName.Match(line);
                    if (nameMatch.Success)
                    {
                        string name = nameMatch.Groups[1].Value;
                        if (!seq.ContainsKey(name))
                        {
                            current = new StringBuilder();
                            seq.Add(name, current);
                        }
                    }
                }
                else if (current != null)
                {
                    // Append the whole trimmed line; taking only the first
                    // non-whitespace run would drop anything after an
                    // internal space or tab.
                    current.Append(line.Trim());
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    return seq;
}
公共静态字典FASTAFileReadIn(字符串文件)
{
var seq=新字典();
var regName=new Regex(“^>(\\S+”),RegexOptions.Compiled);
var regAppend=new Regex(“\\S+”,RegexOptions.Compiled);
Match tempMatch=null;
string currentName=string.Empty;
尝试
{
使用(StreamReader sReader=新StreamReader(文件))
{
string line=string.Empty;
而((line=sReader.ReadLine())!=null)
{
if((tempMatch=regName.Match(line)).Success)
{
if(!seq.ContainsKey(tempMatch.Groups[1].Value))
{
currentName=tempMatch.Groups[1]。值;
seq.Add(currentName,new StringBuilder());
}
}
else if((tempMatch=regAppend.Match(line)).Success&¤tName!=string.Empty)
{
seq[currentName].Append(tempMatch.Value);
}
}
}
}
捕获(IOE异常)
{
WriteLine(“已引发IO异常!”);
Console.WriteLine(如ToString());
}
返回顺序;
}
如您所见,我稍微更改了您的字典,使用优化的 StringBuilder
类来追加值。我还预编译了正则表达式。
/// <summary>
/// Reads a FASTA file without regular expressions and returns a dictionary that
/// maps each record key to its concatenated sequence text.
/// </summary>
/// <param name="filename">Path of the FASTA file to read.</param>
/// <returns>
/// A dictionary whose keys are the header text after '>' up to the first space
/// found at or after index 2 (or the rest of the line when there is none), and
/// whose values are the record's lines with trailing whitespace removed, joined
/// together.
/// </returns>
/// <exception cref="System.ArgumentException">
/// Two headers produce the same key.
/// </exception>
public static Dictionary<string, string> ReadFasta(string filename) {
    var result = new Dictionary<string, string>();
    var current = new StringBuilder();
    string currentKey = null;
    foreach(string line in File.ReadLines(filename)) {
        // The length check keeps blank lines from throwing on line[0].
        if(line.Length > 0 && line[0] == '>') {
            if(currentKey != null) {
                // Flush the record that this header terminates.
                result.Add(currentKey, current.ToString());
                current.Clear();
            }
            // IndexOf(char, 2) throws when the line is shorter than two
            // characters (a bare ">"), so only search when there is text there.
            int i = line.Length > 2 ? line.IndexOf(' ', 2) : -1;
            currentKey = i > -1 ? line.Substring(1, i - 1) : line.Substring(1);
        } else if(currentKey != null) {
            current.Append(line.TrimEnd());
        }
    }
    // The last record has no following header to trigger its flush.
    if(currentKey != null) {
        result.Add(currentKey, current.ToString());
    }
    return result;
}
/// <summary>
/// Reads a FASTA file and returns its records as a dictionary that maps each
/// sequence identifier to a <see cref="StringBuilder"/> holding the sequence.
/// </summary>
/// <param name="file">Path of the FASTA file to read.</param>
/// <returns>
/// A dictionary keyed by the first whitespace-delimited token that follows '>'
/// on a header line. Each value holds the record's sequence lines, trimmed and
/// joined. If an <see cref="IOException"/> occurs, it is reported on the console
/// and the records that were read up to that point are returned.
/// </returns>
/// <remarks>
/// Only the first record for a given identifier is kept. Lines that follow a
/// duplicate header, or a header without an identifier, are skipped rather than
/// being appended to the preceding record.
/// </remarks>
public static Dictionary<string, StringBuilder> FASTAFileReadIn(string file)
{
    var seq = new Dictionary<string, StringBuilder>();
    var regName = new Regex("^>(\\S+)", RegexOptions.Compiled);
    // Builder of the record being read; null while no usable record is open.
    StringBuilder current = null;
    try
    {
        using (StreamReader sReader = new StreamReader(file))
        {
            string line;
            while ((line = sReader.ReadLine()) != null)
            {
                // Decide "header or not" from the first character so that the
                // regex only runs on header lines and an unparsable header is
                // never mistaken for sequence data.
                if (line.Length > 0 && line[0] == '>')
                {
                    current = null;
                    Match nameMatch = regName.Match(line);
                    if (nameMatch.Success)
                    {
                        string name = nameMatch.Groups[1].Value;
                        if (!seq.ContainsKey(name))
                        {
                            current = new StringBuilder();
                            seq.Add(name, current);
                        }
                    }
                }
                else if (current != null)
                {
                    // Append the whole trimmed line; taking only the first
                    // non-whitespace run would drop anything after an
                    // internal space or tab.
                    current.Append(line.Trim());
                }
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("An IO exception has been thrown!");
        Console.WriteLine(e.ToString());
    }
    return seq;
}