如何在C#中解析文本文件并达到IO受限(IO bound)的速度?

如何在C#中解析文本文件并进行io绑定?,c#,performance,file-io,C#,Performance,File Io,众所周知,如果您从光盘读取数据,您将受到IO限制,并且您可以比从光盘读取数据更快地处理/解析读取的数据 但这种普遍的智慧(神话?)并没有反映在我的测试中。当我读取一个文本文件时,每行都有一个双精度and和int,并用一个空格隔开,我的速度比我的物理磁盘速度慢得多(系数6)。 文本文件如下所示 1,1 0 2,1 1 3,1 2 更新 当我一次读取一个包含完整缓冲区的读取文件以获得“真实”性能时,我已经包括了PInvoke性能 ReadFile性能-ReadFileIntoByteBuffer

众所周知,如果从磁盘读取数据,程序会受IO限制(IO bound),因为处理/解析已读取数据的速度比从磁盘读取数据的速度更快

但这种普遍的看法(或者说神话?)并没有反映在我的测试中。我读取的文本文件每行包含一个double和一个int,二者用一个空格隔开,而我的解析速度比物理磁盘的读取速度慢得多(相差约6倍)。 文本文件如下所示

1,1 0
2,1 1
3,1 2
更新:我已加入通过PInvoke调用ReadFile、一次性把整个文件读入缓冲区的测试,以便得到“真实”的磁盘读取性能

  • ReadFile性能-ReadFileIntoByteBuffer
  • StreamReader.ReadLine性能-CountLines
  • StreamReader.ReadLine+不安全解析的性能-ParseLinesUnsafe
  • StreamReader.Read+不安全字符缓冲区解析的性能-ParseLinesUnsafeCharBuf
  • StreamReader.ReadLine+解析的性能-ParseLines
结果是

Did native read 179,0MB in                    0,4s, 484,2MB/s
Did read 10.000.000 lines in                  1,6s, 112,7MB/s
Did parse and read unsafe 179,0MB in          2,3s,  76,5MB/s
Did parse and read unsafe char buf 179,0MB in 2,8s,  63,5MB/s
Did read and parse 179,0MB in                 9,3s,  19,3MB/s
虽然我尝试跳过ParseLinesUnsafeCharBuf中的字符串构造开销,但它仍然比每次分配新字符串的版本慢很多。使用最简单的解决方案,它仍然比原来的20MB好得多,但我确实认为.NET应该可以做得更好。如果删除解析字符串的逻辑,我会得到258,8 MB/s,这是非常好的,接近本机速度。但我看不到一种使用不安全代码来简化解析的方法。我必须处理不完整的行,这使得它非常复杂

更新 从数字中可以清楚地看出,一个简单的string.split的成本已经太高了。但是StringReader的成本也相当高。一个高度优化的解决方案看起来如何更接近真实的光盘速度?我已经尝试了许多使用不安全代码和字符缓冲区的方法,但是性能提高了30%,但没有达到我需要的数量级。我可以用100MB/s的解析速度。这应该可以通过托管代码实现,还是我错了

难道用C#解析的速度就不可能超过从硬盘读取的速度吗?我的硬盘是英特尔Postville X25M,CPU是一颗较旧的Intel双核处理器。机器有3 GB内存,运行Windows 7,安装了.NET 3.5 SP1和.NET 4

但我在普通硬盘上也看到了同样的结果。今天的硬盘的线性读取速度可达400MB/s。这是否意味着我应该重新构造我的应用程序,以便在实际需要时按需读取数据,而不是急切地将数据读入内存,因为对象图的增加会使GC周期更长,从而以更高的GC时间为代价

如果我的托管应用程序使用超过500MB的内存,它的响应速度就会大大降低。一个主要的影响因素似乎是对象图的复杂性。因此,在需要时读取数据可能更好。至少这是我对当前数据的结论

这是代码

using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Diagnostics;
using System.Runtime.InteropServices;
using Microsoft.Win32.SafeHandles;
using System.ComponentModel;

namespace IOBound
{
    class Program
    {
        /// <summary>
        /// Benchmark driver. Creates the test data file if it does not exist yet,
        /// then times each read/parse strategy in turn and prints the elapsed
        /// time and throughput of each one to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string path = @"C:\Source\IOBound\NumericData.txt";
            bool haveData = File.Exists(path);
            if (!haveData)
            {
                CreateTestData(path);
            }

            // File size in whole megabytes; used as the numerator of every MB/s figure.
            long fileLength = new FileInfo(path).Length;
            int sizeInMb = (int)(fileLength / (1024 * 1024));

            var timer = new Stopwatch();
            double seconds;

            // 1) Raw ReadFile via P/Invoke.
            timer.Start();
            uint bytesRead = ReadFileIntoByteBuffer(path);
            timer.Stop();
            seconds = timer.Elapsed.TotalSeconds;
            Console.WriteLine("Did native read {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                bytesRead / (1024 * 1024), seconds, sizeInMb / seconds);

            // 2) StreamReader.ReadLine without any parsing.
            timer.Reset();
            timer.Start();
            int lineCount = CountLines(path);
            timer.Stop();
            seconds = timer.Elapsed.TotalSeconds;
            Console.WriteLine("Did read {0:N0} lines in {1:F1}s, {2:F1}MB/s",
                lineCount, seconds, sizeInMb / seconds);

            // 3) ReadLine plus hand-written pointer-based parsing.
            timer.Reset();
            timer.Start();
            ParseLinesUnsafe(path);
            timer.Stop();
            seconds = timer.Elapsed.TotalSeconds;
            Console.WriteLine("Did parse and read unsafe {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                sizeInMb, seconds, sizeInMb / seconds);

            // 4) Block reads into a char buffer plus pointer-based parsing.
            timer.Reset();
            timer.Start();
            ParseLinesUnsafeCharBuf(path);
            timer.Stop();
            seconds = timer.Elapsed.TotalSeconds;
            Console.WriteLine("Did parse and read unsafe char buf {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                sizeInMb, seconds, sizeInMb / seconds);

            // 5) ReadLine plus string.Split / double.Parse / int.Parse.
            timer.Reset();
            timer.Start();
            ParseLines(path);
            timer.Stop();
            seconds = timer.Elapsed.TotalSeconds;
            Console.WriteLine("Did read and parse {0:F1}MB in {1:F1}s, {2:F1}MB/s",
                sizeInMb, seconds, sizeInMb / seconds);

        }

        /// <summary>
        /// Reads the file with a single native <c>ReadFile</c> call into one large
        /// managed buffer, as a baseline for raw read throughput.
        /// </summary>
        /// <param name="data">Path of the file to read.</param>
        /// <returns>Number of bytes the native call reported as read.</returns>
        /// <exception cref="Win32Exception">The native <c>ReadFile</c> call returned 0 (failure).</exception>
        private unsafe static uint ReadFileIntoByteBuffer(string data)
        {
            // Open read-only: the two-argument FileStream constructor asks for
            // read/write access, which fails on read-only files although this
            // method never writes.
            using (var stream = new FileStream(data, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                byte[] buf = new byte[200 * 1024 * 1024];
                fixed (byte* pBuf = &buf[0])
                {
                    uint dwRead;
                    // Request exactly as many bytes as the pinned buffer holds so the
                    // read size can never drift apart from the allocation size.
                    if (ReadFile(stream.SafeFileHandle, pBuf, (uint)buf.Length, out dwRead, IntPtr.Zero) == 0)
                    {
                        // The P/Invoke is declared with SetLastError = true, so the
                        // parameterless Win32Exception picks up the native error code.
                        throw new Win32Exception();
                    }
                    return dwRead;
                }
            }
        }

        /// <summary>
        /// Counts the lines of a text file by calling <c>StreamReader.ReadLine</c>
        /// until it returns null. The line contents are discarded.
        /// </summary>
        /// <param name="data">Path of the text file.</param>
        /// <returns>Number of lines read.</returns>
        private static int CountLines(string data)
        {
            int total = 0;
            using (var input = new StreamReader(data))
            {
                for (string current = input.ReadLine(); current != null; current = input.ReadLine())
                {
                    total++;
                }
            }
            return total;
        }

        /// <summary>
        /// Reads the file in large character blocks and parses every line of the form
        /// <c>&lt;integer&gt;,&lt;fraction&gt; &lt;integer&gt;</c> with pointer arithmetic,
        /// avoiding one string allocation per line. A line that is cut off at the end
        /// of a block is moved to the front of the buffer and completed by the next read.
        /// The decimal separator is hard-coded as ','. The parsed values are collected
        /// in local lists and discarded when the method returns.
        /// </summary>
        /// <param name="data">Path of the text file to parse.</param>
        unsafe private static void ParseLinesUnsafeCharBuf(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();

            using (var reader = new StreamReader(data))
            {
                double d = 0;
                long a = 0, b = 0, div = 1;
                int i = 0;
                char[] buffer = new char[10*1000*1000];
                int readChars = 0;
                int startIdx = 0;   // number of carried-over chars at the front of the buffer

                fixed (char* ln = buffer)
                {
                    while ((readChars = reader.Read(buffer, startIdx, buffer.Length - startIdx)) != 0)
                    {
                        char* pEnd = ln + readChars + startIdx;
                        char* pCur = ln;
                        char* pLineStart = null;

                        // The carry-over has been accounted for in pEnd. Reset it so a
                        // block that ends without a partial line does not reuse a stale offset.
                        startIdx = 0;

                        while (pCur != pEnd)
                        {
                            a = 0;
                            b = 0;
                            div = 1;

                            // Skip line terminators. The parentheses matter: without them
                            // "a && b || c" dereferences pCur even when pCur == pEnd.
                            while (pCur != pEnd && (*pCur == '\r' || *pCur == '\n'))
                            {
                                pCur++;
                            }
                            pLineStart = pCur;

                            // Integer part of the double.
                            while (pCur != pEnd && char.IsNumber(*pCur))
                            {
                                a = a * 10 + (*pCur++ - '0');
                            }
                            if (pCur == pEnd)
                            {
                                goto incompleteLine;
                            }
                            if (*pCur != ',')
                            {
                                // No decimal separator (this includes hitting the end of the
                                // line): the line is malformed, not incomplete.
                                goto skipRest;
                            }
                            pCur++;

                            // Fractional part: accumulate the digits and the matching power of ten.
                            while (pCur != pEnd && char.IsNumber(*pCur))
                            {
                                b = b * 10 + (*pCur++ - '0');
                                div *= 10;
                            }
                            if (pCur == pEnd)
                            {
                                goto incompleteLine;
                            }
                            d = a + ((double)b) / div;

                            // Skip the separator between the two columns, but never run across
                            // a line terminator into the following line.
                            while (pCur != pEnd && *pCur != '\r' && *pCur != '\n' && char.IsWhiteSpace(*pCur))
                            {
                                pCur++;
                            }
                            if (pCur == pEnd)
                            {
                                goto incompleteLine;
                            }
                            if (*pCur == '\r' || *pCur == '\n')
                            {
                                // The line ended before the int column: malformed.
                                goto skipRest;
                            }

                            i = 0;
                            while (pCur != pEnd && char.IsNumber(*pCur))
                            {
                                i = i * 10 + (*pCur++ - '0');
                            }
                            if (pCur == pEnd)
                            {
                                // The int may continue in the next block.
                                // NOTE(review): a final record with no character after it (file
                                // without trailing newline) therefore stays in the carry-over
                                // and is never added.
                                goto incompleteLine;
                            }

                            doubles.Add(d);
                            ints.Add(i);

                            continue;

incompleteLine:
                            // Move the partial line to the front of the buffer; the next Read
                            // appends behind it. BlockCopy counts bytes, hence the factor 2 per char.
                            startIdx = (int)(pEnd - pLineStart);
                            Buffer.BlockCopy(buffer, (int)(pLineStart - ln) * 2, buffer, 0, 2 * startIdx);
                            break;
skipRest:
                            // Discard the remainder of a malformed line.
                            // NOTE(review): if the block ends inside such a line, its tail in the
                            // next block is parsed as if it were a new line.
                            while (pCur != pEnd && *pCur != '\r' && *pCur != '\n')
                            {
                                pCur++;
                            }
                            continue;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Reads the file line by line with <c>StreamReader.ReadLine</c> and parses each
        /// line of the form <c>&lt;integer&gt;,&lt;fraction&gt; &lt;integer&gt;</c> through a
        /// pinned char pointer instead of <c>Split</c>/<c>Parse</c>. The decimal separator
        /// is hard-coded as ','. The parsed values are collected in local lists and
        /// discarded when the method returns.
        /// </summary>
        /// <param name="data">Path of the text file to parse.</param>
        unsafe private static void ParseLinesUnsafe(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();

            using (var reader = new StreamReader(data))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    int len = line.Length;

                    // Parser state must start fresh for every line; carrying the cursor and
                    // the accumulators over from the previous line indexes past the end of
                    // the current string and corrupts the values.
                    long a = 0, b = 0;
                    int ix = 0;

                    fixed (char* ln = line)
                    {
                        // Integer part of the double.
                        while (ix < len && char.IsNumber(ln[ix]))
                        {
                            a = a * 10 + (ln[ix++] - '0');
                        }

                        // Without a fractional part the value is just the integer part.
                        double d = a;
                        if (ix < len && ln[ix] == ',')
                        {
                            ix++;
                            long div = 1;
                            while (ix < len && char.IsNumber(ln[ix]))
                            {
                                b = b * 10 + (ln[ix++] - '0');
                                div *= 10;
                            }
                            d = a + ((double)b) / div;
                        }

                        // Separator between the two columns.
                        while (ix < len && char.IsWhiteSpace(ln[ix]))
                        {
                            ix++;
                        }

                        int i = 0;
                        while (ix < len && char.IsNumber(ln[ix]))
                        {
                            i = i * 10 + (ln[ix++] - '0');
                        }

                        doubles.Add(d);
                        ints.Add(i);   // store the parsed value, not the cursor position
                    }
                }
            }
        }



        /// <summary>
        /// Straightforward managed baseline: reads the file line by line, splits each
        /// line at the space character and converts the two fields with
        /// <c>double.Parse</c> and <c>int.Parse</c>. Lines that do not split into exactly
        /// two fields are ignored. The parsed values are collected in local lists and
        /// discarded when the method returns.
        /// </summary>
        /// <param name="data">Path of the text file to parse.</param>
        private static void ParseLines(string data)
        {
            var doubles = new List<double>();
            var ints = new List<int>();
            char[] separators = { ' ' };

            using (var input = new StreamReader(data))
            {
                for (string current = input.ReadLine(); current != null; current = input.ReadLine())
                {
                    string[] fields = current.Split(separators);
                    if (fields.Length != 2)
                    {
                        continue;
                    }

                    doubles.Add(double.Parse(fields[0]));
                    ints.Add(int.Parse(fields[1]));
                }
            }
        }

        /// <summary>
        /// Creates (or overwrites) the benchmark input file: ten million lines, each
        /// holding the double <c>1.1 + i</c> and the int <c>i</c> separated by one space,
        /// written as UTF-8.
        /// </summary>
        /// <param name="fileName">Path of the file to create.</param>
        static void CreateTestData(string fileName)
        {
            const int LineCount = 10 * 1000 * 1000;

            var output = new FileStream(fileName, FileMode.Create);
            using (var writer = new StreamWriter(output, Encoding.UTF8))
            {
                int index = 0;
                while (index < LineCount)
                {
                    // NOTE(review): no format provider is passed, so the decimal separator
                    // presumably follows the current culture - confirm against the parsers.
                    writer.WriteLine("{0} {1}", 1.1d + index, index);
                    index++;
                }
            }
        }

        /// <summary>
        /// P/Invoke declaration for <c>ReadFile</c> exported by kernel32.dll. Declared with
        /// <c>SetLastError = true</c>; the caller in this file treats a return value of 0
        /// as failure and raises a <c>Win32Exception</c>.
        /// </summary>
        /// <param name="hFile">Handle of the file to read from.</param>
        /// <param name="lpBuffer">Pointer to the buffer that receives the data.</param>
        /// <param name="nNumberOfBytesToRead">Maximum number of bytes to read.</param>
        /// <param name="lpNumberOfBytesRead">Receives the number of bytes read.</param>
        /// <param name="lpOverlapped">Overlapped structure pointer; the caller in this file passes <c>IntPtr.Zero</c>.</param>
        [DllImport("kernel32.dll", SetLastError = true)]
        unsafe static extern uint ReadFile(SafeFileHandle hFile, [Out] byte* lpBuffer, uint nNumberOfBytesToRead, out uint lpNumberOfBytesRead, IntPtr lpOverlapped);

    }
}
使用系统;
使用System.Collections.Generic;
使用系统文本;
使用System.IO;
使用系统诊断;
使用System.Runtime.InteropServices;
使用Microsoft.Win32.SafeHandles;
使用系统组件模型;
命名空间IOBOND
{
班级计划
{
静态void Main(字符串[]参数)
{
字符串数据=@“C:\Source\IOBound\NumericData.txt”;
如果(!File.Exists(data))
{
CreateTestData(数据);
}
intmb=(int)(新文件信息(数据).Length/(1024*1024));
var sw=Stopwatch.StartNew();
uint bytes=ReadFileIntoByteBuffer(数据);
sw.Stop();
WriteLine(“本机读取了{1:F1}s中的{0:F1}MB,{2:F1}MB/s”,
字节/(1024*1024),sw.eassed.TotalSeconds,MB/sw.eassed.TotalSeconds);
sw=秒表。开始新();
int n=计数线(数据);
sw.Stop();
WriteLine(“在{1:F1}s、{2:F1}MB/s中读取了{0:N0}行”,
n、 软件运行总秒数,MB/sw运行总秒数);
sw=秒表。开始新();
ParseLinesUnsafe(数据);
sw.Stop();
WriteLine(“在{1:F1}s,{2:F1}MB/s中解析并读取不安全的{0:F1}MB”,
MB,sw.Appead.TotalSeconds,MB/sw.Appead.TotalSeconds);
sw=秒表。开始新();
ParseLinesUnsafeCharBuf(数据);
sw.Stop();
WriteLine(“在{1:F1}s,{2:F1}MB/s中解析并读取不安全的字符buf{0:F1}MB”,
MB,sw.Appead.TotalSeconds,MB/sw.Appead.TotalSeconds);
sw=秒表。开始新();
解析行(数据);
sw.Stop();
WriteLine(“在{1:F1}s,{2:F1}MB/s中读取并解析了{0:F1}MB”,
MB,sw.Appead.TotalSeconds,MB/sw.Appead.TotalSeconds);
}
私有不安全静态uint ReadFileIntoByteBuffer(字符串数据)
{
使用(var stream=newfilestream(数据,FileMode.Open))
{
字节[]buf=新字节[200*1024*1024];
已修复(字节*pBuf=&buf[0])
{
uint-dwRead=0;
if(ReadFile(stream.SafeFileHandle,pBuf,200*1000*1000,out-dwRead,IntPtr.Zero)==0)
{
抛出新的Win32Exception();
}
返回dwRead;
}
}
}
专用静态整数计数行(字符串数据)
{
使用(变量读取器=新的StreamReader(数据))
{
弦线;
整数计数=0;
而((line=reader.ReadLine())!=null)
{
计数++;
}
返回计数;
}
}
不安全的私有静态void ParseLinesUnsafeCharBuf(字符串数据)
{
var dobules=新列表();
var ints=新列表();
使用(var reader=newstreamreade)
<runtime>
   <gcServer enabled="true" />
</runtime>
// Parses one line of the form "<integer>.<fraction> <integer>" through a pinned
// char pointer. `line` is the current input line supplied by the surrounding code.
int len = line.Length;
fixed (char* ln = line)
{
    long a = 0, b = 0;
    int ix = 0;

    // Integer part of the double.
    while (ix < len && char.IsNumber(ln[ix]))
    {
        a = a * 10 + (ln[ix++] - '0');
    }

    // Without a fractional part the value is just the integer part, so d is
    // always assigned.
    double d = a;
    if (ix < len && ln[ix] == '.')
    {
        ix++;
        long div = 1;
        while (ix < len && char.IsNumber(ln[ix]))
        {
            // Shift the accumulated digits and append the new one ("=" rather than
            // "+=", which would add the old value a second time).
            b = b * 10 + (ln[ix++] - '0');
            div *= 10;
        }
        d = a + ((double)b)/div;
    }

    // Separator between the two columns.
    while (ix < len && char.IsWhiteSpace(ln[ix]))
    {
        ix++;
    }

    int i = 0;
    while (ix < len && char.IsNumber(ln[ix]))
    {
        i = i * 10 + (ln[ix++] - '0');
    }
}
/// <summary>
/// Creates (or overwrites) the benchmark input file with <c>linecount</c> lines, each
/// holding the double <c>1.1 + i</c> and the int <c>i</c> separated by one space,
/// written as UTF-8 through a write-through file stream.
/// </summary>
/// <param name="fileName">Path of the file to create.</param>
/// <returns>The value of <c>linecount</c>.</returns>
static int CreateTestData(string fileName)
{
    var output = new FileStream(fileName, FileMode.Create, FileAccess.Write, FileShare.None, 4096, FileOptions.WriteThrough);
    using (var writer = new StreamWriter(output, Encoding.UTF8))
    {
        int index = 0;
        while (index < linecount)
        {
            writer.WriteLine("{0} {1}", 1.1d + index, index);
            index++;
        }
    }

    return linecount;
}
/// <summary>
/// Formats the same <c>linecount</c> lines that the test-data writer produces but
/// throws every formatted string away, so only the formatting cost remains.
/// </summary>
/// <param name="fileName">Not used by this method.</param>
/// <returns>The value of <c>linecount</c>.</returns>
static int PrintTestData(string fileName)
{
    int index = 0;
    while (index < linecount)
    {
        String.Format("{0} {1}", 1.1d + index, index);
        index++;
    }

    return linecount;
}
/// <summary>
/// Parses the file straight from raw bytes: reads blocks into a pinned byte buffer,
/// extracts one "&lt;integer&gt;.&lt;fraction&gt; &lt;integer&gt;" record per line with
/// pointer arithmetic and hands each record to <c>DoSomething</c>. Unprocessed bytes
/// at the end of a block are moved to the front of the buffer before the next read.
/// </summary>
/// <param name="data">Path of the file to parse.</param>
/// <returns>Number of records passed to <c>DoSomething</c>.</returns>
private static unsafe int ParseFast(string data)
{
    int count = 0, valid = 0, pos, stop, temp;
    byte[] buffer = new byte[ushort.MaxValue];

    const byte Zero = (byte) '0';
    const byte Nine = (byte) '9';
    const byte Dot = (byte)'.';
    const byte Space = (byte)' ';
    const byte Tab = (byte) '\t';
    const byte Line = (byte) '\n';

    fixed (byte *ptr = buffer)
    using (Stream reader = File.OpenRead(data))
    {
        // `valid` is the number of usable bytes in the buffer (carry-over + new data).
        while (0 != (temp = reader.Read(buffer, valid, buffer.Length - valid)))
        {
            valid += temp;
            pos = 0;

            // Records are only started before `stop`, which keeps a 1024-byte margin to
            // the end of the buffer - presumably so a record begun near the end is not
            // cut off.
            // NOTE(review): bytes left after `stop` when the following Read returns 0
            // are never parsed - confirm whether the tail of the file can be lost.
            stop = Math.Min(buffer.Length - 1024, valid);
            while (pos < stop)
            {
                double d;
                long a = 0, b = 0;

                // Integer part of the double.
                while (pos < valid && ptr[pos] >= Zero && ptr[pos] <= Nine)
                {
                    a = a*10 + (ptr[pos++] - Zero);
                }

                if (pos < valid && ptr[pos] == Dot)
                {
                    pos++;
                    long div = 1;
                    while (pos < valid && ptr[pos] >= Zero && ptr[pos] <= Nine)
                    {
                        // Shift the accumulated digits and append the new one ("=" rather
                        // than "+=", which would add the old value a second time).
                        b = b*10 + (ptr[pos++] - Zero);
                        div *= 10;
                    }
                    d = a + ((double) b)/div;
                }
                else
                {
                    d = a;
                }

                // Separator between the two columns.
                while (pos < valid && (ptr[pos] == Space || ptr[pos] == Tab))
                {
                    pos++;
                }

                int i = 0;
                while (pos < valid && ptr[pos] >= Zero && ptr[pos] <= Nine)
                {
                    i = i*10 + (ptr[pos++] - Zero);
                }

                DoSomething(d, i);
                count++;   // one more record handed over; this is the return value

                // Advance to the first digit of the next line.
                while (pos < stop && ptr[pos] != Line)
                {
                    pos++;
                }
                while (pos < stop && !(ptr[pos] >= Zero && ptr[pos] <= Nine))
                {
                    pos++;
                }
            }

            // Carry the unprocessed tail over to the front of the buffer.
            if (pos < valid)
            {
                Buffer.BlockCopy(buffer, pos, buffer, 0, valid - pos);
            }
            valid -= pos;
        }
    }
    return count;
}