Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/csharp/266.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
c#多线程处理100个大文件行_C#_Multithreading_Asynchronous_Async Await_Batch Processing - Fatal编程技术网

C#:使用多线程以每批 100 行的方式处理大文件

c#多线程处理100个大文件行,c#,multithreading,asynchronous,async-await,batch-processing,C#,Multithreading,Asynchronous,Async Await,Batch Processing,我有一个500.000.000行的文件 public class FileProcessor { public async Task ProcessFile() { List<Task> tasks = new List<Task>(); var lines = File.ReadAllLines("File.txt").Batch(100); foreach (IEnumerable<string&g

我有一个500.000.000行的文件

public class FileProcessor
{
    // Upper bound on the number of batches processed at the same time
    // (one worker per logical core).
    private static readonly int MaxConcurrentBatches = System.Environment.ProcessorCount;

    /// <summary>
    /// Processes "File.txt" in batches of 100 lines, running at most
    /// <c>MaxConcurrentBatches</c> batches concurrently.
    /// </summary>
    /// <returns>A task that completes when every batch has been processed.</returns>
    /// <remarks>
    /// If a batch fails, its exception is rethrown as soon as that batch is
    /// observed, which may be before the remaining lines have been read.
    /// </remarks>
    public async Task ProcessFile()
    {
        var inFlight = new List<Task>(MaxConcurrentBatches);

        // File.ReadLines streams the file lazily; File.ReadAllLines would
        // materialize every line in memory before the first batch starts.
        foreach (IEnumerable<string> linesBatch in File.ReadLines("File.txt").Batch(100))
        {
            // Throttle: wait for a slot instead of queuing one task per batch,
            // so neither the task list nor the pending work grows without bound.
            if (inFlight.Count >= MaxConcurrentBatches)
            {
                Task finished = await Task.WhenAny(inFlight);
                inFlight.Remove(finished);
                await finished; // rethrows if the batch faulted
            }

            IEnumerable<string> localLinesBatch = linesBatch;
            inFlight.Add(Task.Run(() =>
            {
                // Perform operation on localLinesBatch
            }));
        }

        // Drain whatever is still running after the last line was read.
        await Task.WhenAll(inFlight);
    }
}

public static class LinqExtensions
{
    /// <summary>
    /// Splits <paramref name="source"/> into consecutive batches of
    /// <paramref name="size"/> elements. The final batch holds the remaining
    /// elements and may be shorter.
    /// </summary>
    /// <param name="source">The sequence to split.</param>
    /// <param name="size">Number of elements per batch; must be positive.</param>
    /// <returns>A lazily evaluated sequence of batches, in source order.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="size"/> is zero or negative.</exception>
    public static IEnumerable<IEnumerable<TSource>> Batch<TSource>(
              this IEnumerable<TSource> source, int size)
    {
        // Validate here, outside the iterator, so bad arguments fail at the
        // call site rather than on the first MoveNext().
        if (source == null)
            throw new System.ArgumentNullException("source");
        if (size <= 0)
            throw new System.ArgumentOutOfRangeException("size", size, "Batch size must be positive.");

        return BatchIterator(source, size);
    }

    // Lazily fills fixed-size buckets from source and yields each one when full.
    private static IEnumerable<IEnumerable<TSource>> BatchIterator<TSource>(
        IEnumerable<TSource> source, int size)
    {
        TSource[] bucket = null;
        var count = 0;

        foreach (var item in source)
        {
            // A fresh array per batch: a yielded bucket is never written again,
            // so consumers may keep it while enumeration continues.
            if (bucket == null)
                bucket = new TSource[size];

            bucket[count++] = item;
            if (count != size)
                continue;

            yield return bucket;

            bucket = null;
            count = 0;
        }

        // Trailing partial batch: expose only the slots that were filled.
        if (bucket != null && count > 0)
            yield return bucket.Take(count);
    }
}
这些行是最多10个字符的字符串


如何使用多线程、以每批 100 行的方式处理此文件?

使用 MoreLinq 的 Batch(批处理)方法,这将创建一个 IEnumerable<IEnumerable<string>> 集合,其中每个批次包含 100 行,并为每 100 行启动一个新任务。

这是一个基本的实现。更明智的做法是使用 Semaphore(信号量)限制任意时刻同时运行的任务数量,并评估 File.ReadAllLines 在处理 500,000,000 行时对性能的影响。

public class FileProcessor
{
    // Upper bound on the number of batches processed at the same time
    // (one worker per logical core).
    private static readonly int MaxConcurrentBatches = System.Environment.ProcessorCount;

    /// <summary>
    /// Processes "File.txt" in batches of 100 lines, running at most
    /// <c>MaxConcurrentBatches</c> batches concurrently.
    /// </summary>
    /// <returns>A task that completes when every batch has been processed.</returns>
    /// <remarks>
    /// If a batch fails, its exception is rethrown as soon as that batch is
    /// observed, which may be before the remaining lines have been read.
    /// </remarks>
    public async Task ProcessFile()
    {
        var inFlight = new List<Task>(MaxConcurrentBatches);

        // File.ReadLines streams the file lazily; File.ReadAllLines would
        // materialize every line in memory before the first batch starts.
        foreach (IEnumerable<string> linesBatch in File.ReadLines("File.txt").Batch(100))
        {
            // Throttle: wait for a slot instead of queuing one task per batch,
            // so neither the task list nor the pending work grows without bound.
            if (inFlight.Count >= MaxConcurrentBatches)
            {
                Task finished = await Task.WhenAny(inFlight);
                inFlight.Remove(finished);
                await finished; // rethrows if the batch faulted
            }

            IEnumerable<string> localLinesBatch = linesBatch;
            inFlight.Add(Task.Run(() =>
            {
                // Perform operation on localLinesBatch
            }));
        }

        // Drain whatever is still running after the last line was read.
        await Task.WhenAll(inFlight);
    }
}

public static class LinqExtensions
{
    /// <summary>
    /// Splits <paramref name="source"/> into consecutive batches of
    /// <paramref name="size"/> elements. The final batch holds the remaining
    /// elements and may be shorter.
    /// </summary>
    /// <param name="source">The sequence to split.</param>
    /// <param name="size">Number of elements per batch; must be positive.</param>
    /// <returns>A lazily evaluated sequence of batches, in source order.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="System.ArgumentOutOfRangeException"><paramref name="size"/> is zero or negative.</exception>
    public static IEnumerable<IEnumerable<TSource>> Batch<TSource>(
              this IEnumerable<TSource> source, int size)
    {
        // Validate here, outside the iterator, so bad arguments fail at the
        // call site rather than on the first MoveNext().
        if (source == null)
            throw new System.ArgumentNullException("source");
        if (size <= 0)
            throw new System.ArgumentOutOfRangeException("size", size, "Batch size must be positive.");

        return BatchIterator(source, size);
    }

    // Lazily fills fixed-size buckets from source and yields each one when full.
    private static IEnumerable<IEnumerable<TSource>> BatchIterator<TSource>(
        IEnumerable<TSource> source, int size)
    {
        TSource[] bucket = null;
        var count = 0;

        foreach (var item in source)
        {
            // A fresh array per batch: a yielded bucket is never written again,
            // so consumers may keep it while enumeration continues.
            if (bucket == null)
                bucket = new TSource[size];

            bucket[count++] = item;
            if (count != size)
                continue;

            yield return bucket;

            bucket = null;
            count = 0;
        }

        // Trailing partial batch: expose only the slots that were filled.
        if (bucket != null && count > 0)
            yield return bucket.Take(count);
    }
}
公共类文件处理器
{
公共异步任务ProcessFile()
{
列表任务=新列表();
var lines=File.ReadAllLines(“File.txt”).Batch(100);
foreach(IEnumerable lines按行分段)
{
IEnumerable localLinesBatch=linesBatch;
Task Task=Task.Factory.StartNew(()=>
{
//在localLinesBatch上执行操作
});
任务。添加(任务);
}
等待任务。何时(任务);
}
}
公共静态类LinqExtensions
{
公共静态IEnumerable批(
此IEnumerable源(整数大小)
{
TSource[]bucket=null;
var计数=0;
foreach(源中的var项)
{
if(bucket==null)
bucket=新的TSource[size];
桶[count++]=物料;
如果(计数!=大小)
持续
回程铲斗;
bucket=null;
计数=0;
}
if(bucket!=null&&count>0)
产量返回桶。取(计数);
}
}

如果使用 TPL 内置的 Parallel.ForEach,并自行编写两个枚举器(如下所列),则不需要使用其他库。您的代码可以如下所示:

// Degree of parallelism is capped at 8; ideally this equals the number of CPU cores.
var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = 8 };

using (var input = new StreamReader(File.OpenRead(@"c:\path\to\my\file.txt")))
{
    // Lines are read from the reader and grouped into chunks of 100
    // before being handed to the parallel loop.
    var chunksOfLines = input.ReadLines().TakeChunks(100);

    Parallel.ForEach(chunksOfLines, parallelOptions, batchOfLines =>
    {
        DoMyProcessing(batchOfLines);
    });
}
为此,您需要为 IEnumerable<T> 编写两个扩展方法和两个枚举器,定义如下:

/// <summary>
/// Extension methods that expose a <see cref="StreamReader"/> as a sequence of
/// lines and split any sequence into fixed-size chunks, together with the
/// hand-written enumerables and enumerators that implement them.
/// </summary>
public static class EnumerableExtensions
{
    /// <summary>Exposes the remaining lines of <paramref name="input"/> as a sequence.</summary>
    /// <param name="input">Reader to pull lines from.</param>
    /// <returns>A sequence that reads one line per step.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
    public static IEnumerable<string> ReadLines(this StreamReader input)
    {
        if (input == null)
        {
            throw new ArgumentNullException("input");
        }

        return new LineReadingEnumerable(input);
    }

    /// <summary>
    /// Splits <paramref name="source"/> into consecutive chunks of at most
    /// <paramref name="length"/> elements; only the last chunk may be shorter.
    /// </summary>
    /// <param name="source">Sequence to split.</param>
    /// <param name="length">Maximum number of elements per chunk; must be positive.</param>
    /// <returns>A lazily evaluated sequence of chunks.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is zero or negative.</exception>
    public static IEnumerable<IReadOnlyList<T>> TakeChunks<T>(this IEnumerable<T> source, int length)
    {
        if (source == null)
        {
            throw new ArgumentNullException("source");
        }
        if (length <= 0)
        {
            // Previously a non-positive length silently produced an empty sequence.
            throw new ArgumentOutOfRangeException("length", length, "Chunk length must be positive.");
        }

        return new ChunkingEnumerable<T>(source, length);
    }

    /// <summary>Sequence of lines backed by a single <see cref="StreamReader"/>.</summary>
    public class LineReadingEnumerable : IEnumerable<string>
    {
        private readonly StreamReader _input;

        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public LineReadingEnumerable(StreamReader input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            _input = input;
        }

        /// <summary>
        /// Returns an enumerator over the same reader instance; every enumerator
        /// obtained from this object shares that one reader.
        /// </summary>
        public IEnumerator<string> GetEnumerator()
        {
            return new LineReadingEnumerator(_input);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }

    /// <summary>Enumerator that yields one line per <see cref="MoveNext"/> call.</summary>
    public class LineReadingEnumerator : IEnumerator<string>
    {
        private readonly StreamReader _input;
        private string _current;

        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public LineReadingEnumerator(StreamReader input)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            _input = input;
        }

        /// <summary>
        /// Disposes the underlying reader. NOTE(review): the reader is supplied by
        /// the caller, so it is also disposed when enumeration ends — confirm that
        /// callers do not need the reader afterwards.
        /// </summary>
        public void Dispose()
        {
            _input.Dispose();
        }

        /// <summary>Reads the next line; returns false once ReadLine yields null.</summary>
        public bool MoveNext()
        {
            _current = _input.ReadLine();
            return (_current != null);
        }

        /// <summary>Not supported; always throws <see cref="NotSupportedException"/>.</summary>
        public void Reset()
        {
            throw new NotSupportedException();
        }

        /// <summary>The line read by the most recent <see cref="MoveNext"/>, or null.</summary>
        public string Current
        {
            get { return _current; }
        }

        object IEnumerator.Current
        {
            get { return _current; }
        }
    }

    /// <summary>Sequence of fixed-size chunks over an inner sequence.</summary>
    public class ChunkingEnumerable<T> : IEnumerable<IReadOnlyList<T>>
    {
        private readonly IEnumerable<T> _inner;
        private readonly int _length;

        /// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is zero or negative.</exception>
        public ChunkingEnumerable(IEnumerable<T> inner, int length)
        {
            if (inner == null)
            {
                throw new ArgumentNullException("inner");
            }
            if (length <= 0)
            {
                throw new ArgumentOutOfRangeException("length", length, "Chunk length must be positive.");
            }

            _inner = inner;
            _length = length;
        }

        public IEnumerator<IReadOnlyList<T>> GetEnumerator()
        {
            return new ChunkingEnumerator<T>(_inner.GetEnumerator(), _length);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return this.GetEnumerator();
        }
    }

    /// <summary>
    /// Enumerator that pulls up to a fixed number of elements from an inner
    /// enumerator on each step and exposes them as one chunk.
    /// </summary>
    public class ChunkingEnumerator<T> : IEnumerator<IReadOnlyList<T>>
    {
        private readonly IEnumerator<T> _inner;
        private readonly int _length;
        private IReadOnlyList<T> _current;
        private bool _endOfInner;

        /// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is zero or negative.</exception>
        public ChunkingEnumerator(IEnumerator<T> inner, int length)
        {
            if (inner == null)
            {
                throw new ArgumentNullException("inner");
            }
            if (length <= 0)
            {
                throw new ArgumentOutOfRangeException("length", length, "Chunk length must be positive.");
            }

            _inner = inner;
            _length = length;
        }

        /// <summary>Disposes the inner enumerator and drops the current chunk.</summary>
        public void Dispose()
        {
            _inner.Dispose();
            _current = null;
        }

        /// <summary>
        /// Builds the next chunk. Returns true when at least one element was
        /// collected, false once the inner enumerator is exhausted.
        /// </summary>
        public bool MoveNext()
        {
            // A new list per chunk, so chunks already handed out stay intact.
            var currentBuffer = new List<T>();

            while (currentBuffer.Count < _length && !_endOfInner)
            {
                if (!_inner.MoveNext())
                {
                    // Remember exhaustion so the inner MoveNext is not called again.
                    _endOfInner = true;
                    break;
                }

                currentBuffer.Add(_inner.Current);
            }

            if (currentBuffer.Count > 0)
            {
                _current = currentBuffer;
                return true;
            }

            _current = null;
            return false;
        }

        /// <summary>Resets the inner enumerator (which may not support it) and clears state.</summary>
        public void Reset()
        {
            _inner.Reset();
            _current = null;
            _endOfInner = false;
        }

        /// <summary>The chunk built by the most recent successful <see cref="MoveNext"/>.</summary>
        /// <exception cref="InvalidOperationException">There is no current chunk.</exception>
        public IReadOnlyList<T> Current
        {
            get
            {
                if (_current != null)
                {
                    return _current;
                }

                throw new InvalidOperationException();
            }
        }

        object IEnumerator.Current
        {
            get
            {
                return this.Current;
            }
        }
    }
}
公共静态类EnumerableExtensions
{
公共静态IEnumerable读线(此StreamReader输入)
{
返回新行readingnumerable(输入);
}
公共静态IEnumerable TakeChunks(此IEnumerable源,int-length)
{
返回新的chunkingnumerable(源、长度);
}
公共类LineReadingEnumerable:IEnumerable
{
私有只读StreamReader\u输入;
public LineReadingEnumerable(StreamReader输入)
{
_输入=输入;
}
公共IEnumerator GetEnumerator()
{
返回新的LineReadingEnumerator(_输入);
}
IEnumerator IEnumerable.GetEnumerator()
{
返回GetEnumerator();
}
}
公共类LineReadingEnumerator:IEnumerator
{
私有只读StreamReader\u输入;
私有字符串_当前;
公用LineReadingEnumerator(StreamReader输入)
{
_输入=输入;
}
公共空间处置()
{
_input.Dispose();
}
公共图书馆
{
_当前=_input.ReadLine();
返回(_current!=null);
}
公共无效重置()
{
抛出新的NotSupportedException();
}
公共字符串电流
{
获取{return\u current;}
}
对象IEnumerator.Current
{
获取{return\u current;}
}
}
公共类chunkingnumerable:IEnumerable
{
私有只读IEnumerable\u-inner;
私有只读整数长度;
公共chunkingnumerable(IEnumerable-inner,int-length)
{
_内部=内部;
_长度=长度;
}
公共IEnumerator GetEnumerator()
{
返回新的ChunkingEnumerator(_inner.GetEnumerator(),_length);
}
IEnumerator IEnumerable.GetEnumerator()
{
返回此.GetEnumerator();
}
}
公共类ChunkingEnumerator:IEnumerator
{
私有只读IEnumerator\u内部;
私有只读整数长度;
私有IReadOnlyList(当前);
私人银行;
公共ChunkingEnumerator(IEnumerator内部,整数长度)
{
_内部=内部;
_长度=长度;
}
公共空间处置()
{
_depose();
_电流=零;
}
公共图书馆
{
var currentBuffer=新列表();
while(currentBuffer.Count<\u length&!\u endofiner)
{
如果(!\u inner.MoveNext())
{
_Endofiner=true;
打破
}
currentBuffer.Add(_inner.Current);
}
如果(currentBuffer.Count>0)
{
_当前=当前缓冲区;
返回true;
}
_电流=零;
返回false;
}
公共无效重置()
{
_内部重置();
_电流=零;
_endofiner=false;
}
公共IReadOnlyList当前
{
收到
{
如果(_current!=null)
{
返回电流;
}
抛出新的InvalidOperationException();
}
}
对象IEnumerator.Current
{
收到
{
返回此.Current;
}
}
}
}

对于批处理,请参考此处;先自己编写代码,然后进行比较……或者最好先阅读文档,了解如何提出一个好问题。多线程对您没有帮助,因为 I/O 才是瓶颈:您到硬盘只有一条总线连接,而 I/O 是串行的。