如何最大化进程吞吐量(C#)?
我想以最大吞吐量处理一些文件。文件的路径保存在数据库中。我需要从数据库中获取文件路径,将其状态更改为“正在处理”,对其进行处理,然后将其状态更改为“已完成”或“失败”。

目前,我分批获取文件(每批 100 个),以减少查询次数,并以并行度 10 并行处理它们。但这样一来,每个批次快结束时吞吐量就会下降:当批次中剩余的文件少于 10 个时,实际并行度就不再是 10,而是随之降低。

标签:c#, performance, system.reactive, tpl-dataflow, blockingcollection

以下是我目前的代码:
/// <summary>
/// Copies the pending files of <paramref name="sourcePath"/>, one batch at a time.
/// </summary>
/// <remarks>
/// Each pass requests a batch of pending files (sized by <c>FileCopyBatchSize</c>), marks the batch
/// as in progress, processes it with PLINQ and then persists the resulting statuses.
/// The loop ends as soon as a request returns an empty batch.
/// <c>ForAll</c> blocks until every file of the batch has been processed, so the next batch is only
/// requested after the slowest file of the current batch has finished.
/// </remarks>
/// <param name="sourcePath">Source path whose pending files are fetched by its <c>Id</c>.</param>
/// <param name="options">Options forwarded unchanged to <c>ProcessFile</c>.</param>
private async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
{
    while (true)
    {
        var fileBatch = _sourceFileService.GetSourceFileBatchBySourcePathId(
            sourcePath.Id, _dataSourceExportConfig.FileCopyBatchSize, Status.Pending);

        // Nothing left in Pending state: the work for this source path is done.
        if (fileBatch.Count == 0)
        {
            return;
        }

        await SetInProgressStatusForBatch(fileBatch)
            .ConfigureAwait(false);

        // NOTE(review): destinationBase is not declared in this method; presumably a field of the
        // enclosing class - confirm.
        fileBatch
            .AsParallel()
            .WithDegreeOfParallelism(_dataSourceExportConfig.FileCopyDegreeOfParallelism)
            .ForAll(file => ProcessFile(file, destinationBase, options));

        // Persist the statuses that ProcessFile wrote onto the batch items.
        await _sourceFileService
            .UpdateSourceFilesStatusAsync(fileBatch)
            .ConfigureAwait(false);
    }
}
/// <summary>
/// Flags every file of the batch as <c>Status.InProgress</c> and persists the new statuses.
/// </summary>
/// <param name="fileBatch">
/// Files about to be processed. The sequence is enumerated here and then handed to the status update call.
/// </param>
private async Task SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
{
    foreach (var pendingFile in fileBatch)
    {
        pendingFile.Status = Status.InProgress;
    }

    var persistStatuses = _sourceFileService.UpdateSourceFilesStatusAsync(fileBatch);
    await persistStatuses.ConfigureAwait(false);
}
/// <summary>
/// Processes a single file and records the outcome on the file itself.
/// </summary>
/// <remarks>
/// On success the status becomes <c>Status.Success</c> and the exception message is cleared.
/// Any exception is logged and turned into <c>Status.Failed</c> plus its message, so this method
/// does not let processing exceptions escape to the caller.
/// </remarks>
/// <param name="file">File to process; its <c>Status</c> and <c>ExceptionMessage</c> are overwritten.</param>
/// <param name="destinationBase">Not used by the visible body; presumably consumed by the elided processing step.</param>
/// <param name="options">Not used by the visible body; presumably consumed by the elided processing step.</param>
private void ProcessFile(SourceFile file, string destinationBase, Options options)
{
    try
    {
        //do something ...

        file.Status = Status.Success;
        file.ExceptionMessage = null;
    }
    catch (Exception error)
    {
        _logger.Error(error);

        file.Status = Status.Failed;
        file.ExceptionMessage = error.Message;
    }
}
private async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
{
var batchIndex = 0;
while (true)
{
var fileBatch = _sourceFileService.GetSourceFileBatchBySourcePathId(
sourcePath.Id, _dataSourceExportConfig.FileCopyBatchSize, Status.Pending);
if (fileBatch.Count == 0)
return;
await SetInProgressStatusForBatch(fileBatch)
.ConfigureAwait(false);
fileBatch
.AsParallel()
.WithDegreeOfParallelism(_dataSourceExportConfig.FileCopyDegreeOfParallelism)
.ForAll(file => ProcessFile(file, destinationBase, options));
await _sourceFileService
.UpdateSourceFilesStatusAsync(fileBatch)
.ConfigureAwait(false);
batchIndex++;
}
}
private async Task SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
{
foreach (var file in fileBatch)
file.Status = Status.InProgress;
await _sourceFileService
.UpdateSourceFilesStatusAsync(fileBatch)
.ConfigureAwait(false);
}
private void ProcessFile(
SourceFile file,
string destinationBase,
Options options)
{
try
{
//do something ...
file.Status = Status.Success;
file.ExceptionMessage = null;
}
catch (Exception ex)
{
_logger.Error(ex);
file.Status = Status.Failed;
file.ExceptionMessage = ex.Message;
}
}
如何才能使吞吐量最大化?我了解到可以用 BlockingCollection、TPL Dataflow 和 Rx 来实现生产者-消费者模式,而且我相当确定上述任何一种都能实现我的目标,但到目前为止我都没能做出来。使用生产者-消费者模式时,我的生产者比消费者快得多;使用 TPL Dataflow 时,我卡在了 BatchBlock 上;Rx 我还没有试过。有人能给我指个正确的方向吗?
更新:
以下是一个最小、完整且可验证的示例:
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
namespace ConsoleApp1
{
/// <summary>
/// Console entry point of the sample: processes all pending files, prints the id and status of
/// every file, then prints the elapsed time and waits for a line of input.
/// </summary>
internal static class Program
{
    private static void Main()
    {
        Console.WriteLine("Processing files");

        // Started before the service is created, so the measurement covers construction too.
        var timer = Stopwatch.StartNew();

        var service = new FileService();
        service.ProcessPendingFiles();

        foreach (var processed in service.SourceFiles)
        {
            Console.WriteLine($"{processed.Id} {processed.Status}");
        }

        Console.WriteLine(timer.Elapsed);

        // Keeps the console window open until the user presses Enter.
        Console.ReadLine();
    }
}
/// <summary>
/// Sample service that processes pending files in fixed-size batches, running the files of each
/// batch in parallel.
/// </summary>
/// <remarks>
/// A batch is processed with PLINQ's blocking <c>ForAll</c>, so a new batch is only fetched once
/// every file of the current batch has finished.
/// </remarks>
public class FileService
{
    private const int BatchSize = 100;
    private const int DegreeOfParallelism = 10;

    // Stand-in for the Sqlite database where the data is actually stored:
    // 1000 files, all of them initially pending.
    public ICollection<SourceFile> SourceFiles =
        Enumerable
            .Range(0, 1000)
            .Select(i => new SourceFile { Id = i, Path = "source file path", Status = Status.Pending })
            .ToList();

    /// <summary>
    /// Fetches and processes batches of pending files until a fetch comes back empty.
    /// </summary>
    public void ProcessPendingFiles()
    {
        for (var batch = GetSourceFileBatch(BatchSize, Status.Pending);
             batch.Count != 0;
             batch = GetSourceFileBatch(BatchSize, Status.Pending))
        {
            SetInProgressStatusForBatch(batch);

            var parallelBatch = batch
                .AsParallel()
                .WithDegreeOfParallelism(DegreeOfParallelism);
            parallelBatch.ForAll(ProcessFile);

            UpdateSourceFiles(batch);
        }
    }

    /// <summary>
    /// Returns at most <paramref name="batchSize"/> files whose status equals <paramref name="status"/>.
    /// </summary>
    private ICollection<SourceFile> GetSourceFileBatch(int batchSize, Status status)
    {
        var matching = SourceFiles.Where(sf => sf.Status == status);
        return matching.Take(batchSize).ToList();
    }

    // Marks every file of the batch as in progress and writes that status back to the store.
    // In the real application this is a bulk database update and the method is async.
    private void SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
    {
        foreach (var file in fileBatch)
        {
            file.Status = Status.InProgress;

            // First throws when the store holds no file with this id.
            var stored = SourceFiles.First(sf => sf.Id == file.Id);
            stored.Status = file.Status;
        }
    }

    // Writes status and exception message of every file of the batch back to the store.
    // In the real application this is a bulk database update and the method is async.
    private void UpdateSourceFiles(IEnumerable<SourceFile> fileBatch)
    {
        foreach (var file in fileBatch)
        {
            var stored = SourceFiles.First(sf => sf.Id == file.Id);
            stored.Status = file.Status;
            stored.ExceptionMessage = file.ExceptionMessage;
        }
    }

    /// <summary>
    /// Processes one file (simulated by a 20 ms sleep) and records success or failure on it.
    /// </summary>
    private void ProcessFile(SourceFile file)
    {
        try
        {
            //do something ...
            Thread.Sleep(20);

            file.Status = Status.Success;
            file.ExceptionMessage = null;
        }
        catch (Exception error)
        {
            file.Status = Status.Failed;
            file.ExceptionMessage = error.Message;
        }
    }
}
/// <summary>
/// A file tracked by the sample, together with its processing state.
/// </summary>
public class SourceFile
{
    /// <summary>Identifier used to look the file up in the store.</summary>
    public int Id { get; set; }

    /// <summary>Path of the source file.</summary>
    public string Path { get; set; }

    /// <summary>Current processing status.</summary>
    public Status Status { get; set; }

    /// <summary>Message of the exception that made processing fail; null after a successful run.</summary>
    public string ExceptionMessage { get; set; }
}
/// <summary>
/// Processing state of a source file.
/// </summary>
public enum Status
{
    /// <summary>Not picked up yet; this is also the default value.</summary>
    Pending = 0,

    /// <summary>Picked up as part of a batch that is being processed.</summary>
    InProgress = 1,

    /// <summary>Processing finished without an exception.</summary>
    Success = 2,

    /// <summary>Processing threw an exception.</summary>
    Failed = 3,
}
}
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Threading;
namespace ConsoleApp1
{
internal static class Program
{
private static void Main()
{
Console.WriteLine("Processing files");
var stopWatch = new Stopwatch();
stopWatch.Start();
var fileService = new FileService();
fileService.ProcessPendingFiles();
foreach (var sourceFile in fileService.SourceFiles)
{
Console.WriteLine($"{sourceFile.Id} {sourceFile.Status}");
}
Console.WriteLine(stopWatch.Elapsed);
Console.ReadLine();
}
}
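public class FileService
{
private const int BatchSize = 100;
private const int DegreeOfParallelism = 10;
//this SourceFiles property replaces the Sqlite database where the data is actually stored
public ICollection<SourceFile> SourceFiles =
Enumerable
.Range(0, 1000)
.Select(i =>
new SourceFile
{
Id = i,
Path = "source file path",
Status = Status.Pending,
})
.ToList();
public void ProcessPendingFiles()
{
while (true)
{
var fileBatch = GetSourceFileBatch(BatchSize, Status.Pending);
if (fileBatch.Count == 0)
return;
SetInProgressStatusForBatch(fileBatch);
fileBatch
.AsParallel()
.WithDegreeOfParallelism(DegreeOfParallelism)
.ForAll(ProcessFile);
UpdateSourceFiles(fileBatch);
}
}
private ICollection<SourceFile> GetSourceFileBatch(int batchSize, Status status)
=> SourceFiles
.Where(sf => sf.Status == status)
.Take(batchSize)
.ToList();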
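//set status to in progress for all files in the batch
//and save the changes to database
//in the application this is actually done with a bulk update and the method is async
private void SetInProgressStatusForBatch(IEnumerable<SourceFile> fileBatch)
{
foreach (var file in fileBatch)
{
file.Status = Status.InProgress;
var sourceFile = SourceFiles.First(sf => sf.Id == file.Id);
sourceFile.Status = file.Status;
}
}
//set status and exception messages for all files in the batch
//and save the changes to database
//in the application this is actually done with a bulk update and the method is async
private void UpdateSourceFiles(IEnumerable<SourceFile> fileBatch)
{
foreach (var file in fileBatch)
{
var sourceFile = SourceFiles.First(sf => sf.Id == file.Id);
sourceFile.Status = file.Status;
sourceFile.ExceptionMessage = file.ExceptionMessage;
}
}
private void ProcessFile(SourceFile file)
{
try
{
//do something ...
Thread.Sleep(20);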
/// <summary>
/// Producer/consumer skeleton: items handed to <see cref="Produce"/> are queued and picked up
/// one at a time by a single dedicated background thread.
/// </summary>
public class YourCode
{
    // Assigned once; readonly guarantees producer and consumer always share the same queue.
    private readonly BlockingCollection<object> _queue = new BlockingCollection<object>();

    /// <summary>
    /// Starts the consumer thread. It is a background thread, so it does not keep the process alive.
    /// </summary>
    public YourCode()
    {
        var thread = new Thread(StartConsuming) { IsBackground = true };
        thread.Start();
    }

    /// <summary>
    /// Queues <paramref name="item"/> for the consumer thread.
    /// </summary>
    /// <param name="item">The work item to enqueue.</param>
    public void Produce(object item)
    {
        _queue.Add(item);
    }

    // Consumer loop. GetConsumingEnumerable blocks while the queue is empty and, unlike a bare
    // Take() inside while (true), ends the loop instead of throwing if adding is ever completed.
    private void StartConsuming()
    {
        foreach (object item in _queue.GetConsumingEnumerable())
        {
            // Add your code to process the item here.
            // Do not start another task or thread.
        }
    }
}
/// <summary>
/// Runs a fixed number of workers that each keep pulling the next file until none is left, so the
/// degree of parallelism does not depend on batch boundaries.
/// </summary>
class WorkController
{
    // Number of workers that pull files concurrently.
    private const int WorkerCount = 10;

    private DataSourceExportConfig _dataSourceExportConfig;
    private SourceFileService _sourceFileService;
    private string destinationBase;

    /// <summary>
    /// Starts <c>WorkerCount</c> workers and completes when all of them have run out of files.
    /// </summary>
    /// <param name="sourcePath">Forwarded to every worker.</param>
    /// <param name="options">Forwarded to every worker.</param>
    public async Task CopyPendingFilesAsync(SourcePath sourcePath, Options options)
    {
        var workers = Enumerable
            .Range(0, WorkerCount)
            .Select(_ => Worker(sourcePath, options));

        await Task.WhenAll(workers);
    }

    /// <summary>
    /// Processes files one after another until <c>GetNextFile</c> reports that none is left.
    /// </summary>
    /// <remarks>
    /// The loop is synchronous, so it is pushed to the thread pool with <c>Task.Run</c>. As an
    /// <c>async</c> method without any <c>await</c> it would run to completion on the caller's thread
    /// before returning, and the workers started by <see cref="CopyPendingFilesAsync"/> would run
    /// one after another instead of concurrently.
    /// NOTE(review): several workers call <c>GetNextFile</c> concurrently; it presumably has to be
    /// thread-safe and hand each file out only once - confirm against its implementation.
    /// </remarks>
    /// <param name="sourcePath">Not used by the visible body.</param>
    /// <param name="options">Forwarded to <c>ProcessFile</c>.</param>
    /// <returns>A task that completes when this worker found no further file.</returns>
    public Task Worker(SourcePath sourcePath, Options options)
    {
        return Task.Run(() =>
        {
            SourceFile file = null;
            while (_sourceFileService.GetNextFile(out file))
            {
                ProcessFile(file, destinationBase, options);
            }
        });
    }

    // Placeholder for the actual per-file work.
    private void ProcessFile(SourceFile file, string destinationBase, Options options)
    {
    }
}
/// <summary>
/// TPL Dataflow pipeline with four linked stages: fetch a batch for a source path, mark the batch
/// as in progress, process its files, and mark the batch as completed.
/// </summary>
public class ProcessFilesFlow
{
    private TransformBlock<SourcePath, IEnumerable<SourceFile>> _getSourceFileBatch;
    private TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>> _setStatusToProcessing;
    private TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>> _processFiles;
    private ActionBlock<IEnumerable<SourceFile>> _setStatusToComplete;

    /// <summary>
    /// Creates the four blocks and links them so that completion flows from the first block to the last.
    /// </summary>
    public ProcessFilesFlow()
    {
        // Every stage has its own options object so that capacity, parallelism and ordering can be
        // tuned per stage for throughput. All stages start with the same values.
        var fetchOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // capacity limit of the fetch stage (source paths)
            MaxDegreeOfParallelism = 10, // source paths fetched concurrently
            EnsureOrdered = false        // pass batches on as soon as they are ready
        };
        var markInProgressOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // capacity limit of the stage (batches)
            MaxDegreeOfParallelism = 10, // batches whose status is updated concurrently
            EnsureOrdered = false        // pass batches on as soon as they are ready
        };
        var processOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // capacity limit of the stage (batches)
            MaxDegreeOfParallelism = 10, // batches processed concurrently
            EnsureOrdered = false        // pass batches on as soon as they are ready
        };
        var markCompleteOptions = new ExecutionDataflowBlockOptions
        {
            BoundedCapacity = 10,        // capacity limit of the stage (batches)
            MaxDegreeOfParallelism = 10, // batches updated concurrently
            EnsureOrdered = false        // pass batches on as soon as they are ready
        };

        // Stages of the pipeline.
        _getSourceFileBatch = new TransformBlock<SourcePath, IEnumerable<SourceFile>>(
            path => GetSourceFileBatch(path),
            fetchOptions);
        _setStatusToProcessing = new TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>>(
            batch => SetStatusToProcessingAsync(batch),
            markInProgressOptions);
        _processFiles = new TransformBlock<IEnumerable<SourceFile>, IEnumerable<SourceFile>>(
            batch => ProcessFiles(batch),
            processOptions);
        _setStatusToComplete = new ActionBlock<IEnumerable<SourceFile>>(
            batch => SetStatusToCompleteAsync(batch),
            markCompleteOptions);

        // Wiring. PropagateCompletion makes each block complete once its predecessor has completed.
        var propagateCompletion = new DataflowLinkOptions { PropagateCompletion = true };
        _getSourceFileBatch.LinkTo(_setStatusToProcessing, propagateCompletion);
        _setStatusToProcessing.LinkTo(_processFiles, propagateCompletion);
        _processFiles.LinkTo(_setStatusToComplete, propagateCompletion);
    }

    /// <summary>
    /// Sends every source path into the pipeline, signals that no more input follows and waits
    /// until the last stage has completed.
    /// </summary>
    /// <param name="sourcePaths">Source paths to process.</param>
    public async Task ProcessAll(IEnumerable<SourcePath> sourcePaths)
    {
        foreach (var sourcePath in sourcePaths)
        {
            // SendAsync waits while the first block is at its bounded capacity.
            await _getSourceFileBatch.SendAsync(sourcePath);
        }

        _getSourceFileBatch.Complete();
        await _setStatusToComplete.Completion;
    }

    // Placeholder: fetch the batch of files that belongs to sourcePath.
    private IEnumerable<SourceFile> GetSourceFileBatch(SourcePath sourcePath)
    {
        //Get batch of files based on sourcePath
        return Enumerable.Empty<SourceFile>();
    }

    // Marks each file of the batch as "In Progress", one after another, and passes the batch on.
    private async Task<IEnumerable<SourceFile>> SetStatusToProcessingAsync(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var file in sourceFiles)
        {
            await file.UpdateStatusAsync("In Progress");
        }

        return sourceFiles;
    }

    // Processes the files of the batch sequentially and passes the batch on.
    private IEnumerable<SourceFile> ProcessFiles(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var file in sourceFiles)
        {
            file.Process();
        }

        return sourceFiles;
    }

    // Marks each file of the batch as "Completed", one after another.
    private async Task SetStatusToCompleteAsync(IEnumerable<SourceFile> sourceFiles)
    {
        foreach (var file in sourceFiles)
        {
            await file.UpdateStatusAsync("Completed");
        }
    }
}