如何使用C#任务库下载和处理一系列数据
我一直在努力设计一种使用C#和任务库(Task Library)最大化任务并行性的方案。虽然我对各种并行处理概念有一些了解(也阅读了有关该主题的多个StackOverflow问题),但我很难以一种连贯的方式把所有东西组装起来解决我的问题。(标签:c#,.net)我的问题有以下属性/规则:
- 我想从HTTP连接中跨多个HTTP服务器/连接以“段”的形式下载一系列带有时间戳的数据
- 根据特定的HTTP服务,它将以不同的大小提供每个数据段。例如,在一个连接上,它可以为每个请求提供一个小时段(例如 "http://server1/getdata?year=2020&month=1&day=1&hour=1")。在另一个连接上,它可能提供每月段的数据(例如 "http://server2/getdata?year=2020&month=1")。不可能从每月连接获取每小时数据,反之亦然
- 如果任何一台服务器出现故障或正忙于x个以上的连接,我希望在另一台服务器上重试
- 下载数据段后,需要将其处理为数据集结果。这种处理应该尽可能地并行化
- 当序列中按时间顺序排列的第一个片段到达时,我希望立即开始处理它,并按时间顺序处理每个后续片段(即,我不希望等待整个序列完成下载后再响应调用方)
// Streams the processed data of every segment between begin and end in
// chronological order, while worker tasks download the segments in parallel.
// Each segment is yielded as soon as it (and every earlier segment) is ready,
// so the caller does not wait for the whole series to finish downloading.
public IEnumerable<object> RetrieveData(DateTime begin, DateTime end)
{
    // Break the period up into the smallest segments allowed.
    // In this case, we will create one segment for each hour between begin and end dates
    var segments = new DataSegments(begin, end, IntervalTypeEnum.Hourly);
    var cancelTokenSource = new CancellationTokenSource();
    var cancelToken = cancelTokenSource.Token;
    var tasks = new List<Task>();

    // Start a number of tasks which are responsible for downloading segments
    // until all segments are complete.
    for (int i = 0; i < 3; i++)
    {
        tasks.Add(Task.Run(() =>
        {
            // Keep downloading segments until there are none left.
            while (!segments.IsComplete && !cancelToken.IsCancellationRequested)
            {
                // Gets a list of connections available for downloading data
                var connections = DataConnectionManager.GetConnectionQueue();

                // Cycle through the available connections until one download succeeds.
                // The loop is bounded by the queue: once every connection has been
                // tried, the outer loop asks for a fresh queue instead of retrying
                // Dequeue() on an empty queue forever.
                bool downloaded = false;
                while (!downloaded && connections.Count > 0 && !cancelToken.IsCancellationRequested)
                {
                    var connection = connections.Dequeue();
                    try
                    {
                        if (connection is MonthlyDataConnection)
                        {
                            // NOTE(review): the original passed an undefined "chunk" here;
                            // the reserved month of segments is assumed to be what the
                            // monthly helper marks as complete - confirm.
                            List<DataSegment> list = segments.GetNext(SegmentType.Monthly);
                            DownloadAndProcessMonthlySegment(connection, list, cancelToken);
                        }
                        else if (connection is HourlyDataConnection)
                        {
                            List<DataSegment> list = segments.GetNext(SegmentType.Hourly);
                            foreach (var segment in list)
                            {
                                DownloadAndProcessHourlySegment(connection, segment, cancelToken);
                            }
                        }
                        downloaded = true;
                    }
                    catch (Exception)
                    {
                        // Best effort: this connection failed (server down or busy),
                        // so fall through and retry on the next connection in the queue.
                    }
                }
            }
        }));
    }

    // Hand the segments back in chronological order, blocking on each one until
    // a worker has marked it ready (or the operation is cancelled).
    foreach (var segment in segments)
    {
        segment.Wait(cancelToken);
        if (segment.Data != null && !cancelToken.IsCancellationRequested)
        {
            yield return segment.Data;
        }
    }

    Task.WaitAll(tasks.ToArray());
}
// Pseudo-code placeholder for the monthly download path. Parameter types are
// omitted in this sketch and the body only lists the intended steps.
void DownloadAndProcessMonthlySegment(connection, segment, cancelToken)
{
// Download from http connection, throw exception if WebException.
// Process data if http download successful
// Mark all segments as complete/ready
}
// Pseudo-code placeholder for the hourly download path. Parameter types are
// omitted in this sketch and the body only lists the intended steps.
void DownloadAndProcessHourlySegment(connection, segment, cancelToken)
{
// Download from http connection, throw exception if WebException.
// Process data if http download successful
// Mark segment as complete/ready
}
// Granularity requested when asking DataSegments.GetNext for pending segments.
public enum SegmentType
{
    // No preference (the default argument of GetNext).
    NextAvailable = 0,

    // Requested for hourly connections.
    Hourly = 1,

    // Requested for monthly connections.
    Monthly = 2
}
// Represents a series of data segments that need to be downloaded.
// In this code example, it will have hourly segments that span the specified
// begin and end dates.
public class DataSegments: IEnumerable<DataSegment>
{
// Returns a list of segments that haven't been downloaded yet.
// Depending on the "SegmentType", it will return just one hourly segment
// (SegmentType.Hourly) or an entire month of hourly segments
// (presumably SegmentType.Monthly, matching how RetrieveData calls it).
// NOTE(review): declaration only - the implementation is omitted from this sketch.
public List<DataSegment> GetNext(SegmentType type = SegmentType.NextAvailable);
}
// One time-bounded piece of the series: it is fetched from the web and then
// turned into a processed result stored in "Data".
public class DataSegment
{
    // Start of the period covered by this segment.
    private DateTime BeginDate { get; set; }

    // End of the period covered by this segment.
    private DateTime EndDate { get; set; }

    // The processed data-set result
    private object Data { get; set; }
}
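public IEnumerable<object> RetrieveData(DateTime begin, DateTime end)
{
// Break the period up into the smallest segments allowed.
// In this case, we will create one segment for each hour between begin and end dates
var segments = new DataSegments(begin, end, IntervalTypeEnum.Hourly);
var cancelTokenSource = new CancellationTokenSource();
var cancelToken = cancelTokenSource.Token;
var tasks = new List<Task>();
// Start a number of tasks which are responsible for downloading segments
// until all segments are complete.
for (int i = 0; i < 3; i++)
{
var task = new Task(() =>
{
// Keep downloading segments until there are none left.
while (!segments.IsComplete && !cancelToken.IsCancellationRequested)
{
string errorMsg = string.Empty;
// Gets a list of connections available for downloading data
var connections = DataConnectionManager.GetConnectionQueue();
// Cycle through all the available connections until we successfully download
// a chunk.
Retry:
try
{
var connection = connections.Dequeue();
if (connection is MonthlyDataConnection)
{
List<Segment> list = segments.GetNext(SegmentType.Monthly);
DownloadAndProcessMonthlySegment(connection, chunk, cancelToken);
}
else if (connection is HourlyDataConnection)
{
List<Segment> list = segments.GetNext(SegmentType.Hourly);
foreach(var segment in list)
{
DownloadAndProcessHourlySegment(connection, segment, cancelToken);
}
}
}
catch
{
goto Retry;
}
}
});
task.Start();
tasks.Add(task);
}
foreach(var segment in segments)
{
segment.Wait(cancelToken);
if (chunk.Data != null && !cancelToken.IsCancellationRequested)
{
yield return chunk.Data;
}
}
Task.WaitAll(tasks.ToArray());
}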
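void DownloadAndProcessMonthlySegment(connection, segment, cancelToken)
{
// Download from http connection, throw exception if WebException.
// Process data if http download successful
// Mark all segments as complete/ready
}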
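void DownloadAndProcessHourlySegment(connection, segment, cancelToken)
{
// Download from http connection, throw exception if WebException.
// Process data if http download successful
// Mark segment as complete/ready
}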
public enum SegmentType
{
NextAvailable,
Hourly,
Monthly
}
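// Represents a series of data segments that need to be downloaded
// In this code example, it will have hourly segments that span the specified
// begin and end dates.
public class DataSegments: IEnumerable<DataSegment>
{
// Returns a list of segments that haven't been downloaded yet.
// Depending on the "SegmentType", it will return just one hourly segment or
// an entire month of hourly segments (SegmentType.Hourly)
public List<DataSegment> GetNext(SegmentType type = SegmentType.NextAvailable);
}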
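// Represents a segment of data that needs to be retrieved from the web
// and processed into "Data".
public class DataSegment
{
DateTime BeginDate { get; set; }
DateTime EndDate { get; set; }
// The processed data-set result
object Data { get; set; }
}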
代码的工作原理是启动若干个像线程一样循环运行的任务,直到整个序列的段都下载并处理完毕。根据连接类型(每月或每小时),它将相应地下载和处理数据(同时确保没有其他任务尝试下载相同范围的数据)
虽然代码确实(大部分)有效,但我觉得它不是最理想或最优雅的解决方案。例如,一个缺点是,这些任务可以……(原文在此处截断)
/// <summary>
/// Console demo: splits a date range into hourly segments, downloads them with
/// several worker tasks and prints every segment result in chronological order.
/// </summary>
class Program
{
    /// <summary>Number of worker tasks started by <see cref="Download"/>.</summary>
    private const int WorkerCount = 3;

    /// <summary>Pause, in milliseconds, used by a worker that found nothing to reserve.</summary>
    private const int IdleDelayMilliseconds = 10;

    /// <summary>
    /// Registers one hourly and one daily connection, downloads 1-5 January 2020
    /// and writes each result to the console.
    /// </summary>
    static void Main(string[] args)
    {
        Connections.Instance.Enqueue(new Connection(IntervalTypeEnum.Hourly));
        Connections.Instance.Enqueue(new Connection(IntervalTypeEnum.Daily));
        var begin = new DateTime(2020, 1, 1);
        var end = new DateTime(2020, 1, 5);
        foreach (var download in Download(begin, end))
        {
            Console.WriteLine($"Final result: {download}");
        }
        Console.WriteLine("Press any key...");
        Console.ReadKey();
    }

    /// <summary>
    /// Downloads the hourly segments between <paramref name="begin"/> and
    /// <paramref name="end"/> on background tasks and lazily yields each
    /// segment's data in chronological order as soon as it is ready.
    /// </summary>
    /// <param name="begin">Start of the requested range.</param>
    /// <param name="end">End of the requested range.</param>
    /// <returns>The data of every segment that produced a result, oldest first.</returns>
    /// <exception cref="AggregateException">
    /// Thrown after enumeration stops when one or more workers failed.
    /// </exception>
    public static IEnumerable<string> Download(DateTime begin, DateTime end)
    {
        var segments = new DataSegments(begin, end, IntervalTypeEnum.Hourly);
        var cancelTokenSource = new CancellationTokenSource();
        try
        {
            var cancelToken = cancelTokenSource.Token;
            var tasks = new List<Task>();
            try
            {
                for (int i = 0; i < WorkerCount; i++)
                {
                    tasks.Add(Task.Run(() => DownloadPendingSegments(segments, cancelTokenSource)));
                }

                foreach (var segment in segments)
                {
                    // Blocks until this segment is done or the operation is cancelled.
                    segment.Wait(cancelToken);
                    if (segment.Data != null && !cancelToken.IsCancellationRequested)
                    {
                        Console.WriteLine($"Yielding data: {segment.Data}");
                        yield return segment.Data;
                    }
                }
            }
            finally
            {
                // Also runs when the caller abandons the enumeration early: stop the
                // workers so they do not keep downloading, then surface their failures.
                cancelTokenSource.Cancel();
                Task.WaitAll(tasks.ToArray());
            }
        }
        finally
        {
            cancelTokenSource.Dispose();
        }
    }

    /// <summary>
    /// Worker loop: reserves pending segments and downloads them until every
    /// segment is complete or cancellation is requested.
    /// </summary>
    private static void DownloadPendingSegments(DataSegments segments, CancellationTokenSource cancelTokenSource)
    {
        var cancelToken = cancelTokenSource.Token;
        try
        {
            while (!segments.IsComplete && !cancelToken.IsCancellationRequested)
            {
                var connection = Connections.GetNextAvailable();
                if (connection == null)
                {
                    throw new InvalidOperationException("No connections have been registered.");
                }

                var list = segments.GetNext(connection.IntervalType);
                if (list.Count == 0)
                {
                    // Everything left is being handled by other workers; back off
                    // briefly instead of spinning on a full core.
                    Thread.Sleep(IdleDelayMilliseconds);
                    continue;
                }

                foreach (var segment in list)
                {
                    GetSegment(connection, segment, cancelToken);
                }
            }
        }
        catch
        {
            // A failed worker would otherwise leave the consumer blocked forever on
            // a segment that never completes; cancelling unblocks it and the
            // exception is reported through Task.WaitAll.
            cancelTokenSource.Cancel();
            throw;
        }
    }

    /// <summary>
    /// Downloads and processes one segment while holding a slot on the connection.
    /// </summary>
    static void GetSegment(Connection conn, DataSegment segment, CancellationToken token)
    {
        conn.WaitOne();
        try
        {
            segment.Data = conn.Download(segment.Begin, segment.End);
            ProcessSegment(segment, token);
        }
        finally
        {
            // Always give the slot back, even when the download or processing throws.
            conn.Release();
        }
    }

    /// <summary>
    /// Simulates processing of a downloaded segment and marks it as done.
    /// </summary>
    static void ProcessSegment(DataSegment segment, CancellationToken token)
    {
        Console.WriteLine($"Processing segment data: {segment.Data}");
        for (DateTime d = segment.Begin; d < segment.End; d = d.AddHours(1))
        {
            for (int i = 0; i < 100; i++)
            {
            }
            // Doing stuff..
        }
        segment.Status = DownloadStatusEnum.Done;
    }
}
/// <summary>
/// Simulated download connection that serves data at one interval size and
/// allows at most two concurrent users.
/// </summary>
public class Connection
{
    // 24-hour clock ("HH"); "hh" would print 00:00 as 12:00 and 13:00 as 01:00.
    private const string TimestampFormat = "yyyyMMdd HH:mm";

    // Upper bound (exclusive), in milliseconds, of the simulated download time.
    private const int MaxDelayMilliseconds = 1000;

    // System.Random is not thread-safe and Download runs on several worker
    // threads at once, so every access goes through randomLock.
    private static readonly Random random = new Random((int)DateTime.Now.Ticks);
    private static readonly object randomLock = new object();

    private readonly SemaphoreSlim semaphore = new SemaphoreSlim(2);

    /// <summary>Creates a connection serving the given interval size.</summary>
    public Connection(IntervalTypeEnum type)
    {
        IntervalType = type;
    }

    /// <summary>Interval size this connection serves.</summary>
    public IntervalTypeEnum IntervalType { get; set; }

    /// <summary>True when both slots of the connection are currently taken.</summary>
    public bool IsBusy
    {
        get
        {
            return semaphore.CurrentCount == 0;
        }
    }

    /// <summary>Blocks until a slot on this connection is free, then takes it.</summary>
    public void WaitOne()
    {
        semaphore.Wait();
    }

    /// <summary>
    /// Simulates a download by sleeping for a random time below one second.
    /// </summary>
    /// <returns>A text label describing the requested range.</returns>
    public string Download(DateTime begin, DateTime end)
    {
        var data = $"{begin.ToString(TimestampFormat)} - {end.ToString(TimestampFormat)}";
        Console.WriteLine($"Downloading {data}");

        int delay;
        lock (randomLock)
        {
            delay = random.Next(MaxDelayMilliseconds);
        }
        Thread.Sleep(delay);
        return data;
    }

    /// <summary>Returns a slot previously taken with <see cref="WaitOne"/>.</summary>
    public void Release()
    {
        semaphore.Release();
    }
}
/// <summary>
/// Process-wide registry of the connections available for downloading.
/// </summary>
public class Connections : Queue<Connection>
{
    // Created by the static initializer, so concurrent first access cannot
    // produce two different registries.
    private static readonly Connections instance = new Connections();

    /// <summary>The shared registry instance.</summary>
    public static Connections Instance => instance;

    /// <summary>
    /// Picks the first registered connection that is not busy. When every
    /// connection is busy the first registered one is returned.
    /// </summary>
    /// <returns>A connection, or null when none has been registered.</returns>
    public static Connection GetNextAvailable()
    {
        Connection fallback = null;
        foreach (var connection in Instance)
        {
            if (!connection.IsBusy)
            {
                return connection;
            }
            if (fallback == null)
            {
                fallback = connection;
            }
        }
        return fallback;
    }
}
/// <summary>Lifecycle states of a data segment.</summary>
public enum DownloadStatusEnum
{
    /// <summary>Initial state assigned by the DataSegment constructor.</summary>
    NeedsProcessing = 0,

    /// <summary>The segment has been reserved by a worker.</summary>
    InProgress = 1,

    /// <summary>The segment has been downloaded and processed.</summary>
    Done = 2
}
/// <summary>
/// One time-bounded piece of the series, together with its download status,
/// its result and an event that lets a consumer block until it is done.
/// </summary>
public class DataSegment
{
    /// <summary>Raised after every assignment to <see cref="Status"/>.</summary>
    public EventHandler OnStatusUpdate;

    // Signalled while the segment is Done; waiters block on it in Wait().
    private readonly ManualResetEvent resetEvent = new ManualResetEvent(false);

    private DownloadStatusEnum _status = DownloadStatusEnum.NeedsProcessing;

    /// <summary>Creates a segment covering <paramref name="begin"/> to <paramref name="end"/>.</summary>
    public DataSegment(DateTime begin, DateTime end)
    {
        Begin = begin;
        End = end;
        Status = DownloadStatusEnum.NeedsProcessing;
        Data = null;
    }

    /// <summary>Start of the period covered by this segment.</summary>
    public DateTime Begin { get; set; }

    /// <summary>End of the period covered by this segment.</summary>
    public DateTime End { get; set; }

    /// <summary>
    /// Current lifecycle state. Every assignment updates the wait event and
    /// raises <see cref="OnStatusUpdate"/>.
    /// </summary>
    public DownloadStatusEnum Status
    {
        get
        {
            return _status;
        }
        set
        {
            _status = value;
            Update();
        }
    }

    /// <summary>The downloaded result, or null when nothing has been stored yet.</summary>
    public string Data { get; set; }

    // Keeps the wait event in step with the status and notifies subscribers.
    void Update()
    {
        if (Status == DownloadStatusEnum.Done)
        {
            // If the task is finished, then trigger anyone waiting..
            resetEvent.Set();
        }
        else
        {
            // A segment that is re-queued after being Done must block waiters
            // again instead of reporting stale readiness.
            resetEvent.Reset();
        }

        // EventArgs.Empty rather than null, so handlers can safely touch the args.
        this.OnStatusUpdate?.Invoke(this, EventArgs.Empty);
    }

    /// <summary>
    /// Blocks until the segment is done or <paramref name="token"/> is cancelled,
    /// whichever happens first.
    /// </summary>
    public void Wait(CancellationToken token)
    {
        WaitHandle.WaitAny(
            new[] { token.WaitHandle, resetEvent });
    }
}
/// <summary>Chunk selection modes.</summary>
public enum ChunkType
{
    /// <summary>First member; underlying value 0.</summary>
    NextAvailable = 0,

    /// <summary>Second member; underlying value 1.</summary>
    Monthly = 1
}
/// <summary>Interval sizes used for connections and for segmenting a date range.</summary>
public enum IntervalTypeEnum
{
    /// <summary>One-hour intervals (underlying value 0).</summary>
    Hourly,

    /// <summary>One-day intervals (underlying value 1).</summary>
    Daily
}
/// <summary>
/// The chronological series of segments covering a date range, plus the
/// bookkeeping of which segments are unprocessed, in progress or completed.
/// </summary>
/// <remarks>
/// All bookkeeping sets are guarded by a lock on <c>unprocessedList</c>.
/// Segment status changes arrive through <c>DataSegment.OnStatusUpdate</c>.
/// </remarks>
public class DataSegments : IEnumerable<DataSegment>
{
    // Every segment, in chronological order. Filled by the constructor only.
    protected List<DataSegment> chunkList = new List<DataSegment>();

    // Status buckets; a segment is in at most one of them. unprocessedList also
    // serves as the lock object for all three sets.
    protected HashSet<DataSegment> unprocessedList = new HashSet<DataSegment>();
    protected HashSet<DataSegment> inProgressList = new HashSet<DataSegment>();
    protected HashSet<DataSegment> completedList = new HashSet<DataSegment>();

    /// <summary>
    /// Splits the range into interval-aligned segments. The first and last
    /// segments are clipped to <paramref name="begin"/> and <paramref name="end"/>;
    /// every other segment ends one tick before the next period starts.
    /// </summary>
    /// <param name="begin">Start of the range.</param>
    /// <param name="end">End of the range (inclusive).</param>
    /// <param name="intervalType">Size of each segment.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="intervalType"/> is not a supported interval.
    /// </exception>
    public DataSegments(DateTime begin, DateTime end, IntervalTypeEnum intervalType)
    {
        BeginDate = begin;
        EndDate = end;
        IntervalType = intervalType;

        // One segment for every aligned period that starts at or before the end date.
        DateTime beginPeriod = AlignToInterval(begin, intervalType);
        while (beginPeriod <= end)
        {
            DateTime endPeriod = NextPeriod(beginPeriod, intervalType);
            var chunk = new DataSegment(
                beginPeriod < begin ? begin : beginPeriod,
                endPeriod > end ? end : endPeriod.AddTicks(-1));
            chunk.OnStatusUpdate += OnStatusUpdated;
            chunkList.Add(chunk);
            unprocessedList.Add(chunk);
            beginPeriod = endPeriod;
        }
    }

    /// <summary>Interval size the range was split into.</summary>
    public IntervalTypeEnum IntervalType { get; set; }

    /// <summary>Start of the covered range.</summary>
    public DateTime BeginDate { get; set; }

    /// <summary>End of the covered range.</summary>
    public DateTime EndDate { get; set; }

    /// <summary>Number of segments that have not been reserved yet.</summary>
    public int UnprocessedCount
    {
        get
        {
            // Same lock object as the writers; locking anything else would not
            // exclude them.
            lock (unprocessedList)
            {
                return unprocessedList.Count;
            }
        }
    }

    /// <summary>
    /// Determines whether every segment of the series has reached the Done state.
    /// </summary>
    public bool IsComplete
    {
        get
        {
            lock (unprocessedList)
            {
                return chunkList.Count == completedList.Count;
            }
        }
    }

    /// <summary>
    /// Reserves the next unprocessed segment(s) in chronological order and marks
    /// them as in progress. For <see cref="IntervalTypeEnum.Daily"/> all
    /// unprocessed segments of the earliest pending calendar day are reserved;
    /// otherwise only the earliest unprocessed segment is.
    /// </summary>
    /// <param name="type">Interval size served by the requesting connection.</param>
    /// <returns>The reserved segments; empty when nothing is left to reserve.</returns>
    public List<DataSegment> GetNext(IntervalTypeEnum type)
    {
        var reserved = new List<DataSegment>();
        lock (unprocessedList)
        {
            DateTime? reservedDay = null;

            // Walk chunkList rather than the hash set: the list is chronological,
            // whereas HashSet<T> gives no ordering guarantee.
            foreach (var chunk in chunkList)
            {
                if (!unprocessedList.Contains(chunk))
                {
                    continue;
                }

                if (type == IntervalTypeEnum.Daily)
                {
                    if (reservedDay == null)
                    {
                        // Remember the full date (not just the day of month) so that
                        // the same day number in a later month is not mixed in.
                        reservedDay = chunk.Begin.Date;
                    }
                    else if (chunk.Begin.Date != reservedDay.Value)
                    {
                        Console.WriteLine("Reserving DAILY segment for download");
                        break;
                    }
                    reserved.Add(chunk);
                }
                else
                {
                    if (type == IntervalTypeEnum.Hourly)
                    {
                        Console.WriteLine("Reserving HOURLY segment for download");
                    }
                    reserved.Add(chunk);
                    break;
                }
            }

            // Mark the reservation while still holding the lock, so that no other
            // worker can pick the same segments. The status setter re-enters this
            // lock through OnStatusUpdated, which is safe because monitors are
            // re-entrant and the enumeration above has already finished.
            foreach (var chunk in reserved)
            {
                chunk.Status = DownloadStatusEnum.InProgress;
            }
        }
        return reserved;
    }

    /// <summary>Enumerates all segments in chronological order.</summary>
    public IEnumerator<DataSegment> GetEnumerator()
    {
        return chunkList.GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }

    // Moves a segment into the bucket matching its new status.
    void OnStatusUpdated(object sender, EventArgs args)
    {
        var dc = sender as DataSegment;
        if (dc == null)
        {
            return;
        }

        lock (unprocessedList)
        {
            switch (dc.Status)
            {
                case DownloadStatusEnum.NeedsProcessing:
                    unprocessedList.Add(dc);
                    inProgressList.Remove(dc);
                    completedList.Remove(dc);
                    break;
                case DownloadStatusEnum.InProgress:
                    unprocessedList.Remove(dc);
                    inProgressList.Add(dc);
                    completedList.Remove(dc);
                    break;
                case DownloadStatusEnum.Done:
                    unprocessedList.Remove(dc);
                    inProgressList.Remove(dc);
                    completedList.Add(dc);
                    break;
            }
        }
    }

    // Rounds a timestamp down to the start of its hour or day.
    private static DateTime AlignToInterval(DateTime value, IntervalTypeEnum intervalType)
    {
        switch (intervalType)
        {
            case IntervalTypeEnum.Hourly:
                return new DateTime(value.Year, value.Month, value.Day, value.Hour, 0, 0);
            case IntervalTypeEnum.Daily:
                return new DateTime(value.Year, value.Month, value.Day, 0, 0, 0);
            default:
                throw new ArgumentOutOfRangeException(nameof(intervalType), intervalType, "Unsupported interval type.");
        }
    }

    // Returns the start of the period following the one that starts at periodStart.
    private static DateTime NextPeriod(DateTime periodStart, IntervalTypeEnum intervalType)
    {
        return intervalType == IntervalTypeEnum.Daily
            ? periodStart.AddDays(1)
            : periodStart.AddHours(1);
    }
}