C#:将项目添加到 Parallel.ForEach 正在使用的 ConcurrentBag
标签:c#, multithreading
我试图同时抓取几个 URL。每个请求可能会向 ConcurrentBag 添加更多 URL 以进行爬网。目前,我用一个很糟糕的 while(true) 循环来不断启动新的 Parallel.ForEach,以处理任何新的 URL。
是否有任何方法可以向 ConcurrentBag 中添加内容,使 Parallel.ForEach 能看到其中有新项目,并继续迭代这些新项目?
// Shared work list: every processed link may add newly discovered links to it.
ConcurrentBag<LinkObject> URLSToCheck = new ConcurrentBag<LinkObject>();
// Start a fresh Parallel.ForEach pass over the bag on every iteration until it reports empty.
while (true)
{
// Run the body for the bag's items with at most 5 concurrent workers.
Parallel.ForEach(URLSToCheck, new ParallelOptions { MaxDegreeOfParallelism = 5 }, URL =>
{
Checker Checker = new Checker();
// Fetch the link's destinationURL, then extract links from the returned html.
URLDownloadResult result = Checker.downloadFullURL(URL.destinationURL);
List<LinkObject> URLsToAdd = Checker.findInternalUrls(URL.sourceURL, result.html);
// Add a copy of each discovered link to the shared bag.
foreach (var URLToAdd in URLsToAdd)
{
URLSToCheck.Add(new LinkObject { sourceURL = URLToAdd.sourceURL, destinationURL = URLToAdd.destinationURL });
}
});
// NOTE(review): nothing visible in this loop removes items from URLSToCheck, so Count
// presumably only reaches 0 when the bag was never filled — confirm how processed
// items are meant to be drained.
if(URLSToCheck.Count == 0)break;
}
ConcurrentBag<LinkObject> URLSToCheck = new ConcurrentBag<LinkObject>();
while (true)
{
Parallel.ForEach(URLSToCheck, new ParallelOptions { MaxDegreeOfParallelism = 5 }, URL =>
{
Checker Checker = new Checker();
URLDownloadResult result = Checker.downloadFullURL(URL.destinationURL);
List<LinkObject> URLsToAdd = Checker.findInternalUrls(URL.sourceURL, result.html);
foreach (var URLToAdd in URLsToAdd)
{
URLSToCheck.Add(new LinkObject { sourceURL = URLToAdd.sourceURL, destinationURL = URLToAdd.destinationURL });
}
});
if(URLSToCheck.Count == 0)break;
}
TPL Dataflow(数据流)在这里很方便。使用它可以很好地完成这件事:
// The block's delegate posts newly found links back into the block itself, so the
// variable is declared (as null) before the lambda that captures it is created.
ActionBlock<LinkObject> actionBlock = null;
// Process each posted link with at most 5 concurrent workers.
actionBlock = new ActionBlock<LinkObject>(URL =>
{
    Checker Checker = new Checker();
    // Fetch the link's destinationURL, then extract links from the returned html.
    URLDownloadResult result = Checker.downloadFullURL(URL.destinationURL);
    List<LinkObject> URLsToAdd = Checker.findInternalUrls(URL.sourceURL, result.html);
    // Post returns bool, so it cannot be handed to List<T>.ForEach (which expects an
    // Action<T>) as a method group; post each discovered link explicitly instead.
    foreach (LinkObject discovered in URLsToAdd)
    {
        actionBlock.Post(discovered);
    }
}, new ExecutionDataflowBlockOptions { MaxDegreeOfParallelism = 5 });
你可以看看 BlockingCollection。
BlockingCollection 提供了生产者/消费者模式的实现:生产者向 BlockingCollection 中添加项目,而 Parallel.ForEach 从该集合中消费项目。
为此,您必须为 BlockingCollection 实现一个自定义分区器(Partitioner)(原文此处附有解释原因的链接)。
分区器:
/// <summary>
/// Partitioner whose partitions all draw from the consuming enumerable of a single
/// <see cref="BlockingCollection{T}"/>.
/// </summary>
class BlockingCollectionPartitioner<T> : Partitioner<T>
{
    // Collection whose consuming enumerable backs every partition.
    private BlockingCollection<T> _source;

    /// <summary>Wraps the given collection.</summary>
    /// <exception cref="ArgumentNullException">collection is null.</exception>
    internal BlockingCollectionPartitioner(BlockingCollection<T> collection)
    {
        if (collection == null)
        {
            throw new ArgumentNullException("collection");
        }

        _source = collection;
    }

    /// <summary>Always true: this partitioner supports dynamic partitions.</summary>
    public override bool SupportsDynamicPartitions
    {
        get
        {
            return true;
        }
    }

    /// <summary>
    /// Returns partitionCount enumerators, each obtained from the same dynamic-partitions enumerable.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">partitionCount is less than 1.</exception>
    public override IList<IEnumerator<T>> GetPartitions(int partitionCount)
    {
        if (partitionCount < 1)
        {
            throw new ArgumentOutOfRangeException("partitionCount");
        }

        IEnumerable<T> shared = GetDynamicPartitions();
        IEnumerator<T>[] partitions = new IEnumerator<T>[partitionCount];
        for (int index = 0; index < partitions.Length; index++)
        {
            partitions[index] = shared.GetEnumerator();
        }

        return partitions;
    }

    /// <summary>Returns the wrapped collection's consuming enumerable.</summary>
    public override IEnumerable<T> GetDynamicPartitions()
    {
        return _source.GetConsumingEnumerable();
    }
}
处理完所有要处理的 URL 后,调用 URLSToCheck.CompleteAdding(),Parallel.ForEach 应该会自动停止。
深入研究递归代码可能会有所帮助。这是一个典型的应用示例。顺便说一句,小心循环引用。
谢谢,这真的很有帮助 :-) 如果其他人也使用这种方案,请通过 NuGet 安装 Microsoft.Tpl.Dataflow。
/// <summary>
/// Partitioner whose partitions all draw from the consuming enumerable of a single
/// <see cref="BlockingCollection{T}"/>.
/// </summary>
class BlockingCollectionPartitioner<T> : Partitioner<T>
{
    // Collection whose consuming enumerable backs every partition.
    private BlockingCollection<T> _source;

    /// <summary>Wraps the given collection.</summary>
    /// <exception cref="ArgumentNullException">collection is null.</exception>
    internal BlockingCollectionPartitioner(BlockingCollection<T> collection)
    {
        if (collection == null)
        {
            throw new ArgumentNullException("collection");
        }

        _source = collection;
    }

    /// <summary>Always true: this partitioner supports dynamic partitions.</summary>
    public override bool SupportsDynamicPartitions
    {
        get
        {
            return true;
        }
    }

    /// <summary>
    /// Returns partitionCount enumerators, each obtained from the same dynamic-partitions enumerable.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">partitionCount is less than 1.</exception>
    public override IList<IEnumerator<T>> GetPartitions(int partitionCount)
    {
        if (partitionCount < 1)
        {
            throw new ArgumentOutOfRangeException("partitionCount");
        }

        IEnumerable<T> shared = GetDynamicPartitions();
        IEnumerator<T>[] partitions = new IEnumerator<T>[partitionCount];
        for (int index = 0; index < partitions.Length; index++)
        {
            partitions[index] = shared.GetEnumerator();
        }

        return partitions;
    }

    /// <summary>Returns the wrapped collection's consuming enumerable.</summary>
    public override IEnumerable<T> GetDynamicPartitions()
    {
        return _source.GetConsumingEnumerable();
    }
}
// Work queue consumed by the parallel loop below through the custom partitioner.
BlockingCollection<LinkObject> URLSToCheck = new BlockingCollection<LinkObject>();
// Consume the queue with at most 5 concurrent workers.
Parallel.ForEach(
new BlockingCollectionPartitioner<LinkObject>(URLSToCheck),
new ParallelOptions { MaxDegreeOfParallelism = 5 }, URL =>
{
// Per-link processing goes here (elided in the original snippet).
//....
});
// NOTE(review): Parallel.ForEach is a blocking call, so this Add presumably illustrates
// what a producer (for example the loop body above, or another thread) does rather than
// a statement meant to run after the loop — confirm.
URLSToCheck.Add(...)