C# 下载多个文件的更快方法
我需要从SEC网站下载大约200万个文件。每个文件都有一个唯一的url,平均为10kB。这是我当前的实现:
// Sequentially downloads every URL through a WebBrowser control.
// NOTE(review): a WebBrowser control parses and renders each page, which is
// far heavier than a raw HTTP request — see the HttpWebRequest version below.
List<string> urls = new List<string>();
// ... initialize urls ...
WebBrowser browser = new WebBrowser();
foreach (string url in urls)
{
    browser.Navigate(url);
    // Pump the message loop until the navigation completes.
    while (browser.ReadyState != WebBrowserReadyState.Complete) Application.DoEvents();
    // Original line was a syntax error: "new StreamWriter(), url.Substring(...)".
    // The file name must be the constructor argument; "+ 1" drops the leading '/'
    // so the file is created in the current directory, not the drive root.
    using (StreamReader sr = new StreamReader(browser.DocumentStream))
    using (StreamWriter sw = new StreamWriter(url.Substring(url.LastIndexOf('/') + 1)))
    {
        sw.Write(sr.ReadToEnd());
    }
}
/// <summary>
/// Entry point: downloads all URLs with 8-way parallelism and sums the
/// number of retries reported by <c>downloadFile</c>.
/// </summary>
// Original declared "void Main(void)" — C-style "(void)" is invalid C#,
// and Main must be static.
static void Main()
{
    // The default per-host connection limit is 2; without raising it the
    // parallel workers would serialize behind each other.
    ServicePointManager.DefaultConnectionLimit = 10000;
    List<string> urls = new List<string>();
    // ... initialize urls ...
    int retries = urls.AsParallel().WithDegreeOfParallelism(8).Sum(arg => downloadFile(arg));
}
/// <summary>
/// Downloads one URL to a local file named after the last path segment.
/// Transient failures (timeouts, dropped connections) are retried up to a
/// bounded number of times; 404/403 are treated as permanent and skipped.
/// </summary>
/// <param name="url">Absolute URL of the file to fetch.</param>
/// <returns>The number of retries that were performed for this URL.</returns>
public int downloadFile(string url)
{
    int retries = 0;
    const int maxRetries = 10; // original used "goto retry" with no cap — could loop forever
    while (true)
    {
        try
        {
            HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(url);
            webrequest.Timeout = 10000;
            webrequest.ReadWriteTimeout = 10000;
            webrequest.Proxy = null;
            webrequest.KeepAlive = false;
            // Original called GetResponse() twice, leaking the first response and
            // issuing two HTTP requests per file; "webresponse" was also never
            // declared. Request exactly once and dispose deterministically.
            using (HttpWebResponse webresponse = (HttpWebResponse)webrequest.GetResponse())
            using (Stream sr = webresponse.GetResponseStream())
            // "+ 1" drops the leading '/' so the file lands in the current
            // directory instead of the drive root.
            using (FileStream sw = File.Create(url.Substring(url.LastIndexOf('/') + 1)))
            {
                sr.CopyTo(sw);
            }
            return retries;
        }
        catch (WebException ee)
        {
            // Dispatch on status codes rather than comparing localized message
            // strings (the original broke on any non-English .NET runtime).
            var resp = ee.Response as HttpWebResponse;
            if (resp != null &&
                (resp.StatusCode == HttpStatusCode.NotFound ||
                 resp.StatusCode == HttpStatusCode.Forbidden))
            {
                return retries; // permanent failure: skip this file
            }
            // Transient (timeout, aborted, connect failure, 408): retry.
            // No MessageBox here — a modal dialog inside a parallel worker
            // processing 2M files would hang the whole run.
            if (++retries >= maxRetries) return retries;
        }
    }
}
List url=new List();
// ... 初始化URL。。。
WebBrowser browser=新的WebBrowser();
foreach(url中的字符串url)
{
浏览器.导航(url);
while(browser.ReadyState!=WebBrowserReadyState.Complete)Application.DoEvents();
StreamReader sr=新的StreamReader(browser.DocumentStream);
StreamWriter sw=newstreamwriter(),url.Substring(url.LastIndexOf('/'));
sw.Write(sr.ReadToEnd());
高级关闭();
sw.Close();
}
预计完成时间约为12天……有没有更快的办法?
编辑:顺便说一句,本地文件处理只需要7%的时间
编辑:这是我的最终实现:
// Sequentially downloads every URL through a WebBrowser control.
// NOTE(review): a WebBrowser control parses and renders each page, which is
// far heavier than a raw HTTP request — see the HttpWebRequest version below.
List<string> urls = new List<string>();
// ... initialize urls ...
WebBrowser browser = new WebBrowser();
foreach (string url in urls)
{
    browser.Navigate(url);
    // Pump the message loop until the navigation completes.
    while (browser.ReadyState != WebBrowserReadyState.Complete) Application.DoEvents();
    // Original line was a syntax error: "new StreamWriter(), url.Substring(...)".
    // The file name must be the constructor argument; "+ 1" drops the leading '/'
    // so the file is created in the current directory, not the drive root.
    using (StreamReader sr = new StreamReader(browser.DocumentStream))
    using (StreamWriter sw = new StreamWriter(url.Substring(url.LastIndexOf('/') + 1)))
    {
        sw.Write(sr.ReadToEnd());
    }
}
/// <summary>
/// Entry point: downloads all URLs with 8-way parallelism and sums the
/// number of retries reported by <c>downloadFile</c>.
/// </summary>
// Original declared "void Main(void)" — C-style "(void)" is invalid C#,
// and Main must be static.
static void Main()
{
    // The default per-host connection limit is 2; without raising it the
    // parallel workers would serialize behind each other.
    ServicePointManager.DefaultConnectionLimit = 10000;
    List<string> urls = new List<string>();
    // ... initialize urls ...
    int retries = urls.AsParallel().WithDegreeOfParallelism(8).Sum(arg => downloadFile(arg));
}
/// <summary>
/// Downloads one URL to a local file named after the last path segment.
/// Transient failures (timeouts, dropped connections) are retried up to a
/// bounded number of times; 404/403 are treated as permanent and skipped.
/// </summary>
/// <param name="url">Absolute URL of the file to fetch.</param>
/// <returns>The number of retries that were performed for this URL.</returns>
public int downloadFile(string url)
{
    int retries = 0;
    const int maxRetries = 10; // original used "goto retry" with no cap — could loop forever
    while (true)
    {
        try
        {
            HttpWebRequest webrequest = (HttpWebRequest)WebRequest.Create(url);
            webrequest.Timeout = 10000;
            webrequest.ReadWriteTimeout = 10000;
            webrequest.Proxy = null;
            webrequest.KeepAlive = false;
            // Original called GetResponse() twice, leaking the first response and
            // issuing two HTTP requests per file; "webresponse" was also never
            // declared. Request exactly once and dispose deterministically.
            using (HttpWebResponse webresponse = (HttpWebResponse)webrequest.GetResponse())
            using (Stream sr = webresponse.GetResponseStream())
            // "+ 1" drops the leading '/' so the file lands in the current
            // directory instead of the drive root.
            using (FileStream sw = File.Create(url.Substring(url.LastIndexOf('/') + 1)))
            {
                sr.CopyTo(sw);
            }
            return retries;
        }
        catch (WebException ee)
        {
            // Dispatch on status codes rather than comparing localized message
            // strings (the original broke on any non-English .NET runtime).
            var resp = ee.Response as HttpWebResponse;
            if (resp != null &&
                (resp.StatusCode == HttpStatusCode.NotFound ||
                 resp.StatusCode == HttpStatusCode.Forbidden))
            {
                return retries; // permanent failure: skip this file
            }
            // Transient (timeout, aborted, connect failure, 408): retry.
            // No MessageBox here — a modal dialog inside a parallel worker
            // processing 2M files would hang the whole run.
            if (++retries >= maxRetries) return retries;
        }
    }
}
void主管道(void)
{
ServicePointManager.DefaultConnectionLimit=10000;
列表URL=新列表();
//…初始化URL。。。
int retries=url.aspallel().WithDegreeOfParallelism(8).Sum(arg=>downloadFile(arg));
}
公共int下载文件(字符串url)
{
int重试次数=0;
重试:
尝试
{
HttpWebRequest webrequest=(HttpWebRequest)webrequest.Create(url);
webrequest.Timeout=10000;
webrequest.ReadWriteTimeout=10000;
webrequest.Proxy=null;
webrequest.KeepAlive=false;
webresponse=(HttpWebResponse)webrequest.GetResponse();
使用(Stream sr=webrequest.GetResponse().GetResponseStream())
使用(FileStream sw=File.Create(url.Substring(url.LastIndexOf('/')))
{
高级文书主任(西南);
}
}
捕获(异常ee)
{
if(ee.Message!=“远程服务器返回错误:(404)未找到。”&&ee.Message!=“远程服务器返回错误:(403)禁止。”)
{
if(ee.Message.StartsWith(“操作已超时”)| ee.Message==“无法连接到远程服务器”| ee.Message.StartsWith(“请求已中止”)| ee.Message.StartsWith(“无法从传输连接读取数据:”)| ee.Message==“远程服务器返回错误:(408)请求超时。”)重试++;
else MessageBox.Show(ee.Message,“Error”,MessageBoxButtons.OK,MessageBoxIcon.Error);
转到重试;
}
}
返回重试次数;
}
以多个线程下载文件。线程数取决于吞吐量。另外,请看一看课程。简单示例:
var list = new[]
{
    "http://google.com",
    "http://yahoo.com",
    "http://stackoverflow.com"
};
// Parallel.ForEach blocks until every iteration has finished and returns a
// ParallelLoopResult — it does not produce tasks, so "var tasks" was misleading.
// Cap the parallelism explicitly (as the rest of this answer recommends) so a
// large URL list does not flood the server with simultaneous requests.
var loopResult = Parallel.ForEach(
    list,
    new ParallelOptions { MaxDegreeOfParallelism = 10 },
    s =>
    {
        // One WebClient per iteration: WebClient is not thread-safe.
        using (var client = new WebClient())
        {
            Console.WriteLine($"starting to download {s}");
            string result = client.DownloadString((string)s);
            Console.WriteLine($"finished downloading {s}");
        }
    });
同时执行下载,而不是按顺序执行,并设置合理的MaxDegreeOfParallelism,否则您将尝试同时发出过多请求,这看起来像是DOS攻击:
/// <summary>
/// Entry point: downloads every URL with bounded parallelism.
/// </summary>
public static void Main(string[] args)
{
    // Critical for throughput (see discussion below): the default per-host
    // HTTP connection limit is 2, which would serialize the parallel workers.
    ServicePointManager.DefaultConnectionLimit = 10000;
    var urls = new List<string>();
    Parallel.ForEach(
        urls,
        new ParallelOptions { MaxDegreeOfParallelism = 10 },
        DownloadFile);
}
/// <summary>
/// Downloads <paramref name="url"/> and writes the body to a file named
/// after the last path segment of the URL, in the current directory.
/// </summary>
public static void DownloadFile(string url)
{
    // Dispose the WebResponse too — the original only disposed the stream
    // readers, leaving the response (and its connection slot) to the GC.
    using (var response = WebRequest.Create(url).GetResponse())
    using (var sr = new StreamReader(response.GetResponseStream()))
    // "+ 1" drops the leading '/'; without it the path begins with '/' and
    // the file is created in the drive root (or the write fails).
    using (var sw = new StreamWriter(url.Substring(url.LastIndexOf('/') + 1)))
    {
        sw.Write(sr.ReadToEnd());
    }
}
publicstaticvoidmain(字符串[]args)
{
var url=新列表();
并行ForEach(
网址,
新的并行选项{MaxDegreeOfParallelism=10},
下载文件);
}
公共静态void下载文件(字符串url)
{
使用(var sr=newstreamreader)(HttpWebRequest.Create(url)
.GetResponse().GetResponseStream())
使用(var sw=newstreamwriter(url.Substring(url.LastIndexOf('/')))
{
sw.Write(sr.ReadToEnd());
}
}
我会使用多个线程并行运行,并使用一个网络客户端。我建议将最大并行度设置为所需的线程数,因为未指定的并行度对于长时间运行的任务不起作用。我在我的一个项目中使用了50次并行下载,没有任何问题,但是根据单个下载的速度,低得多可能就足够了
如果您从同一台服务器并行下载多个文件,默认情况下,您只能并行下载少量文件(2或4)。虽然http标准规定了这样一个下限,但许多服务器并不强制执行。使用ServicePointManager.DefaultConnectionLimit=10000
以增加限制。这些文件不能合并到一个归档文件中并在一个单元中下载吗?您使用浏览器控件而不是WebRequest
?@CodeInChaos原因是我不知道它们之间的差异…@CodeInChaos我用一个连续的WebRequest测试过,实际上是30%的速度,我觉得很可疑。您正在使用来自多个线程的共享浏览器实例。从另一个线程调用Application.DoEvents
也可能是错误的。@CodeInChaos 同意,我当时专注于并行性而没有考虑下载实现,将修复……现在已修复,用 HttpWebRequest 替换了浏览器控件。谢谢,使用此方法(同时设置 ServicePointManager.DefaultConnectionLimit = 10000;)。我想这是由于服务器限制,还有什么建议吗?我怀疑瓶颈是服务器上每个客户端(IP地址)的并发连接数限制。如果您知道该限制是多少,可以将 MaxDegreeOfParallelism 设置为与其匹配——继续增加并行度并不会更快,但能避免请求阻塞等待连接。如果您有可以扩展的资源,则可以获得更高的吞吐量,例如把URL拆分给 n 个客户端,每个客户端使用不同的IP地址同时运行。确实,ServicePointManager.DefaultConnectionLimit = 10000; 事实证明对获得高于2倍的加速至关重要。这里唯一缺少的是设置 MaxDegreeOfParallelism:提问者声明有200万个文件,如果不加限制,上面的代码会一次性排入200万个工作项,远超服务器允许和/或能够处理的并发请求数。最好将其限制为目标服务器允许的每客户端最大连接数。