C# 从HTML源代码中动态生成的链接下载文件
我正试图从澳大利亚获取天气数据。手动方式是转到并单击“所有年份的数据”,然后下载文件 以下是我尝试将其自动化的内容:C# 从HTML源代码中动态生成的链接下载文件,c#,ssis-2012,webclient-download,C#,Ssis 2012,Webclient Download,我正试图从澳大利亚获取天气数据。手动方式是转到并单击“所有年份的数据”,然后下载文件 以下是我尝试将其自动化的内容: using (WebClient client = new WebClient()) { string html = client.DownloadString("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display
// Fetches the station's daily-data page, picks the first extracted link that
// points at the zipped data file, and starts downloading it to the path held
// by the "data.zip" connection.
using (WebClient web = new WebClient())
{
    string pageHtml = web.DownloadString("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");

    // Only the first link carrying the zipped-file prefix is used; Find yields
    // null when no extracted link matches.
    string zipLink = LinkExtractor.Extract(pageHtml).Find(
        candidate => candidate.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile"));

    if (zipLink != null)
    {
        // The extracted link is site-relative, so the host is prepended.
        string zipUrl = "http://www.bom.gov.au" + zipLink;
        MessageBox.Show(zipUrl);

        // NOTE(review): DownloadFileAsync returns without waiting for the
        // transfer, and the enclosing using block disposes the client right
        // afterwards -- confirm the download is actually able to finish.
        web.DownloadFileAsync(new Uri(zipUrl), Dts.Connections["data.zip"].ConnectionString);
    }
}
// Fetches the station's daily-data page, looks through the extracted links for
// the first one that points at the zipped data file, and starts downloading it
// to the path held by the "data.zip" connection.
using (WebClient client = new WebClient())
{
    string html = client.DownloadString("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");
    List<string> list = LinkExtractor.Extract(html);
    foreach (var link in list)
    {
        if (link.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile"))
        {
            // The extracted link is site-relative, so the host is prepended.
            string resource = "http://www.bom.gov.au" + link;
            MessageBox.Show(resource);

            // NOTE(review): DownloadFileAsync returns without waiting for the
            // transfer, and the enclosing using block disposes the client
            // right afterwards -- confirm the download is able to finish.
            client.DownloadFileAsync(new Uri(resource), Dts.Connections["data.zip"].ConnectionString);

            // Only the first matching link is downloaded.
            break;
        }
    }
}
不必关心 LinkExtractor,它工作正常——我能够看到提供该文件的链接。问题在于 DownloadFileAsync 会发起一个新的请求,而这个请求无法下载该文件,因为下载文件需要处于同一个会话中。
有什么办法可以做到这一点吗?如果需要进一步说明,请告诉我。
更新:
以下是我利用HttpWebRequest中的cookies所做的更改。但是,我仍然无法下载该文件
// Requests the daily-data page with a cookie container attached, then tries to
// download the zipped data file with a WebClient that sends back the cookies
// the server set on that first response.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");

// A CookieContainer must be assigned for response.Cookies to be populated.
request.CookieContainer = new CookieContainer();

// using releases the response (and its connection) on every path, including
// a non-OK status or an exception thrown while reading or downloading.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
    // Show each cookie for debugging and remember it for the follow-up request.
    List<string> cookiePairs = new List<string>();
    foreach (Cookie cook in response.Cookies)
    {
        MessageBox.Show(cook.ToString());
        cookiePairs.Add(cook.ToString());
    }

    if (response.StatusCode == HttpStatusCode.OK)
    {
        // Fall back to StreamReader's default encoding when the server sent
        // no (or an empty) charset; Encoding.GetEncoding rejects an empty name.
        bool hasCharset = !string.IsNullOrEmpty(response.CharacterSet);

        string data;
        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = hasCharset
            ? new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet))
            : new StreamReader(receiveStream))
        {
            data = readStream.ReadToEnd();
        }

        using (WebClient client = new WebClient())
        {
            // Send all cookies in ONE Cookie header separated by "; ".
            // Adding the header once per cookie would comma-join the values,
            // which is not the Cookie header syntax.
            if (cookiePairs.Count > 0)
            {
                client.Headers.Add(HttpRequestHeader.Cookie, string.Join("; ", cookiePairs.ToArray()));
            }

            List<string> list = LinkExtractor.Extract(data);
            foreach (var link in list)
            {
                // Ordinal: the prefix is a URL fragment, not linguistic text.
                if (link.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile", StringComparison.Ordinal))
                {
                    string initial = "http://www.bom.gov.au" + link;
                    MessageBox.Show(initial);

                    // NOTE(review): the link is used exactly as LinkExtractor
                    // returned it; if it still contains HTML entities such as
                    // &amp; it presumably has to be decoded first -- confirm.
                    client.DownloadFile(new Uri(initial), Dts.Connections["data.zip"].ConnectionString);

                    // Only the first matching link is downloaded.
                    break;
                }
            }
        }
    }
}
// Requests the daily-data page with a cookie container attached, then tries to
// download the zipped data file with a WebClient that sends back the cookies
// the server set on that first response.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");

// A CookieContainer must be assigned for response.Cookies to be populated.
request.CookieContainer = new CookieContainer();

// using releases the response (and its connection) on every path, including
// a non-OK status or an exception thrown while reading or downloading.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
    // Show each cookie for debugging and remember it for the follow-up request.
    List<string> cookiePairs = new List<string>();
    foreach (Cookie cook in response.Cookies)
    {
        MessageBox.Show(cook.ToString());
        cookiePairs.Add(cook.ToString());
    }

    if (response.StatusCode == HttpStatusCode.OK)
    {
        // Fall back to StreamReader's default encoding when the server sent
        // no (or an empty) charset; Encoding.GetEncoding rejects an empty name.
        bool hasCharset = !string.IsNullOrEmpty(response.CharacterSet);

        string data;
        using (Stream receiveStream = response.GetResponseStream())
        using (StreamReader readStream = hasCharset
            ? new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet))
            : new StreamReader(receiveStream))
        {
            data = readStream.ReadToEnd();
        }

        using (WebClient client = new WebClient())
        {
            // Send all cookies in ONE Cookie header separated by "; ".
            // Adding the header once per cookie would comma-join the values,
            // which is not the Cookie header syntax.
            if (cookiePairs.Count > 0)
            {
                client.Headers.Add(HttpRequestHeader.Cookie, string.Join("; ", cookiePairs.ToArray()));
            }

            List<string> list = LinkExtractor.Extract(data);
            foreach (var link in list)
            {
                // Ordinal: the prefix is a URL fragment, not linguistic text.
                if (link.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile", StringComparison.Ordinal))
                {
                    string initial = "http://www.bom.gov.au" + link;
                    MessageBox.Show(initial);

                    // NOTE(review): the link is used exactly as LinkExtractor
                    // returned it; if it still contains HTML entities such as
                    // &amp; it presumably has to be decoded first -- confirm.
                    client.DownloadFile(new Uri(initial), Dts.Connections["data.zip"].ConnectionString);

                    // Only the first matching link is downloaded.
                    break;
                }
            }
        }
    }
}
你拿到的 HTML 中的 URL 是经过 HTML 编码的(例如查询字符串中的 & 会被写成 &amp;)。因此,从 HTML 中截取出 URL 之后,需要先对它进行解码才能使用。zip 文件的下载 URL 在 HTML 中就是这种编码后的形式:
框架中有现成的辅助类可以为我们完成解码(例如下面代码中使用的 WebUtility.HtmlDecode):
下面这段代码确实可以下载 zip 文件:
// Downloads the station's daily-data page, cuts the zipped-data-file link out
// of the raw HTML, HTML-decodes it, and downloads the zip to c:\temp\bom2.zip.
using (var client = new WebClient())
{
    const string linkPrefix = "/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile";

    var url = "http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064";
    string html = client.DownloadString(url);

    // Ordinal: the prefix is a URL fragment, not linguistic text.
    var pos = html.IndexOf(linkPrefix, StringComparison.Ordinal);
    if (pos < 0)
    {
        // Fail with a clear message instead of an out-of-range error later on.
        throw new InvalidOperationException("Zipped data file link not found in the downloaded page.");
    }

    // The link is taken to end at the next double quote after its start
    // (assumes it sits inside a double-quoted attribute value).
    var endpos = html.IndexOf('"', pos);
    if (endpos < 0)
    {
        throw new InvalidOperationException("Zipped data file link is not terminated by a double quote.");
    }

    string link = html.Substring(pos, endpos - pos);

    // The URL is HTML-encoded inside the page source, so decode it before use.
    var decodedLink = WebUtility.HtmlDecode(link);
    string resource = "http://www.bom.gov.au" + decodedLink;
    client.DownloadFile(new Uri(resource), @"c:\temp\bom2.zip");
}
在这种情况下,你不需要保留 cookie,但需要小心地解析 URL。 既然浏览该网站并不需要用户凭据,你能否详细说明一下使用 cookie 会有什么帮助? 因为有些网站很在意自己的内容,会采取一些措施防止被轻易抓取:有些可能需要会话 cookie,有些在每次 GET 时生成唯一的 URL,有些需要 Referer 请求头,有些会运行 JavaScript 并发出若干 ajax 请求。如果你能用浏览器成功下载文件,只需模仿浏览器的行为即可;WebClient 不会自动替你做这些事。请使用浏览器的开发者工具来确定后续 HTTP 调用需要什么。 当我单击下载文件时,控制台显示:“资源被解释为文档(Document),但以 MIME 类型 application/zip 传输”。 你需要查看“网络”选项卡,并研究请求头和响应头…… 我可以看到请求 cookie。如何把第一个链接的请求 cookie 设置为第二个链接的请求 cookie?“CookieAwareWebClient”示例对我不起作用。
// Downloads the station's daily-data page, cuts the zipped-data-file link out
// of the raw HTML, HTML-decodes it, and downloads the zip to c:\temp\bom2.zip.
using (var client = new WebClient())
{
    const string linkPrefix = "/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile";

    var url = "http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064";
    string html = client.DownloadString(url);

    // Ordinal: the prefix is a URL fragment, not linguistic text.
    var pos = html.IndexOf(linkPrefix, StringComparison.Ordinal);
    if (pos < 0)
    {
        // Fail with a clear message instead of an out-of-range error later on.
        throw new InvalidOperationException("Zipped data file link not found in the downloaded page.");
    }

    // The link is taken to end at the next double quote after its start
    // (assumes it sits inside a double-quoted attribute value).
    var endpos = html.IndexOf('"', pos);
    if (endpos < 0)
    {
        throw new InvalidOperationException("Zipped data file link is not terminated by a double quote.");
    }

    string link = html.Substring(pos, endpos - pos);

    // The URL is HTML-encoded inside the page source, so decode it before use.
    var decodedLink = WebUtility.HtmlDecode(link);
    string resource = "http://www.bom.gov.au" + decodedLink;
    client.DownloadFile(new Uri(resource), @"c:\temp\bom2.zip");
}