C# Download a file from a dynamically generated link in the HTML source code


I am trying to get weather data from Australia. The manual way is to go to the site, click "All years of data", and download the file.

Here is my attempt to automate it:

using (WebClient client = new WebClient())
{
    string html = client.DownloadString("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");

    List<string> list = LinkExtractor.Extract(html);
    foreach (var link in list)
    {
        if (link.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile"))
        {
            string resource = "http://www.bom.gov.au" + link;
            MessageBox.Show(resource);

            client.DownloadFileAsync(new Uri(resource), Dts.Connections["data.zip"].ConnectionString);
            break;
        }
    }
}
Don't worry about LinkExtractor; it works, and through it I can see the link that serves the file. The problem is that DownloadFileAsync creates a new request, and that request is not allowed to download the file because the download requires the same session.

Is there a way to do this? Please ask if you need more clarification.

Update:

Here are the changes I made to use the cookies from the HttpWebRequest. However, I still cannot download the file.

HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064");
request.CookieContainer = new CookieContainer();

HttpWebResponse response = (HttpWebResponse)request.GetResponse();

foreach (Cookie cook in response.Cookies)
{
    MessageBox.Show(cook.ToString());
}

if (response.StatusCode == HttpStatusCode.OK)
{
    Stream receiveStream = response.GetResponseStream();
    StreamReader readStream = null;

    if (response.CharacterSet == null)
    {
        readStream = new StreamReader(receiveStream);
    }
    else
    {
        readStream = new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet));
    }

    string data = readStream.ReadToEnd();

    using (WebClient client = new WebClient())
    {
        foreach (Cookie cook in response.Cookies)
        {
            MessageBox.Show(cook.ToString());
            client.Headers.Add(HttpRequestHeader.Cookie, cook.ToString());
        }

        List<string> list = LinkExtractor.Extract(data);
        foreach (var link in list)
        {
            if (link.StartsWith("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile"))
            {
                string initial = "http://www.bom.gov.au" + link;
                MessageBox.Show(initial);

                //client.Headers.Add(HttpRequestHeader.Cookie, "JSESSIONID=2EBAFF7EFE2EEFE8140118CE5170B8F6");
                client.DownloadFile(new Uri(initial), Dts.Connections["data.zip"].ConnectionString);
                break;
            }
        }
    }

    response.Close();
    readStream.Close();
}

The HTML you get back, and the URLs in it, are HTML-encoded. That means that when you substring a URL out of the HTML, you ideally need to decode it first. This is what the download URL for the zip looks like:

There are helper classes that do the decoding for us:
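
As a quick illustration (the encoded href below is a made-up example, not copied from the actual page), System.Net.WebUtility.HtmlDecode turns the &amp; entities back into plain & characters:

using System.Net;

// Made-up example of an HTML-encoded href as it might appear in the page source.
string encoded = "/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&amp;p_stn_num=2064";

// HtmlDecode converts the &amp; entities back to & so the URL can actually be requested.
string decoded = WebUtility.HtmlDecode(encoded);
// decoded == "/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile&p_stn_num=2064"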

This code does download the zip file:

using (var client = new WebClient())
{
    var url = "http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=&p_c=&p_stn_num=2064";    
    string html = client.DownloadString(url);

    var pos = html.IndexOf("/jsp/ncc/cdio/weatherData/av?p_display_type=dailyZippedDataFile");
    var endpos = html.IndexOf('"', pos);
    string link = html.Substring(pos, endpos - pos);

    var decodedLink = WebUtility.HtmlDecode(link);
    string resource = "http://www.bom.gov.au" + decodedLink;                    


    client.DownloadFile(new Uri(resource), @"c:\temp\bom2.zip");

}

In this case you don't need to keep cookies, but you do need to be careful when parsing the URL.
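
If you want the parsing to be a little less fragile than the IndexOf/Substring approach above, one option (my own sketch, with a hypothetical helper name, not something from the answer) is a regex over the href attribute:

using System.Net;
using System.Text.RegularExpressions;

static class ZipLinkParser
{
    // Hypothetical helper: extract the zipped-data href from the downloaded HTML
    // with a regex instead of IndexOf/Substring, then HTML-decode the entities.
    public static string ExtractZipUrl(string html)
    {
        Match match = Regex.Match(
            html,
            "href=\"(/jsp/ncc/cdio/weatherData/av\\?p_display_type=dailyZippedDataFile[^\"]*)\"");

        if (!match.Success)
        {
            return null;
        }

        return "http://www.bom.gov.au" + WebUtility.HtmlDecode(match.Groups[1].Value);
    }
}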

Can you elaborate on how using cookies would help, given that browsing the site doesn't require user credentials?

Because some sites care about their content and take measures so that it can't be scraped easily. Some may need a session cookie, some generate a unique URL on every GET, some require a referer, some run JavaScript and make several AJAX requests. If you are able to download the file successfully with a browser, you just need to mimic it; WebClient won't do that by itself. Use the browser's developer console to work out what the subsequent HTTP calls need.

When I click to download the file, the console shows this: "Resource interpreted as Document but transferred with MIME type application/zip: ".

You need to look at the Network tab and study the request and response headers…

I can see the request cookies. How do I set the request cookies from the first link as the request cookies for the second link? The "CookieAwareWebClient" example didn't work for me.
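
For what it's worth, the "CookieAwareWebClient" pattern mentioned in the comments is usually just a WebClient subclass that routes every request through one shared CookieContainer. A minimal sketch (my own illustration, not necessarily the exact class the commenter tried) looks like this:

using System;
using System.Net;

// Minimal sketch of a cookie-aware WebClient: every request created by this
// client shares a single CookieContainer, so cookies received while downloading
// the HTML page are sent again when downloading the zip link.
public class CookieAwareWebClient : WebClient
{
    private readonly CookieContainer cookies = new CookieContainer();

    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);
        HttpWebRequest httpRequest = request as HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.CookieContainer = cookies;
        }
        return request;
    }
}

With that in place, DownloadString and DownloadFile can be called on the same CookieAwareWebClient instance and will share the session cookies, which is what the plain WebClient in the question does not do.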