C# 4.0 尝试下载 HTML 页面中的所有 URL_C# 4.0

C# 4.0 尝试下载 HTML 页面中的所有 URL

c#-4.0

C# 4.0 尝试下载 HTML 页面中的所有 URL,c#-4.0,C# 4.0,有人能帮我看看这段代码吗？我想下载这个 HTML 页面中的所有 URL（它们都指向 PDF 文件）。我理解基本逻辑，我想我只是把正则表达式写错了。以下是我目前写的代码： using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Net; using System.IO; using System.Text.RegularExpressions; namespace

有人能帮我看看这段代码吗？我想下载这个 HTML 页面中的所有 URL（它们都指向 PDF 文件）。

我理解基本逻辑，我想我只是弄乱了正则表达式。这就是我到目前为止所做的：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions; 

namespace DownloadPdfs
{
    /// <summary>
    /// Console utility that downloads every PDF linked from a single web page
    /// into the current working directory.
    /// </summary>
    class Program
    {
        /// <summary>Address of the page whose PDF links are downloaded.</summary>
        private const string PageUrl = "http://mises.org/books/";

        /// <summary>
        /// Matches a quoted <c>href</c> attribute whose value ends in ".pdf".
        /// The <c>q</c> group remembers the opening quote so the same character
        /// closes the value; the <c>Url</c> group captures the value itself.
        /// </summary>
        private static readonly Regex PdfHrefPattern = new Regex(
            @"href\s*=\s*(?<q>[""'])(?<Url>[^""']+?\.pdf)\k<q>",
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

        /// <summary>
        /// Fetches <see cref="PageUrl"/>, extracts the PDF links it contains and
        /// saves each one under its own file name in the current directory.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Uri pageUri = new Uri(PageUrl);
            string html = DownloadPage(pageUri);
            List<Uri> links = ExtractPdfLinks(html, pageUri);

            // One client is enough for all downloads; it is disposed once at the end.
            using (WebClient client = new WebClient())
            {
                foreach (Uri link in links)
                {
                    // A URL is not a valid local path (it contains ':' and '/'),
                    // so save under the last path segment instead.
                    string fileName = Path.GetFileName(link.LocalPath);
                    if (string.IsNullOrEmpty(fileName))
                    {
                        continue;
                    }

                    try
                    {
                        client.DownloadFile(link, fileName);
                    }
                    catch (WebException ex)
                    {
                        // A single dead link should not abort the remaining downloads.
                        Console.Error.WriteLine("Failed to download " + link + ": " + ex.Message);
                    }
                }
            }
        }

        /// <summary>Downloads the page at <paramref name="pageUri"/> and returns its text.</summary>
        /// <param name="pageUri">Absolute address of the page to fetch.</param>
        /// <returns>The response body decoded as text.</returns>
        private static string DownloadPage(Uri pageUri)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);

            // Dispose the response and its stream so the connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream resStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(resStream, Encoding.UTF8))
            {
                // StreamReader decodes across buffer boundaries, unlike decoding
                // fixed-size chunks one at a time with Encoding.ASCII.
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Collects the distinct PDF links found in <paramref name="html"/>,
        /// resolved against <paramref name="baseUri"/>.
        /// </summary>
        /// <param name="html">Markup to scan; may be null or empty.</param>
        /// <param name="baseUri">Address used to resolve relative links.</param>
        /// <returns>Absolute PDF addresses in order of first appearance.</returns>
        private static List<Uri> ExtractPdfLinks(string html, Uri baseUri)
        {
            List<Uri> links = new List<Uri>();
            if (string.IsNullOrEmpty(html))
            {
                return links;
            }

            HashSet<string> seen = new HashSet<string>(StringComparer.Ordinal);
            foreach (Match match in PdfHrefPattern.Matches(html))
            {
                // Attribute values are HTML-encoded in the markup (e.g. "&amp;").
                string href = WebUtility.HtmlDecode(match.Groups["Url"].Value).Trim();

                Uri absolute;
                if (Uri.TryCreate(baseUri, href, out absolute) && seen.Add(absolute.AbsoluteUri))
                {
                    links.Add(absolute);
                }
            }

            return links;
        }
    }
}

使用系统；
使用System.Collections.Generic；
使用System.Linq；
使用系统文本；
Net系统；
使用System.IO；
使用System.Text.RegularExpressions；
命名空间下载PDF
{
班级计划
{
静态void Main（字符串[]参数）
{
StringBuilder sb=新的StringBuilder（）；
字节[]buf=新字节[8192]；
HttpWebRequest请求=（HttpWebRequest）
WebRequest.Create（“http://mises.org/books/");
HttpWebResponse=（HttpWebResponse）
request.GetResponse（）；
Stream resStream=response.GetResponseStream（）；
字符串tempString=null；
整数计数=0；
做
{
count=resStream.Read（基本单位，0，基本单位长度）；
如果（计数！=0）
{
tempString=Encoding.ASCII.GetString（buf，0，count）；
某人附加（临时字符串）；
}
}
while（count>0）；//还有数据要读取吗？
字符串html=sb.ToString（）；
List listoflinks=新列表（）；
字符串输入=html；
Regex rx=new Regex（@“（？
尝试下面的代码。该模式将匹配锚标签（a 元素）的 HREF 属性值：
// Capture the quoted value of every href attribute that ends in ".pdf".
// The character class excludes only the closing quote, so values containing
// other dots (absolute URLs such as http://host.org/a.pdf, or names such as
// book.v2.pdf) are matched as well. No Multiline option is needed because the
// pattern has no ^ or $ anchors.
Regex rx = new Regex(@"href=""(?<Url>[^""]+\.pdf)""", RegexOptions.IgnoreCase);
for (Match match = rx.Match(input); match.Success; match = match.NextMatch())
{
    // The named group holds the attribute value without the surrounding quotes.
    var link = match.Groups["Url"].Value;
    listoflinks.Add(link);
}

Regex rx=newregex（@“href=”（？[^.”“]+\.pdf）”，RegexOptions.IgnoreCase | RegexOptions.Multiline）；
for（Match=rx.Match（输入）；Match.Success；Match=Match.NextMatch（））
{
var link=match.Groups[“Url”].Value；
添加（链接）；
}
使用类似 HtmlAgilityPack 的库来解析 HTML：
/// <summary>
/// Returns the distinct, trimmed, non-blank href values of all anchor
/// elements found in the given markup, in order of first appearance.
/// </summary>
/// <param name="html">HTML markup to parse.</param>
/// <returns>The collected link values; empty when no anchor has an href.</returns>
public List<string> GetLinks(string html)
{
    var document = new HtmlDocument();
    document.LoadHtml(html);

    var uniqueLinks = new List<string>();

    var anchors = document.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null)
    {
        // SelectNodes yields null rather than an empty collection when nothing matches.
        return uniqueLinks;
    }

    var seen = new HashSet<string>();
    foreach (var anchor in anchors)
    {
        if (!anchor.Attributes.Contains("href"))
        {
            continue;
        }

        string rawValue = anchor.Attributes["href"].Value;
        if (string.IsNullOrWhiteSpace(rawValue))
        {
            continue;
        }

        // Keep only the first occurrence of each trimmed value.
        string trimmed = rawValue.Trim();
        if (seen.Add(trimmed))
        {
            uniqueLinks.Add(trimmed);
        }
    }

    return uniqueLinks;
}

公共列表获取链接（字符串html）
{
var htmlDoc=新的HtmlDocument（）；
htmlDoc.LoadHtml（html）；
var linkNodes=htmlDoc.DocumentNode.SelectNodes（“//a[@href]”）；
if（linkNodes==null）
{
返回新列表（）；
}
var linkNodesWithLink=linkNodes.Where（x=>x.Attributes.Contains（“href”））.ToList（）；
var links=linkNodesWithLink.Select（x=>x.Attributes[“href”].Value）
.Where（x=>！string.IsNullOrWhiteSpace（x））
.Select（x=>x.Trim（））
.ToList（）；
links=links.Distinct（）.ToList（）；
返回链接；
}
我建议您改用 HTML Agility Pack，它能让您更容易地找到原始 HTML 文档中的链接。这里的第一个示例演示了如何检索所有链接。如果您使用的是 HTML Agility Pack，也请查看之前的相关问题。