C# 如何仅抓取网站的 <body> 标签_C#_.net_Html Parsing_Web Scraping

C# 如何仅抓取网站的 <body> 标签

c# .net web-scraping

C# 如何仅刮除<；车身>；标记网站,c#,.net,html-parsing,web-scraping,C#,.net,Html Parsing,Web Scraping,我在做一个网络爬虫。此时，我刮取整个内容，然后使用正则表达式删除、和其他标记，获得正文的内容但是，我正在尝试优化性能，我想知道是否有一种方法可以只刮页面的 namespace WebScraper { public static class KrioScraper { public static string scrapeIt(string siteToScrape) { string HTML = getHTML

我正在做一个网络爬虫。目前，我抓取整个页面内容，然后使用正则表达式删除 <script>、<style> 和其他标记，从而得到正文（body）的内容。

但是，我正在尝试优化性能，想知道是否有办法只抓取页面的 <body> 部分？

namespace WebScraper
{
    /// <summary>
    /// Downloads a web page and reduces its markup to plain text.
    /// </summary>
    public static class KrioScraper
    {
        // The patterns are constructed once and reused on every call instead of being
        // re-parsed (or looked up in the static Regex cache) each time stripCode runs.
        private const RegexOptions CaseInsensitive =
            RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

        // <script>...</script> blocks, including content that spans several lines.
        private static readonly Regex ScriptBlocks =
            new Regex("<script.*?</script>", RegexOptions.Singleline | CaseInsensitive);

        // <style>...</style> blocks, including content that spans several lines.
        private static readonly Regex StyleBlocks =
            new Regex("<style.*?</style>", RegexOptions.Singleline | CaseInsensitive);

        // Any opening or closing tag. Case-insensitive so that <DIV> and <BODY> are
        // stripped just like <div> and <body>.
        private static readonly Regex Tags =
            new Regex("</?[a-z][a-z0-9]*[^<>]*>", CaseInsensitive);

        // ".*?" under Singleline matches the same text as "(.|\s)*?" without the
        // alternation that makes the latter backtrack heavily on large inputs.
        private static readonly Regex Comments =
            new Regex("<!--.*?-->", RegexOptions.Singleline);

        // Remaining "<! ... >" declarations such as the doctype.
        private static readonly Regex Declarations =
            new Regex("<!.*?>", RegexOptions.Singleline);

        // Tabs and line breaks; each one is replaced by a single space.
        private static readonly Regex LineWhitespace = new Regex("[\t\r\n]");

        /// <summary>
        /// Downloads <paramref name="siteToScrape"/> and returns its content with
        /// scripts, styles, tags, comments and declarations removed.
        /// </summary>
        /// <param name="siteToScrape">Absolute URL of the page to download.</param>
        /// <returns>The stripped text of the page.</returns>
        public static string scrapeIt(string siteToScrape)
        {
            return stripCode(getHTML(siteToScrape));
        }

        /// <summary>
        /// Issues a GET request for <paramref name="siteToScrape"/> and returns the
        /// complete response body as a string.
        /// </summary>
        /// <param name="siteToScrape">Absolute URL of the page to download.</param>
        /// <returns>The response body exactly as read from the response stream.</returns>
        public static string getHTML(string siteToScrape)
        {
            HttpWebRequest objRequest =
                (HttpWebRequest) WebRequest.Create(siteToScrape);
            objRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; " +
                "Windows NT 5.1; .NET CLR 1.0.3705)";

            // Dispose the response as well as the reader: an undisposed
            // HttpWebResponse keeps its underlying connection checked out.
            using (HttpWebResponse objResponse =
                (HttpWebResponse) objRequest.GetResponse())
            using (StreamReader sr =
                new StreamReader(objResponse.GetResponseStream()))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Removes script blocks, style blocks, tags, comments and declarations from
        /// <paramref name="the_html"/>, then replaces every tab, carriage return and
        /// line feed with a space.
        /// </summary>
        /// <param name="the_html">The markup to clean.</param>
        /// <returns>The remaining text. Runs of spaces are not collapsed.</returns>
        public static string stripCode(string the_html)
        {
            // Scripts and styles go first so their bodies never reach the tag pass.
            the_html = ScriptBlocks.Replace(the_html, "");
            the_html = StyleBlocks.Replace(the_html, "");
            the_html = Tags.Replace(the_html, "");
            the_html = Comments.Replace(the_html, "");
            the_html = Declarations.Replace(the_html, "");
            the_html = LineWhitespace.Replace(the_html, " ");

            return the_html;
        }
    }
}

namespace-WebScraper
{
公共静态类KrioScraper
{    
公共静态字符串scrapeIt（字符串siteToScrape）
{
字符串HTML=getHTML（siteToScrape）；
字符串文本=条带代码（HTML）；
返回文本；
}
公共静态字符串getHTML（字符串siteToScrape）
{
字符串响应=”；
HttpWebResponse objResponse；
HttpWebRequest对象请求=
（HttpWebRequest）WebRequest.Create（siteToScrape）；
objRequest.UserAgent=“Mozilla/4.0（兼容；MSIE 6.0；”+
“Windows NT 5.1；.NET CLR 1.0.3705）”；
objResponse=（HttpWebResponse）objRequest.GetResponse（）；
使用（StreamReader sr=
新的StreamReader（objResponse.GetResponseStream（））
{
响应=sr.ReadToEnd（）；
高级关闭（）；
}
返回响应；
}
公共静态字符串stripCode（字符串为_html）
{
//删除谷歌分析代码和其他JS
Replace（the_html，“
我建议利用 HTML Agility Pack 来进行 HTML 解析/操作。
您可以像这样轻松地选择 body 节点：
// HtmlAgilityPack usage sketch: load the page at `url` and pick the node matched by
// the XPath "//body". The last line is an expression only; its result is not stored.
var webGet = new HtmlWeb();
var document = webGet.Load(url);
document.DocumentNode.SelectSingleNode("//body")

仍然是最简单/最快（最不准确）的方法
int start=response.IndexOf（“
我认为您最好的选择是使用一个轻量级 HTML 解析器（Majestic 12，根据我的测试，它比 HTML Agility Pack 快大约 50-100%），并且只处理您感兴趣的节点（介于 <body> 和 </body> 之间的任何节点）。Majestic 12 比 HTML Agility Pack 更难使用，但如果您追求性能，它肯定会对您有所帮助。
这将使您接近您所要求的内容，但您仍然需要下载整个页面。我认为没有办法解决这一问题。您将保存的内容实际上是为所有其他内容生成DOM节点（除了正文）。您必须对其进行分析，但可以跳过您不感兴趣处理的节点的全部内容

我没有一个现成的示例来说明如何抓取 <body>，但我有一个只抓取链接的示例，稍加修改就可以达到目的。以下是粗略的版本：
// Build a parser over the response bytes and hand it to GrabBody.
// NOTE(review): _response.BodyBytes is presumably the raw response body; it is not defined in this snippet.
GrabBody(ParserTools.OpenM12Parser(_response.BodyBytes));

您需要打开M12解析器（M12附带的示例项目中有注释，详细说明了所有这些选项是如何影响性能的，它们确实如此！！！）：
解析正文：
/// <summary>
/// Pulls chunks from <paramref name="parser"/> until it returns null and dispatches
/// on each chunk's type. Only the opening "body" tag has a placeholder for real work;
/// every other branch is intentionally empty.
/// </summary>
/// <param name="parser">An initialised parser positioned at the start of the document.</param>
public void GrabBody(HTMLparser parser)
{
    // Per the library's guidance the parser re-uses the HTMLchunk object it returns,
    // so the chunk must not be destroyed before parsing has finished.
    // A null result from ParseNext() marks the end of the input.
    for (HTMLchunk current = parser.ParseNext(); current != null; current = parser.ParseNext())
    {
        var kind = current.oType;

        if (kind == HTMLchunkType.OpenTag)
        {
            // Opening tag, e.g. <a href="">
            if (current.sTag == "body")
            {
                // Start generating the DOM node (as shown in the previous example link)
            }
        }
        else if (kind == HTMLchunkType.CloseTag)
        {
            // Closing tag, e.g. </a>
        }
        else if (kind == HTMLchunkType.Text)
        {
            // Plain text between tags
        }
        else if (kind == HTMLchunkType.Comment)
        {
            // HTML comment: the content between <!-- and -->
        }
    }
}

public-void-GrabBody（HTMLparser解析器）
{
//解析器将返回名为HTMLchunk的令牌——警告：在解析结束之前不要销毁它
//因为HTMLPasser重新使用了这个对象
HTMLchunk chunk=null；
//我们一直解析，直到返回的oChunk为null，这表明我们已经到达解析的末尾
while（（chunk=parser.ParseNext（））！=null）
{
开关（chunk.oType）
{
//匹配的开放标签，即
案例HTMLchunkType.CloseTag：
打破
//匹配正常文本
案例HTMLchunkType.Text：
打破
//匹配的HTML注释，这是
案例HTMLchunkType。注释：
打破
};
}
}

生成DOM节点很棘手，但正如我所说，这绝不等同于您在HTML agility pack中看到的3行程序，但是一旦您使用了这些工具，您将能够以性能成本的一小部分获得所需的内容，并且可能只需要同样多的代码行。当然，但我们需要查看您当前的scraping codeHey Joel，谢谢你花时间来帮助你。HtmlAgilityPack对我有什么帮助？我不需要先加载页面，然后解析字符串吗？agility pack可以为你加载和解析页面。我更新了我的示例。自己解析html可能是一件非常痛苦的事情，特别是如果它的格式不完美的话。agility pack是真实的我很擅长。agility pack需要先加载并解析页面，这会增加额外的开销。虽然这是一个简单而准确的解决方案，但它并不快速或高效。这一点很好。你只需要测试一下，看看它是否太慢，不符合你的需要。+1：很好。我不知道Majest 12。我必须检查一下。@Lirik:我想看一下还有，你说它更难，你能指出它有多不同吗？我看不到任何在线文档或示例。谢谢Lirik。唯一的问题是我找不到使用这个库的文档或API。你能给我指一个链接吗？如果有机会，你能用抓取t的例子更新你的答案吗身体？我很好奇它是如何工作的。@Johancho，不幸的是，文档有点稀疏，但它确实附带了一个示例项目，向您展示了基础知识是如何工作的。查看M12页面上的文件：我花了大约一天的时间才弄明白如何使用它，这是值得的（请注意，我以前没有HTML/DOM解析方面的经验）。+1用于添加一个简单快速的答案，以便快速完成工作。并非所有人都希望下载和部署框架，尤其是一次性使用。不确定为什么会否决此建议。
// Build a parser over the response bytes and hand it to GrabBody.
// NOTE(review): _response.BodyBytes is presumably the raw response body; it is not defined in this snippet.
GrabBody(ParserTools.OpenM12Parser(_response.BodyBytes));

// Creates an HTMLparser, applies a fixed set of options, initialises it with `buffer`
// and returns it. The caller shown alongside passes the result to GrabBody, which
// drives it with ParseNext().
// NOTE(review): the exact effect of each option is defined by the parser library, not
// by this snippet -- confirm against the sample project that ships with the parser.
public static HTMLparser OpenM12Parser(byte[] buffer)
{
    HTMLparser parser = new HTMLparser();
    parser.SetChunkHashMode(false);
    parser.bKeepRawHTML = false;
    parser.bDecodeEntities = true;
    parser.bDecodeMiniEntities = true;

    // NOTE(review): bDecodeEntities was set to true just above, so as written this
    // condition is false and InitMiniEntities() is not called (assuming the member is
    // a plain field or property). It only takes effect if bDecodeEntities is changed to false.
    if (!parser.bDecodeEntities && parser.bDecodeMiniEntities)
        parser.InitMiniEntities();

    parser.bAutoExtractBetweenTagsOnly = true;
    parser.bAutoKeepScripts = true;
    parser.bAutoMarkClosedTagsWithParamsAsOpen = true;
    // CleanUp() runs before Init(buffer); keep this order unless the library docs say otherwise.
    parser.CleanUp();
    parser.Init(buffer);
    return parser;
}

/// <summary>
/// Pulls chunks from <paramref name="parser"/> until it returns null and dispatches
/// on each chunk's type. Only the opening "body" tag has a placeholder for real work;
/// every other branch is intentionally empty.
/// </summary>
/// <param name="parser">An initialised parser positioned at the start of the document.</param>
public void GrabBody(HTMLparser parser)
{
    // Per the library's guidance the parser re-uses the HTMLchunk object it returns,
    // so the chunk must not be destroyed before parsing has finished.
    // A null result from ParseNext() marks the end of the input.
    for (HTMLchunk current = parser.ParseNext(); current != null; current = parser.ParseNext())
    {
        var kind = current.oType;

        if (kind == HTMLchunkType.OpenTag)
        {
            // Opening tag, e.g. <a href="">
            if (current.sTag == "body")
            {
                // Start generating the DOM node (as shown in the previous example link)
            }
        }
        else if (kind == HTMLchunkType.CloseTag)
        {
            // Closing tag, e.g. </a>
        }
        else if (kind == HTMLchunkType.Text)
        {
            // Plain text between tags
        }
        else if (kind == HTMLchunkType.Comment)
        {
            // HTML comment: the content between <!-- and -->
        }
    }
}