如何从c#中的html代码中获取html代码的一部分?

如何从c#中的html代码中获取html代码的一部分?,c#,html,C#,Html,在我的程序中,我使用了字符串变量内容。我已经为这个字符串分配了一个小的HTML程序。比如说, String content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href="mailto:support@yourcompany.com

在我的程序中,我使用了一个名为 content 的字符串变量,并把一小段 HTML 代码赋给了它。例如:

// Sample HTML document held in a string. The double quotes around the href value
// are escaped (\") so that the string literal is valid C#.
String content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
String content=“这里是您的标题,这是一个中等标题,请发送邮件至。这是一个没有段落中断的新句子。”;
我想从中得到这段文本:"This is a Medium Header Send me mail at support@yourcompany.com.This is a new sentence without a paragraph break."


这段文本位于 <BODY> 标记之内。如何使用 C# 取得这段文本?

不要使用字符串方法或正则表达式来解析 HTML。你可以改用专门的 HTML 解析库(例如 Html Agility Pack)。

完整样本 HtmlFormatHelper.cs:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Tools
{
    /// <summary>
    /// A set of utilities for converting HTML markup to plain text and for
    /// lightly encoding text so that it cannot be interpreted as markup.
    /// </summary>
    /// <remarks>
    /// The conversion is regex based and therefore only a best-effort approach;
    /// it does not build a DOM and does not understand scripts, styles or comments.
    /// </remarks>
    public static class HtmlFormatHelper
    {
        // <br>, <br/>, <br />, <p> (any letter case) together with the whitespace
        // that follows them, and the closing </p>; each occurrence becomes a line break.
        private static readonly Regex _regexLineBreak =
            new Regex(@"<(br|p)\s?/?>\s*|</p>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Any tag, including an unterminated one that runs to the end of the input.
        private static readonly Regex _regexStripFormatting =
            new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);

        // Whitespace located between two tags. A plain \s+ is used on purpose: the
        // former (\W|\n|\r)+ had overlapping alternatives (exponential backtracking on
        // long newline runs) and also swallowed punctuation placed between tags.
        private static readonly Regex _regexTagWhiteSpace =
            new Regex(@">\s+<", RegexOptions.Singleline);

        // An anchor element: group 1 is the href value, group 2 is the link text.
        private static readonly Regex _regexHyperlink =
            new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        /// <summary>
        /// Converts HTML to plain text: inter-tag whitespace is removed, line-break
        /// tags become <see cref="Environment.NewLine"/>, all remaining tags are
        /// dropped and, finally, HTML entities are decoded.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The plain text, or <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string HtmlToPlainText(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            string text = _regexTagWhiteSpace.Replace(html, "><");
            text = _regexLineBreak.Replace(text, Environment.NewLine);
            text = _regexStripFormatting.Replace(text, string.Empty);

            // Entities are decoded last. Decoding first would turn encoded text such as
            // "1 &lt; 2 and 3 &gt; 2" into something that looks like a tag and gets stripped.
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Converts HTML to plain text while keeping link targets: an anchor whose
        /// href differs from its text is rendered as "text (href)".
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The plain text, or <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string HtmlToPlainTextSmart(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            string withLinks = _regexHyperlink.Replace(html, match =>
            {
                string url = match.Groups[1].Value.Trim();
                string caption = match.Groups[2].Value.Trim();

                bool nothingToAdd = url.Length == 0
                    || string.Equals(url, caption, StringComparison.InvariantCultureIgnoreCase);

                // When the href is empty or equal to the caption the anchor is left as
                // is; the generic tag stripping below reduces it to its caption.
                return nothingToAdd
                    ? match.Value
                    : string.Format("{0} ({1})", caption, url);
            });

            return HtmlToPlainText(withLinks);
        }

        /// <summary>
        /// "Soft" HTML encoding: only the angle brackets are replaced with
        /// <c>&amp;lt;</c> and <c>&amp;gt;</c>; every other character, including
        /// ampersands and quotes, is copied unchanged.
        /// </summary>
        /// <param name="html">Text to encode; may be null.</param>
        /// <returns>The encoded text, or null when <paramref name="html"/> is null.</returns>
        public static string SoftHtmlEncode(string html)
        {
            if (html == null)
            {
                return null;
            }

            StringBuilder sb = new StringBuilder(html.Length);

            foreach (char c in html)
            {
                switch (c)
                {
                    case '<':
                        sb.Append("&lt;");
                        break;
                    case '>':
                        sb.Append("&gt;");
                        break;
                    default:
                        sb.Append(c);
                        break;
                }
            }

            return sb.ToString();
        }
    }
}
// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";

// extract html body
// Singleline lets '.' cross line breaks, so a multi-line document still matches;
// [^>]* accepts a <body> tag that carries attributes. When there is no match,
// Groups[1].Value is an empty string.
string htmlBody = Regex.Match(content, @"<body\b[^>]*>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;

// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.

// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com (mailto:support@yourcompany.com).This is a new sentence without a paragraph break.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Tools
{
    /// <summary>
    /// набор утилит для форматирования HTML текста
    /// </summary>
    public static class HtmlFormatHelper
    {
        private static Regex _regexLineBreak;
        private static Regex _regexStripFormatting;
        private static Regex _regexTagWhiteSpace;
        private static Regex _regexHyperlink;

        /// <summary>
        /// статический конструктор
        /// </summary>
        static HtmlFormatHelper()
        {
            _regexLineBreak = new Regex(@"<(br|BR|p|P)\s{0,1}\/{0,1}>\s*|</[pP]>", RegexOptions.Singleline);
            _regexStripFormatting = new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);

            _regexTagWhiteSpace = new Regex(@"(>|$)(\W|\n|\r)+<", …
            (代码片段在此处被截断)

评论:
- 永远不要使用正则表达式来匹配或解析 HTML。
- 在某些情况下,这比使用外部库更高效。
- 不,永远不要。请阅读相关文章。问题不仅在于效率更低,关键在于正则表达式在许多情况下根本无法胜任;一旦遇到下一个边缘情况,就没有人愿意去维护这一堆正则表达式。
- 我想,处理几兆字节的内容时会很吃力……
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Tools
{
    /// <summary>
    /// A set of utilities for converting HTML markup to plain text and for
    /// lightly encoding text so that it cannot be interpreted as markup.
    /// </summary>
    /// <remarks>
    /// The conversion is regex based and therefore only a best-effort approach;
    /// it does not build a DOM and does not understand scripts, styles or comments.
    /// </remarks>
    public static class HtmlFormatHelper
    {
        // <br>, <br/>, <br />, <p> (any letter case) together with the whitespace
        // that follows them, and the closing </p>; each occurrence becomes a line break.
        private static readonly Regex _regexLineBreak =
            new Regex(@"<(br|p)\s?/?>\s*|</p>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Any tag, including an unterminated one that runs to the end of the input.
        private static readonly Regex _regexStripFormatting =
            new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);

        // Whitespace located between two tags. A plain \s+ is used on purpose: the
        // former (\W|\n|\r)+ had overlapping alternatives (exponential backtracking on
        // long newline runs) and also swallowed punctuation placed between tags.
        private static readonly Regex _regexTagWhiteSpace =
            new Regex(@">\s+<", RegexOptions.Singleline);

        // An anchor element: group 1 is the href value, group 2 is the link text.
        private static readonly Regex _regexHyperlink =
            new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        /// <summary>
        /// Converts HTML to plain text: inter-tag whitespace is removed, line-break
        /// tags become <see cref="Environment.NewLine"/>, all remaining tags are
        /// dropped and, finally, HTML entities are decoded.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The plain text, or <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string HtmlToPlainText(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            string text = _regexTagWhiteSpace.Replace(html, "><");
            text = _regexLineBreak.Replace(text, Environment.NewLine);
            text = _regexStripFormatting.Replace(text, string.Empty);

            // Entities are decoded last. Decoding first would turn encoded text such as
            // "1 &lt; 2 and 3 &gt; 2" into something that looks like a tag and gets stripped.
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Converts HTML to plain text while keeping link targets: an anchor whose
        /// href differs from its text is rendered as "text (href)".
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The plain text, or <paramref name="html"/> itself when it is null or empty.
        /// </returns>
        public static string HtmlToPlainTextSmart(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            string withLinks = _regexHyperlink.Replace(html, match =>
            {
                string url = match.Groups[1].Value.Trim();
                string caption = match.Groups[2].Value.Trim();

                bool nothingToAdd = url.Length == 0
                    || string.Equals(url, caption, StringComparison.InvariantCultureIgnoreCase);

                // When the href is empty or equal to the caption the anchor is left as
                // is; the generic tag stripping below reduces it to its caption.
                return nothingToAdd
                    ? match.Value
                    : string.Format("{0} ({1})", caption, url);
            });

            return HtmlToPlainText(withLinks);
        }

        /// <summary>
        /// "Soft" HTML encoding: only the angle brackets are replaced with
        /// <c>&amp;lt;</c> and <c>&amp;gt;</c>; every other character, including
        /// ampersands and quotes, is copied unchanged.
        /// </summary>
        /// <param name="html">Text to encode; may be null.</param>
        /// <returns>The encoded text, or null when <paramref name="html"/> is null.</returns>
        public static string SoftHtmlEncode(string html)
        {
            if (html == null)
            {
                return null;
            }

            StringBuilder sb = new StringBuilder(html.Length);

            foreach (char c in html)
            {
                switch (c)
                {
                    case '<':
                        sb.Append("&lt;");
                        break;
                    case '>':
                        sb.Append("&gt;");
                        break;
                    default:
                        sb.Append(c);
                        break;
                }
            }

            return sb.ToString();
        }
    }
}
// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";

// extract html body
// Singleline lets '.' cross line breaks, so a multi-line document still matches;
// [^>]* accepts a <body> tag that carries attributes. When there is no match,
// Groups[1].Value is an empty string.
string htmlBody = Regex.Match(content, @"<body\b[^>]*>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;

// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.

// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com (mailto:support@yourcompany.com).This is a new sentence without a paragraph break.