如何从c#中的html代码中获取html代码的一部分?
在我的程序中,我使用了字符串变量内容。我已经为这个字符串分配了一个小的HTML程序。比如说,如何从c#中的html代码中获取html代码的一部分?,c#,html,C#,Html,在我的程序中,我使用了字符串变量内容。我已经为这个字符串分配了一个小的HTML程序。比如说, String content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href="mailto:support@yourcompany.com
String content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href="mailto:support@yourcompany.com">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
String content=“这里是您的标题,这是一个中等标题,请发送邮件至。这是一个没有段落中断的新句子。”;
从这里我想得到:"This is a Medium Header Send me mail at support@yourcompany.com.This is a new sentence without a paragraph break."
此字符串位于 <H2> 标记中。如何使用 C# 获取此字符串?
不要使用字符串方法或正则表达式来解析 HTML。你可以使用专门的 HTML 解析库。
完整示例 HtmlFormatHelper.cs:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Tools
{
    /// <summary>
    /// A set of utilities for converting HTML markup to plain text and for
    /// lightly encoding text for display inside HTML.
    /// </summary>
    public static class HtmlFormatHelper
    {
        private static readonly Regex _regexLineBreak;
        private static readonly Regex _regexStripFormatting;
        private static readonly Regex _regexTagWhiteSpace;
        private static readonly Regex _regexHyperlink;

        /// <summary>
        /// Static constructor: builds the regular expressions once for the whole type.
        /// </summary>
        static HtmlFormatHelper()
        {
            // <br>, <br/>, <p>, <p/> (plus trailing whitespace) and </p> become line breaks
            _regexLineBreak = new Regex(@"<(br|BR|p|P)\s{0,1}\/{0,1}>\s*|</[pP]>", RegexOptions.Singleline);
            // any tag, including an unterminated one that runs to the end of the input
            _regexStripFormatting = new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);
            // runs of non-word characters sitting between two tags
            _regexTagWhiteSpace = new Regex(@"(>|$)(\W|\n|\r)+<", RegexOptions.Singleline);
            // <a ... href="url" ...>text</a>; group 1 = url, group 2 = link text
            _regexHyperlink = new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Converts HTML to plain text.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The text with tags removed, paragraph/line-break tags replaced by
        /// <see cref="Environment.NewLine"/> and character entities decoded.
        /// Null or empty input is returned unchanged.
        /// </returns>
        public static string HtmlToPlainText(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            var text = _regexTagWhiteSpace.Replace(html, "><");
            text = _regexLineBreak.Replace(text, Environment.NewLine);
            text = _regexStripFormatting.Replace(text, string.Empty);

            // Decode entities only after the tags are gone: decoding first would turn
            // escaped text such as "1 &lt; 2" into pseudo-markup that the tag-stripping
            // regex would then delete.
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Converts HTML to plain text with "smart" link formatting: a hyperlink whose
        /// text differs from its target is rendered as <c>text (url)</c>.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>The plain text; null or empty input is returned unchanged.</returns>
        public static string HtmlToPlainTextSmart(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // process hyperlinks
            html = _regexHyperlink.Replace(html, e =>
            {
                string url = e.Groups[1].Value.Trim();
                string text = e.Groups[2].Value.Trim();
                if (url.Length == 0 || string.Equals(url, text, StringComparison.InvariantCultureIgnoreCase))
                {
                    // url is missing or identical to the text: keep the anchor as is,
                    // the tag itself is stripped later by HtmlToPlainText
                    return e.Value;
                }

                // url and text differ: show both
                return string.Format("{0} ({1})", text, url);
            });
            return HtmlToPlainText(html);
        }

        /// <summary>
        /// Encodes HTML in "soft" mode: only the angle brackets are replaced by
        /// their character entities, every other character is copied unchanged.
        /// </summary>
        /// <param name="html">Text to encode; may be null.</param>
        /// <returns>The encoded text, or null when <paramref name="html"/> is null.</returns>
        public static string SoftHtmlEncode(string html)
        {
            if (html == null)
            {
                return null;
            }

            StringBuilder sb = new StringBuilder(html.Length);
            foreach (char c in html)
            {
                if (c == '<')
                {
                    sb.Append("&lt;");
                }
                else if (c == '>')
                {
                    sb.Append("&gt;");
                }
                else
                {
                    sb.Append(c);
                }
            }
            return sb.ToString();
        }
    }
}
// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
// extract the inner HTML of the <body> element:
//  - Singleline lets '.' match line breaks, so multi-line documents are handled
//  - [^>]* tolerates attributes on the opening tag (e.g. <body class="x">)
// when no <body> element is found, htmlBody is an empty string
string htmlBody = Regex.Match(content, @"<body\b[^>]*>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;
// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.
// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com (mailto:support@yourcompany.com).This is a new sentence without a paragraph break.
使用系统;
使用System.Collections.Generic;
使用System.Linq;
使用系统文本;
使用System.Text.RegularExpressions;
名称空间工具
{
///
///аааааааааааааааааа107
///
公共静态类HtmlFormatHelper
{
私有静态正则表达式(regexLineBreak);
私有静态正则表达式(regexStripFormatting);
私有静态regexu regexTagWhiteSpace;
私有静态Regex_regexHyperlink;
///
/// статический конструктор
///
静态HtmlFormatHelper()
{
_regexLineBreak=newregex(@“\s*|”,RegexOptions.Singleline);
_regexStripFormatting=newregex(@“]*(>|$)”,RegexOptions.Singleline);
_regexTagWhiteSpace=newregex(@“(>|$)(\W| \n| \r)+
永远不要使用正则表达式进行 HTML 匹配或解析。
在某些情况下,它比使用外部库更高效。
不,永远不要。请阅读相关文章。它不仅效率更低,关键是正则表达式在许多情况下根本无法正确处理 HTML。一旦遇到下一个边缘情况,没有人愿意去维护那个正则表达式库。
我想这是一个艰难的时刻兆字节
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace Tools
{
    /// <summary>
    /// A set of utilities for converting HTML markup to plain text and for
    /// lightly encoding text for display inside HTML.
    /// </summary>
    public static class HtmlFormatHelper
    {
        private static readonly Regex _regexLineBreak;
        private static readonly Regex _regexStripFormatting;
        private static readonly Regex _regexTagWhiteSpace;
        private static readonly Regex _regexHyperlink;

        /// <summary>
        /// Static constructor: builds the regular expressions once for the whole type.
        /// </summary>
        static HtmlFormatHelper()
        {
            // <br>, <br/>, <p>, <p/> (plus trailing whitespace) and </p> become line breaks
            _regexLineBreak = new Regex(@"<(br|BR|p|P)\s{0,1}\/{0,1}>\s*|</[pP]>", RegexOptions.Singleline);
            // any tag, including an unterminated one that runs to the end of the input
            _regexStripFormatting = new Regex(@"<[^>]*(>|$)", RegexOptions.Singleline);
            // runs of non-word characters sitting between two tags
            _regexTagWhiteSpace = new Regex(@"(>|$)(\W|\n|\r)+<", RegexOptions.Singleline);
            // <a ... href="url" ...>text</a>; group 1 = url, group 2 = link text
            _regexHyperlink = new Regex(@"<a\s+[^>]*href\s*=\s*[""']?([^""'>]+)[""']?[^>]*>([^<]+)</a>", RegexOptions.Singleline | RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Converts HTML to plain text.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>
        /// The text with tags removed, paragraph/line-break tags replaced by
        /// <see cref="Environment.NewLine"/> and character entities decoded.
        /// Null or empty input is returned unchanged.
        /// </returns>
        public static string HtmlToPlainText(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            var text = _regexTagWhiteSpace.Replace(html, "><");
            text = _regexLineBreak.Replace(text, Environment.NewLine);
            text = _regexStripFormatting.Replace(text, string.Empty);

            // Decode entities only after the tags are gone: decoding first would turn
            // escaped text such as "1 &lt; 2" into pseudo-markup that the tag-stripping
            // regex would then delete.
            return System.Net.WebUtility.HtmlDecode(text);
        }

        /// <summary>
        /// Converts HTML to plain text with "smart" link formatting: a hyperlink whose
        /// text differs from its target is rendered as <c>text (url)</c>.
        /// </summary>
        /// <param name="html">HTML markup; may be null or empty.</param>
        /// <returns>The plain text; null or empty input is returned unchanged.</returns>
        public static string HtmlToPlainTextSmart(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return html;
            }

            // process hyperlinks
            html = _regexHyperlink.Replace(html, e =>
            {
                string url = e.Groups[1].Value.Trim();
                string text = e.Groups[2].Value.Trim();
                if (url.Length == 0 || string.Equals(url, text, StringComparison.InvariantCultureIgnoreCase))
                {
                    // url is missing or identical to the text: keep the anchor as is,
                    // the tag itself is stripped later by HtmlToPlainText
                    return e.Value;
                }

                // url and text differ: show both
                return string.Format("{0} ({1})", text, url);
            });
            return HtmlToPlainText(html);
        }

        /// <summary>
        /// Encodes HTML in "soft" mode: only the angle brackets are replaced by
        /// their character entities, every other character is copied unchanged.
        /// </summary>
        /// <param name="html">Text to encode; may be null.</param>
        /// <returns>The encoded text, or null when <paramref name="html"/> is null.</returns>
        public static string SoftHtmlEncode(string html)
        {
            if (html == null)
            {
                return null;
            }

            StringBuilder sb = new StringBuilder(html.Length);
            foreach (char c in html)
            {
                if (c == '<')
                {
                    sb.Append("&lt;");
                }
                else if (c == '>')
                {
                    sb.Append("&gt;");
                }
                else
                {
                    sb.Append(c);
                }
            }
            return sb.ToString();
        }
    }
}
// input string
string content = "<HTML> <HEAD> <TITLE>Your Title Here</TITLE></HEAD> <BODY><H2>This is a Medium Header Send me mail at<a href=\"mailto:support@yourcompany.com\">support@yourcompany.com</a>.This is a new sentence without a paragraph break.</H2></BODY></HTML>";
// extract the inner HTML of the <body> element:
//  - Singleline lets '.' match line breaks, so multi-line documents are handled
//  - [^>]* tolerates attributes on the opening tag (e.g. <body class="x">)
// when no <body> element is found, htmlBody is an empty string
string htmlBody = Regex.Match(content, @"<body\b[^>]*>(.*)</body>", RegexOptions.IgnoreCase | RegexOptions.Singleline).Groups[1].Value;
// plain text
string plainText = Tools.HtmlFormatHelper.HtmlToPlainText(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com.This is a new sentence without a paragraph break.
// plain text (with url in brackets)
string plainTextSmart = Tools.HtmlFormatHelper.HtmlToPlainTextSmart(htmlBody);
//: This is a Medium Header Send me mail atsupport@yourcompany.com (mailto:support@yourcompany.com).This is a new sentence without a paragraph break.