ASP.NET正在尝试从字符串中删除html
所以我有一个cms,用户可以通过cuteeditor输入内容,它工作正常,然后在我的网站上显示这些数据。一件很少发生但令人恼火的事情是,用户在文本中输入某些标记,这使得字体看起来与页面上的其他字体不同,即ASP.NET正在尝试从字符串中删除html,asp.net,Asp.net,所以我有一个cms,用户可以通过cuteeditor输入内容,它工作正常,然后在我的网站上显示这些数据。一件很少发生但令人恼火的事情是,用户在文本中输入某些标记,这使得字体看起来与页面上的其他字体不同,即 <span style="font-size: 11pt">Special Olympics Ireland provides year round sports training and athletic competition in a variety of
<span style="font-size: 11pt">Special Olympics Ireland provides year round sports training and athletic competition in a variety of Olympic type sports for persons with intellectual disabilities in </span><span style="font-size: 11pt">Ireland</span><span style="font-size: 11pt"> and </span><span style="font-size: 11pt">Northern Ireland</span><span style="font-size: 11pt"> in accordance with and furtherance of the mission, goal and founding principles of the international Special Olympics movement.</span>
Special Olympics Ireland根据国际特殊奥林匹克运动的使命、目标和创立原则,为爱尔兰和北爱尔兰的智障人士提供全年的各种奥林匹克运动项目的体育训练和竞技比赛。
基本上我想做的是
String.Replace("<span style="font-size: 11pt">","")
String.Replace(“,”)
当然,这只会在下次使用8、9或10号字体时捕捉到上述情况,所以过滤方法必须像那样智能
有什么想法吗
所以现在我有点像
testSpan=Regex.Replace(testSpan,@“\s]+)?)+\s*\s*)/?>,String.Empty)
但是它基本上去除了所有的html我只想去除标记你真的应该用a来处理这类事情。我在VB.NET中使用了一个函数来去除字符串中的html:
''' <summary>
''' Removes everything that looks like a markup tag (a "&lt;", any characters
''' including line breaks, and the next "&gt;") from the given string.
''' </summary>
''' <param name="htmlString">Text that may contain HTML tags. May be Nothing or empty.</param>
''' <returns>
''' The input with every tag replaced by an empty string; Nothing or an empty
''' string is returned unchanged.
''' </returns>
Public Shared Function StripHTML(ByVal htmlString As String) As String
    ' Regex.Replace throws ArgumentNullException for Nothing; there is nothing to strip anyway.
    If String.IsNullOrEmpty(htmlString) Then Return htmlString

    ' "<[^>]*>" matches exactly what the former "<(.|\n)*?>" matched (from "<" to the
    ' first ">" across line breaks) without an alternation inside a lazy quantifier.
    Dim pattern As String = "<[^>]*>"
    Return Regex.Replace(htmlString, pattern, String.Empty)
End Function
公共共享函数StripHTML(ByVal htmlString作为字符串)作为字符串
将图案变暗为字符串=“”
返回Regex.Replace(htmlString、pattern、String.Empty)
端函数
希望它能帮助你如果你想学习StackOverflow的例子,你可以创建一个允许的HTML标签的白名单,并去掉其余的标签 以下是Jeff Atwood用于清理和平衡StackOverflow用户生成内容中HTML标记的代码片段
- 消毒
- 平衡
- 允许的标签列表
/// <summary>
/// Static helpers for processing strings that contain HTML: whitelist-based
/// sanitizing (<see cref="SanitizeHtml"/>) and removal of unbalanced tags
/// (<see cref="BalanceTags"/>).
/// </summary>
public static class HtmlStripper
{
    #region Sanitize

    // Anything that looks like a tag: "<", any run of non-">" characters, then ">" or end of input
    // (so an unterminated "<..." at the very end is also treated as a tag).
    private static readonly Regex Tags = new Regex("<[^>]*(>|$)",
        RegexOptions.Singleline | RegexOptions.ExplicitCapture |
        RegexOptions.Compiled);

    // Attribute-less formatting tags (opening or closing) plus <br> / <hr> in plain or self-closed form.
    private static readonly Regex Whitelist =
        new Regex(
            @"
            ^</?(b(lockquote)?|code|d(d|t|l|el)|em|h(1|2|3)|i|kbd|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)>$|
            ^<(b|h)r\s?/?>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
            RegexOptions.IgnorePatternWhitespace);

    // Anchors: href must be "#<digits>" or an http/https/ftp URL; optional title and target, in that order.
    private static readonly Regex WhitelistA =
        new Regex(
            @"
            ^<a\s
            href=""(\#\d+|(https?|ftp)://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+)""
            (\stitle=""[^""<>]+"")?(\starget=""[^""<>]+"")?\s?>$|
            ^</a>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
            RegexOptions.IgnorePatternWhitespace);

    // Images: src must be an http/https URL; optional width, height, alt and title, in that order.
    private static readonly Regex WhitelistImg =
        new Regex(
            @"
            ^<img\s
            src=""https?://[-a-z0-9+&@#/%?=~_|!:,.;\(\)]+""
            (\swidth=""\d{1,3}"")?
            (\sheight=""\d{1,3}"")?
            (\salt=""[^""<>]*"")?
            (\stitle=""[^""<>]*"")?
            \s?/?>$",
            RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled |
            RegexOptions.IgnorePatternWhitespace);

    /// <summary>
    /// Sanitizes the provided raw HTML using a whitelist-based approach: every tag that
    /// does not match one of the whitelist patterns is removed, leaving the "safe" tags
    /// and all text between tags in place.
    /// CODESNIPPET:4100A61A-1711-4366-B0B0-144D1179A937
    /// </summary>
    /// <remarks>
    /// Based on Jeff Atwood's code, found at http://refactormycode.com/codes/333-sanitize-html
    /// Since Jeff Atwood is StackOverflow's administrator, this is most likely the code used by
    /// that site. See http://meta.stackoverflow.com/questions/1777/what-html-tags-are-allowed
    /// for a list of allowed tags.
    /// </remarks>
    /// <param name="html">Raw HTML; may be null or empty.</param>
    /// <returns>
    /// The input with all non-whitelisted tags removed; null or empty input is returned unchanged.
    /// </returns>
    public static string SanitizeHtml(string html)
    {
        if (String.IsNullOrEmpty(html)) return html;

        // match every HTML tag in the input
        MatchCollection tags = Tags.Matches(html);

        // walk backwards so that removing a tag never shifts the index of a tag still to be checked
        for (int i = tags.Count - 1; i > -1; i--)
        {
            Match tag = tags[i];

            // the whitelist patterns are written in lower case, so compare a lower-cased copy
            string loweredTag = tag.Value.ToLowerInvariant();

            if (!(Whitelist.IsMatch(loweredTag) || WhitelistA.IsMatch(loweredTag) || WhitelistImg.IsMatch(loweredTag)))
            {
                html = html.Remove(tag.Index, tag.Length);
            }
        }

        return html;
    }

    #endregion

    #region Balance tags

    // An opening or closing tag with its name captured as "tagname"; the match ends at ">",
    // at whitespace, or at the end of the input.
    private static readonly Regex Namedtags = new Regex
        (@"</?(?<tagname>\w+)[^>]*(\s|$|>)",
         RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// Attempts to balance the HTML tags in the string by removing any unmatched
    /// opening or closing tags.
    /// IMPORTANT: we *assume* HTML has *already* been sanitized and is safe/sane
    /// before balancing!
    ///
    /// CODESNIPPET: A8591DBA-D1D3-11DE-947C-BA5556D89593
    /// </summary>
    /// <remarks>
    /// From Jeff Atwood's post at
    /// http://refactormycode.com/codes/360-balance-html-tags
    /// Tags named p, img, br, li and hr are never paired or removed.
    /// </remarks>
    /// <param name="html">Sanitized HTML; may be null or empty.</param>
    /// <returns>
    /// The input with orphaned tags removed; null, empty or tag-free input is returned unchanged.
    /// </returns>
    public static string BalanceTags(string html)
    {
        if (String.IsNullOrEmpty(html)) return html;

        // match against a lower-cased copy; this makes our case insensitive comparisons
        // easier while the match positions are applied to the original string below
        MatchCollection tags = Namedtags.Matches(html.ToLowerInvariant());

        // no HTML tags present? nothing to do; exit now
        int tagcount = tags.Count;
        if (tagcount == 0) return html;

        const string ignoredtags = "<p><img><br><li><hr>";
        var tagpaired = new bool[tagcount];
        var tagremove = new bool[tagcount];

        // loop through matched tags in forward order
        for (int ctag = 0; ctag < tagcount; ctag++)
        {
            string tagname = tags[ctag].Groups["tagname"].Value;

            // skip any already paired tags
            // and skip tags in our ignore list; assume they're self-closed
            if (tagpaired[ctag] || ignoredtags.Contains("<" + tagname + ">")) continue;

            string tag = tags[ctag].Value;
            int match = -1;

            // markup is compared ordinally throughout: tag names are not linguistic text
            if (tag.StartsWith("</", StringComparison.Ordinal))
            {
                // this is a closing tag
                // search backwards (previous tags), look for opening tags
                for (int ptag = ctag - 1; ptag >= 0; ptag--)
                {
                    string prevtag = tags[ptag].Value;

                    // cheap prefix pre-filter; this must be a prefix test (not an equality
                    // test against the bare prefix), otherwise the exact check below can
                    // never be reached
                    if (!tagpaired[ptag] && prevtag.StartsWith("<" + tagname, StringComparison.Ordinal))
                    {
                        // minor optimization; we do a simple possibly incorrect match above
                        // the start tag must be <tag> or <tag{space} to match
                        if (prevtag.StartsWith("<" + tagname + ">", StringComparison.Ordinal) ||
                            prevtag.StartsWith("<" + tagname + " ", StringComparison.Ordinal))
                        {
                            match = ptag;
                            break;
                        }
                    }
                }
            }
            else
            {
                // this is an opening tag
                // search forwards (next tags), look for closing tags
                for (int ntag = ctag + 1; ntag < tagcount; ntag++)
                {
                    if (!tagpaired[ntag] &&
                        tags[ntag].Value.Equals("</" + tagname + ">", StringComparison.Ordinal))
                    {
                        match = ntag;
                        break;
                    }
                }
            }

            // we tried, regardless, if we got this far
            tagpaired[ctag] = true;
            if (match == -1) tagremove[ctag] = true; // mark for removal
            else tagpaired[match] = true; // mark paired
        }

        // loop through tags again, this time in reverse order
        // so we can safely delete all orphaned tags from the string
        for (int ctag = tagcount - 1; ctag >= 0; ctag--)
        {
            if (tagremove[ctag])
            {
                html = html.Remove(tags[ctag].Index, tags[ctag].Length);
            }
        }

        return html;
    }

    #endregion
}
//
///提供一些静态扩展方法,用于处理包含HTML的字符串。
///
公共静态类HtmlStripper
{
#区域消毒
私有静态只读正则表达式标记=新正则表达式(“]*(>|$)”,
RegexOptions.Singleline | RegexOptions.ExplicitCapture|
RegexOptions.Compiled);
私有静态只读Regex白名单=
新正则表达式(
@"
^$|
^$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled|
RegexOptions.ignorepattern(空格);
私有静态只读正则表达式WhitelistA=
新正则表达式(
@"
^$",
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled|
RegexOptions.ignorepattern(空格);
私有静态只读正则表达式WhitelistImg=
新正则表达式(
@"
^]*"")?
(\stitle=”“[^”“]*”“)是否可以?
\s?/?>$”,
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled|
RegexOptions.ignorepattern(空格);
///
///使用以下命令从提供的原始HTML输入中清除任何潜在危险的标记:
///基于白名单的方法,保留“安全”HTML标记
///代码段:4100A61A-1711-4366-B0B0-144D1179A937
///
///
///根据Jeff Atwood的代码,在http://refactormycode.com/codes/333-sanitize-html
///由于Jeff Atwood是StackOverflow的管理员,这很可能是
///那个网站,看到了吗http://meta.stackoverflow.com/questions/1777/what-html-tags-are-allowed
///查看允许的标记列表。
///
公共静态字符串SanitizeHtml(字符串html)
{
if(String.IsNullOrEmpty(html))返回html;
//匹配输入中的每个HTML标记
MatchCollection标记=tags.Matches(html);
对于(inti=tags.Count-1;i>-1;i--)
{
匹配标签=标签[i];
字符串标记名=tag.Value.ToLowerInvariant();
if(!(白名单.IsMatch(标记名)| |白名单塔.IsMatch(标记名)| |白名单img.IsMatch(标记名)))
{
html=html.Remove(tag.Index,tag.Length);
}
}
返回html;
}
#端区
#区域平衡标签
私有静态只读正则表达式Namedtags=新正则表达式
(@“]*(\s |$|>)”,
RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);
///
///尝试在HTML字符串中平衡HTML标记
///通过删除任何不匹配的开始或结束标记
///重要提示:我们*假设*HTML*已经*被使用
///在进行平衡之前,已消毒且安全/神智正常!
///
///代码片段:A8591DBA-D1D3-11DE-947C-BA5556D89593
///
///
///来自杰夫·阿特伍德在
/// http://refactormycode.com/codes/360-balance-html-tags
///
公共静态字符串平衡标记(字符串html)
{
if(String.IsNullOrEmpty(html))返回html;
//将所有内容转换为小写;这会使
//我们不区分大小写的比较更容易
MatchCollection tags=Namedtags.Matches(html.ToLowerInvariant());
//不存在HTML标记?无需执行任何操作;立即退出
int tagcount=tags.Count;
如果(tagcount==0)返回html;
常量字符串ignoredtags=“
”;
var tagpaired=新bool[tagcount];
var tagremove=新bool[tagcount];
//按正向顺序循环匹配的标记
对于(int-ctag=0;ctag 如果(tag.StartsWith(“)用于此特定情况,您可以执行如下操作
// Sample editor output: a sequence of sibling <span> elements carrying inline font styling.
String input = @"<span style=""font-size: 11pt"">Special Olympics Ireland provides year round sports training and athletic competition in a variety of Olympic type sports for persons with intellectual disabilities in </span><span style=""font-size: 11pt"">Ireland</span><span style=""font-size: 11pt""> and </span><span style=""font-size: 11pt"">Northern Ireland</span><span style=""font-size: 11pt""> in accordance with and furtherance of the mission, goal and founding principles of the international Special Olympics movement.</span>";
// XElement.Parse requires exactly one root element, so the sibling spans are wrapped in a
// synthetic <root>; without it Parse throws XmlException ("multiple root elements").
// "&nbsp;" is not a predefined XML entity and would also make Parse throw, so it is replaced
// with a plain space first.
// NOTE(review): the first Replace argument is assumed to have been "&nbsp;" before the page
// extraction decoded it; confirm against the original answer.
var element = XElement.Parse("<root>" + input.Replace("&nbsp;", " ") + "</root>");
// Value concatenates the text content of the element and its descendants, i.e. the markup-free text.
string stripped = element.Value;
String input=@“爱尔兰特奥会在弗吉尼亚州提供全年的运动训练和竞技比赛