C# 清理AntiXSS v3输出中的html编码文本（#十进制表示法）_C#_Html_Regex_Xss

C# 清理AntiXSS v3输出中的html编码文本（#十进制表示法）

c# html regex

C# 清理AntiXSS v3输出中的html编码文本（#十进制表示法）,c#,html,regex,xss,C#,Html,Regex,Xss,我打算在博客引擎XSS safe中发表评论。尝试了很多不同的方法，但发现非常困难当我显示注释时，我首先使用html对整个内容进行编码。然后，我尝试使用白名单方法对安全标记进行html解码在重构代码的Atwood的“清理HTML”线程中一直在研究我的问题是AntiXss库将值编码为&#DECIMAL；我不知道如何重写Steve的例子，因为我的正则表达式知识有限我尝试了以下代码，我只是将实体替换为十进制形式，但它不能正常工作 < with < >

我打算在博客引擎XSS safe中发表评论。尝试了很多不同的方法，但发现非常困难

当我显示注释时，我首先使用html对整个内容进行编码。然后，我尝试使用白名单方法对安全标记进行html解码

我一直在研究 Atwood 在 RefactorMyCode（“重构我的代码”）网站上的“清理HTML”（Sanitize HTML）讨论帖。

我的问题是 AntiXSS 库会把字符编码为 &#DECIMAL; 形式（十进制数字实体），而我的正则表达式知识有限，不知道该如何改写 Steve 的例子。

我尝试了以下代码，我只是将实体替换为十进制形式，但它不能正常工作

&lt; with &#60;
&gt; with &#62;

我的重写：

class HtmlSanitizer
{
    /// <summary>
    /// Matches anything that looks like an HTML tag after the input has been encoded to
    /// decimal entities: a span that starts with <c>&amp;#60;</c> (not immediately followed
    /// by <c>&amp;#62;</c>) and runs, lazily, to the next <c>&amp;#62;</c> or to the end of the input.
    /// Used to split the input into discrete tag-like chunks.
    /// </summary>
    private static Regex _tags = new Regex("&#60;(?!&#62;).+?(&#62;|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);


    /// <summary>
    /// Intended to match only the tags on the whitelist, so that they can be run through
    /// HttpUtility.HtmlDecode.
    /// FIXME - Could be improved, since this might decode entities in the middle of
    /// an a/link tag (i.e. in the text in between the opening and closing tag)
    /// NOTE(review): this pattern is compiled with RegexOptions.IgnorePatternWhitespace.
    /// In that mode an unescaped '#' starts a comment that runs to the end of the pattern
    /// line, so every alternative below is cut off right after "^&amp;" and the whitelist
    /// accepts far more than intended. The '#' characters need to be escaped as "\#".
    /// NOTE(review): the closing-tag slash is matched as a literal '/'; presumably the
    /// encoder emits it as an entity instead - confirm against the encoder's output.
    /// </summary>
    private static Regex _whitelist = new Regex(@"
^&#60;/?(a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&#62;$
|^&#60;(b|h)r\s?/?&#62;$
|^&#60;a(?!&#62;).+?&#62;$
|^&#60;img(?!&#62;).+?/?&#62;$",


      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
      RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// HtmlDecodes every tag-like chunk of the entity-encoded input that matches the
    /// whitelist pattern and leaves all other chunks encoded.
    /// </summary>
    /// <param name="html">The entity-encoded HTML text to process.</param>
    /// <returns>The input with the whitelisted chunks decoded in place.</returns>
    public static string Sanitize(string html)
    {

        string tagname = "";
        Match tag;
        MatchCollection tags = _tags.Matches(html);
        string safeHtml = "";

        // Walk the matches from last to first so that replacing one chunk does not
        // shift the Index of the chunks that are still to be processed.
        for (int i = tags.Count - 1; i > -1; i--)
        {
            tag = tags[i];
            // The whitelist pattern is written in lower case, so compare a lower-cased copy.
            tagname = tag.Value.ToLowerInvariant();

            if (_whitelist.IsMatch(tagname))
            {
                // The chunk is accepted by the whitelist pattern: decode the original
                // (not lower-cased) text and splice it back in at the same position.
                safeHtml = HttpUtility.HtmlDecode(tag.Value);
                html = html.Remove(tag.Index, tag.Length);
                html = html.Insert(tag.Index, safeHtml);
            }

        }

        return html;
    }

}

当我运行上面的Sanitize（字符串html）版本时，它会给我：

<p><script language="javascript">alert&#40;&#39;XSS&#39;&#41;</script><b>bold should work</b></p>

警报('XSS&39)大胆应该行得通

白名单正则表达式把 script 标记也匹配上了，这不是我想要的。如果有任何帮助，我将不胜感激。

您是否考虑过使用Markdown或VBCode或类似的方法让用户标记他们的评论？然后您可以禁止所有HTML

如果您必须允许 HTML，那么我会考虑使用 HTML 解析器（类似 HTML Tidy 的思路），并在解析结果上执行白名单过滤。

是的，我正在使用支持 Markdown 的 WMD 编辑器，但我希望用户能够像在 Stack Overflow 上那样发布 HTML 和代码示例，所以我不想完全禁止 HTML。

我一直在看，但还没有试过。但是，我正在使用来确保HTML是正确的（没有孤立标记）。这是在我运行Antix之前完成的

如果我不能让我当前的解决方案按我喜欢的方式工作，我会尝试HTML Tidy，谢谢你的建议。

我用的是 Mac 电脑，所以无法测试你的 C# 代码。但在我看来，应该让 _whitelist 正则表达式只处理标记名。这可能意味着您必须进行两次传递，一次用于开始标记，另一次用于结束标记。但是这会使它变得更简单。

您的问题在于 C# 误解了您的正则表达式。启用 RegexOptions.IgnorePatternWhitespace 后，未转义的 # 会被当作注释的开头，该行其余部分都会被忽略。您需要对 # 号进行转义（写成 \#）；如果不转义，它匹配的内容就太多了。

// Whitelist of entity-encoded tags that may be decoded back to real HTML.
// The pattern is compiled with IgnorePatternWhitespace, so:
//   * every '#' is escaped as "\#" (an unescaped '#' would start a comment), and
//   * the indentation and the space after "(&\#47;)?" are ignored by the engine.
// Alternatives, in order: plain open/close tags from the list, <br>/<hr>,
// anchors with attributes, images with attributes. "&\#47;" is the encoded '/'.
// NOTE(review): the anchor and img alternatives have no tag-name boundary, so they
// also accept any tag whose name merely starts with "a" or "img".
private static Regex _whitelist = new Regex(@"
    ^&\#60;(&\#47;)? (a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&\#62;$
    |^&\#60;(b|h)r\s?(&\#47;)?&\#62;$
    |^&\#60;a(?!&\#62;).+?&\#62;$
    |^&\#60;img(?!&\#62;).+?(&\#47;)?&\#62;$",

    // The options must be OR-ed together; a missing '|' here is a compile error.
    RegexOptions.Singleline |
    RegexOptions.IgnorePatternWhitespace |
    RegexOptions.ExplicitCapture |
    RegexOptions.Compiled
 );

更新2：

你可能会对这个网站感兴趣。

如果有人对使用这个网站感兴趣，我会在这里再次发布完整的代码（稍微经过重构并添加更新的注释）

我还决定从白名单中删除img标签，因为@Pez和@some指出允许这样做可能会很危险

还必须指出，我还没有针对可能的XSS攻击对其进行适当的测试。对于我来说，这只是一个说明点，说明这个方法的效果如何

/// <summary>
/// Re-enables a small whitelist of HTML tags in text that has already been encoded to
/// decimal entity notation (e.g. by Microsoft AntiXSS 3.0); everything else stays encoded.
/// </summary>
class HtmlSanitizer
{
    /// <summary>
    /// Matches anything that looks like an HTML tag after encoding to decimal entities:
    /// a chunk that starts with <c>&amp;#60;</c> (not immediately followed by <c>&amp;#62;</c>)
    /// and runs, lazily, to the next <c>&amp;#62;</c> or to the end of the input.
    /// </summary>
    private static readonly Regex _tags = new Regex(@"&\#60;(?!&\#62;).+?(&\#62;|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// Matches the encoded tags that are allowed to be decoded. Alternatives, in order:
    /// plain opening/closing tags from the list, br/hr, and anchors carrying attributes.
    /// '#' is escaped everywhere because IgnorePatternWhitespace treats an unescaped '#'
    /// as the start of a comment. <c>&amp;#47;</c> is the encoded '/'.
    /// The anchor alternative requires whitespace right after the "a" so that other tags
    /// whose names merely start with "a" (applet, audio, area, ...) are not whitelisted.
    /// FIXME - attribute values of an accepted anchor are decoded unchecked, so event
    /// handlers and script URLs inside the tag are not filtered by this class.
    /// </summary>
    private static readonly Regex _whitelist = new Regex(@"
^&\#60;(&\#47;)?(a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&\#62;$
|^&\#60;(b|h)r\s?(&\#47;)?&\#62;$
|^&\#60;a\s.+?&\#62;$",
      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
      RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// HtmlDecodes every whitelisted tag in the entity-encoded input and leaves all
    /// other tag-like chunks encoded.
    /// </summary>
    /// <param name="html">Entity-encoded HTML text; may be null or empty.</param>
    /// <returns>
    /// The input with whitelisted tags decoded. Null or empty input is returned unchanged.
    /// </returns>
    public static string Sanitize(string html)
    {
        // Nothing to scan; also avoids the ArgumentNullException Regex would throw on null.
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        // Regex.Replace rebuilds the string in a single pass, so no manual index
        // bookkeeping (Remove/Insert from the back) is needed.
        return _tags.Replace(html, new MatchEvaluator(DecodeIfWhitelisted));
    }

    /// <summary>
    /// Returns the decoded text of a tag-like chunk when it is on the whitelist,
    /// otherwise the chunk exactly as it appeared in the input.
    /// </summary>
    private static string DecodeIfWhitelisted(Match tag)
    {
        // The whitelist is written in lower case; compare a lower-cased copy but
        // decode the original text so the author's casing is preserved.
        string candidate = tag.Value.ToLowerInvariant();
        return _whitelist.IsMatch(candidate)
            ? HttpUtility.HtmlDecode(tag.Value)
            : tag.Value;
    }
}

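class HtmlSanitizer
{
    /// <summary>
    /// A regex that matches things that look like a HTML tag after HtmlEncoding to &#DECIMAL; notation. Microsoft AntiXSS 3.0 can be used to preform this. Splits the input so we can get discrete
    /// chunks that start with &#60; and ends with either end of line or &#62;
    /// </summary>
    private static readonly Regex _tags = new Regex(@"&\#60;(?!&\#62;).+?(&\#62;|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// A regex that will match tags on the whitelist, so we can run them through 
    /// HttpUtility.HtmlDecode
    /// FIXME - Could be improved, since this might decode &#60; etc in the middle of
    /// an a/link tag (i.e. in the text in between the opening and closing tag)
    /// </summary>
    private static readonly Regex _whitelist = new Regex(@"
^&\#60;(&\#47;)? (a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&\#62;$
|^&\#60;(b|h)r\s?(&\#47;)?&\#62;$
|^&\#60;a(?!&\#62;).+?&\#62;$",
      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
      RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// HtmlDecode any potentially safe HTML tags from the provided HtmlEncoded HTML input using 
    /// a whitelist based approach, leaving the dangerous tags Encoded HTML tags
    /// </summary>
    public static string Sanitize(string html)
    {
        Match tag;
        MatchCollection tags = _tags.Matches(html);

        // iterate through all HTML tags in the input
        for (int i = tags.Count - 1; i > -1; i--)
        {
            tag = tags[i];
            string tagname = tag.Value.ToLowerInvariant();

            if (_whitelist.IsMatch(tagname))
            {
                // If we find a tag on the whitelist, run it through 
                // HtmlDecode, and re-insert it into the text
                string safeHtml = HttpUtility.HtmlDecode(tag.Value);
                html = html.Remove(tag.Index, tag.Length);
                html = html.Insert(tag.Index, safeHtml);
            }
        }
        return html;
    }
}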

刚刚想起来：在过去的24小时里，我浏览了所有这些链接。真不敢相信事情会这么复杂。正如他们在CSRF文章的评论中所引述的，“默认情况下Web开发很可怕”，这是非常正确的。当心白名单中的 img 标签：onerror 属性可用于插入脚本。感谢您的提示。是的，图像属性似乎很难处理，无论如何我不认为评论中需要它们。img 和 a 也可以有 mouseover、mouseout 和其他事件……但是在 Stack Overflow 上，所有 HTML 都被转义，没有白名单。还是我弄错了？我应该把这作为对你答案的评论。不知道 Stack Overflow 是如何处理的，我想试试。我想你是对的，也许禁用 HTML、只接受 Markdown 就足够了。不知道 Stack Overflow 上的评论文本框是否和答案框一样使用 WMD 编辑器？我稍微研究了一下 Html Agility Pack。从表面上看，它似乎可以帮助实现白名单。但是无论如何，最好不要使用 HTML。谢谢。一分钟前，我通过阅读正则表达式的参考资料发现了这一点。对于您的版本，起始标记会被正确替换，但结束标记无法匹配，因为斜杠也被编码了（&#47;）：

// Whitelist of entity-encoded tags that may be decoded back to real HTML.
// The pattern is compiled with IgnorePatternWhitespace, so:
//   * every '#' is escaped as "\#" (an unescaped '#' would start a comment), and
//   * the indentation and the space after "(&\#47;)?" are ignored by the engine.
// Alternatives, in order: plain open/close tags from the list, <br>/<hr>,
// anchors with attributes, images with attributes. "&\#47;" is the encoded '/'.
// NOTE(review): the anchor and img alternatives have no tag-name boundary, so they
// also accept any tag whose name merely starts with "a" or "img".
private static Regex _whitelist = new Regex(@"
    ^&\#60;(&\#47;)? (a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&\#62;$
    |^&\#60;(b|h)r\s?(&\#47;)?&\#62;$
    |^&\#60;a(?!&\#62;).+?&\#62;$
    |^&\#60;img(?!&\#62;).+?(&\#47;)?&\#62;$",

    // The options must be OR-ed together; a missing '|' here is a compile error.
    RegexOptions.Singleline |
    RegexOptions.IgnorePatternWhitespace |
    RegexOptions.ExplicitCapture |
    RegexOptions.Compiled
 );

/// <summary>
/// Re-enables a small whitelist of HTML tags in text that has already been encoded to
/// decimal entity notation (e.g. by Microsoft AntiXSS 3.0); everything else stays encoded.
/// </summary>
class HtmlSanitizer
{
    /// <summary>
    /// Matches anything that looks like an HTML tag after encoding to decimal entities:
    /// a chunk that starts with <c>&amp;#60;</c> (not immediately followed by <c>&amp;#62;</c>)
    /// and runs, lazily, to the next <c>&amp;#62;</c> or to the end of the input.
    /// </summary>
    private static readonly Regex _tags = new Regex(@"&\#60;(?!&\#62;).+?(&\#62;|$)", RegexOptions.Singleline | RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// Matches the encoded tags that are allowed to be decoded. Alternatives, in order:
    /// plain opening/closing tags from the list, br/hr, and anchors carrying attributes.
    /// '#' is escaped everywhere because IgnorePatternWhitespace treats an unescaped '#'
    /// as the start of a comment. <c>&amp;#47;</c> is the encoded '/'.
    /// The anchor alternative requires whitespace right after the "a" so that other tags
    /// whose names merely start with "a" (applet, audio, area, ...) are not whitelisted.
    /// FIXME - attribute values of an accepted anchor are decoded unchecked, so event
    /// handlers and script URLs inside the tag are not filtered by this class.
    /// </summary>
    private static readonly Regex _whitelist = new Regex(@"
^&\#60;(&\#47;)?(a|b(lockquote)?|code|em|h(1|2|3)|i|li|ol|p(re)?|s(ub|up|trong|trike)?|ul)&\#62;$
|^&\#60;(b|h)r\s?(&\#47;)?&\#62;$
|^&\#60;a\s.+?&\#62;$",
      RegexOptions.Singleline | RegexOptions.IgnorePatternWhitespace |
      RegexOptions.ExplicitCapture | RegexOptions.Compiled);

    /// <summary>
    /// HtmlDecodes every whitelisted tag in the entity-encoded input and leaves all
    /// other tag-like chunks encoded.
    /// </summary>
    /// <param name="html">Entity-encoded HTML text; may be null or empty.</param>
    /// <returns>
    /// The input with whitelisted tags decoded. Null or empty input is returned unchanged.
    /// </returns>
    public static string Sanitize(string html)
    {
        // Nothing to scan; also avoids the ArgumentNullException Regex would throw on null.
        if (string.IsNullOrEmpty(html))
        {
            return html;
        }

        // Regex.Replace rebuilds the string in a single pass, so no manual index
        // bookkeeping (Remove/Insert from the back) is needed.
        return _tags.Replace(html, new MatchEvaluator(DecodeIfWhitelisted));
    }

    /// <summary>
    /// Returns the decoded text of a tag-like chunk when it is on the whitelist,
    /// otherwise the chunk exactly as it appeared in the input.
    /// </summary>
    private static string DecodeIfWhitelisted(Match tag)
    {
        // The whitelist is written in lower case; compare a lower-cased copy but
        // decode the original text so the author's casing is preserved.
        string candidate = tag.Value.ToLowerInvariant();
        return _whitelist.IsMatch(candidate)
            ? HttpUtility.HtmlDecode(tag.Value)
            : tag.Value;
    }
}