C# HTML敏捷包-删除不需要的标签而不删除内容？_C#_Html Agility Pack

C# Html Agility Pack——删除不需要的标签而不删除其内容？

C# HTML敏捷包-删除不需要的标签而不删除内容？,c#,html-agility-pack,C#,Html Agility Pack,我在这里看到了一些相关的问题，但它们并没有完全谈论我所面临的问题我想使用从HTML中删除不需要的标记，而不会丢失标记中的内容例如，在我的场景中，我想保留标记“b”、“I”和“u” 对于输入，如：我的段落和div是斜体和粗体的生成的HTML应为：我的段落和div是斜体和粗体的我尝试使用HtmlNode的Remove方法，但它也删除了我的内容。有什么建议吗？在删除节点之前，获取其父节点及其InnerText，然后删除该节点并将InnerText重新分配给父节点 var parent =

我在这里看到了一些相关的问题，但它们并没有完全谈论我所面临的问题

我想使用 Html Agility Pack 从 HTML 中删除不需要的标记，而不会丢失标记中的内容

例如，在我的场景中，我想保留标记“<b>”、“<i>”和“<u>”

对于输入，如：

<p>my paragraph <div>and my <b>div</b></div> are <i>italic</i> and <b>bold</b></p>

生成的HTML应为：

my paragraph and my <b>div</b> are <i>italic</i> and <b>bold</b>

我尝试使用

HtmlNode

的

Remove

方法，但它也删除了我的内容。有什么建议吗？

在删除节点之前，获取其父节点及其

InnerText

，然后删除该节点并将

InnerText

重新分配给父节点

// Replace an unwanted element with a plain-text node holding that element's text.
// Caveat: InnerText flattens any markup nested inside the node, so allowed
// tags inside it are lost as well.
var parent = node.ParentNode;
// Read the text of the node being removed, not of its parent: re-adding the
// parent's InnerText would duplicate the text of every sibling of the node.
var innerText = node.InnerText;
// Swap in place so the text stays where the node was instead of being
// appended after the remaining siblings.
parent.ReplaceChild(doc.CreateTextNode(innerText), node);

我根据 Oded 的建议写了一个算法，代码如下，效果非常好。

它删除除 strong、em、u 和原始文本节点以外的所有标记

/// <summary>
/// Removes every element that is not in the allowed list ("strong", "em", "u")
/// while keeping the element and text children of the removed elements in place.
/// </summary>
/// <param name="data">HTML fragment to clean; may be null or empty.</param>
/// <returns>
/// The cleaned HTML, or an empty string when <paramref name="data"/> is null or empty.
/// </returns>
internal static string RemoveUnwantedTags(string data)
{
    if (string.IsNullOrEmpty(data)) return string.Empty;

    var document = new HtmlDocument();
    document.LoadHtml(data);

    var acceptableTags = new String[] { "strong", "em", "u" };

    // SelectNodes yields null rather than an empty collection when nothing matches
    // (presumably e.g. input that holds neither elements nor text - TODO confirm);
    // passing null to the Queue constructor would throw, so bail out with the
    // document unchanged.
    var topLevelNodes = document.DocumentNode.SelectNodes("./*|./text()");
    if (topLevelNodes == null) return document.DocumentNode.InnerHtml;

    // Breadth-first walk: hoisted children are queued so they get checked too.
    var nodes = new Queue<HtmlNode>(topLevelNodes);
    while (nodes.Count > 0)
    {
        var node = nodes.Dequeue();
        var parentNode = node.ParentNode;

        if (!acceptableTags.Contains(node.Name) && node.Name != "#text")
        {
            var childNodes = node.SelectNodes("./*|./text()");

            if (childNodes != null)
            {
                foreach (var child in childNodes)
                {
                    nodes.Enqueue(child);
                    // Move the child up so it sits where its unwanted parent was.
                    parentNode.InsertBefore(child, node);
                }
            }

            parentNode.RemoveChild(node);
        }
    }

    return document.DocumentNode.InnerHtml;
}

内部静态字符串移除Unventedtags（字符串数据）
{
if（string.IsNullOrEmpty（data））返回string.Empty；
var document=新的HtmlDocument（）；
document.LoadHtml（数据）；
var acceptableTags=新字符串[]{“强”、“em”、“u”}；
var nodes=新队列（document.DocumentNode.SelectNodes（“./*|/text（）”）；
而（nodes.Count>0）
{
var node=nodes.Dequeue（）；
var parentNode=node.parentNode；
if（！acceptableTags.Contains（node.Name）&&node.Name！=“#text”）
{
var childNodes=node.SelectNodes（“./*|/text（）”）；
if（childNodes！=null）
{
foreach（childNodes中的变量child）
{
节点排队（子节点）；
parentNode.InsertBefore（子节点，节点）；
}
}
parentNode.RemoveChild（节点）；
}
}
返回document.DocumentNode.InnerHtml；
}

尝试以下方法，您可能会发现它比其他建议的解决方案更整洁：

/// <summary>
/// Unwraps every node selected by <paramref name="xPath"/> below
/// <paramref name="rootNode"/>: the node itself is removed, its children stay.
/// </summary>
/// <param name="rootNode">Node the XPath expression is evaluated against.</param>
/// <param name="xPath">XPath expression selecting the nodes to unwrap.</param>
/// <returns>The number of nodes that were selected; 0 when nothing matched.</returns>
public static int RemoveNodesButKeepChildren(this HtmlNode rootNode, string xPath)
{
    var matches = rootNode.SelectNodes(xPath);

    // SelectNodes reports "no match" as null.
    if (matches != null)
    {
        foreach (var match in matches)
        {
            match.RemoveButKeepChildren();
        }

        return matches.Count;
    }

    return 0;
}

/// <summary>
/// Removes <paramref name="node"/> from its parent after inserting each of its
/// children into the parent, directly before the node.
/// </summary>
/// <param name="node">The node to unwrap.</param>
public static void RemoveButKeepChildren(this HtmlNode node)
{
    var parent = node.ParentNode;

    foreach (var child in node.ChildNodes)
    {
        // Inserting before the node keeps the children in their original order.
        parent.InsertBefore(child, node);
    }

    node.Remove();
}

/// <summary>
/// Runs the example from the question: unwraps the div and p elements and
/// compares the remaining markup with the expected result.
/// </summary>
/// <returns>True when the resulting HTML equals the expected string.</returns>
public static bool TestYourSpecificExample()
{
    const string expected = "my paragraph and my <b>div</b> are <i>italic</i> and <b>bold</b>";

    var document = new HtmlDocument();
    document.LoadHtml("<p>my paragraph <div>and my <b>div</b></div> are <i>italic</i> and <b>bold</b></p>");

    var root = document.DocumentNode;
    root.RemoveNodesButKeepChildren("//div");
    root.RemoveNodesButKeepChildren("//p");

    return root.InnerHtml == expected;
}

public static int RemoveNodesButKeepChildren（此HtmlNode根节点，字符串xPath）
{
HtmlNodeCollection节点=rootNode.SelectNodes（xPath）；
如果（节点==null）
返回0；
foreach（节点中的HtmlNode节点）
node.RemoveButKeepChildren（）；
返回节点数；
}
公共静态void RemoveButKeepChildren（此HtmlNode节点）
{
foreach（node.ChildNodes中的HtmlNode子节点）
node.ParentNode.InsertBefore（子节点，节点）；
node.Remove（）；
}
公共静态bool TestYourSpecificExample（）
{
字符串html=“我的段落和div是斜体和粗体的”；
HtmlDocument document=新的HtmlDocument（）；
document.LoadHtml（html）；
document.DocumentNode.RemoveNodesButKeepChildren（“//div”）；
document.DocumentNode.RemoveNodesButKeepChildren（“//p”）；
return document.DocumentNode.InnerHtml==“我的段落和div为斜体和粗体”；
}

如何从 HTML 字符串中递归删除给定列表中不需要的 HTML 标记

我采用了 @mathias 的答案并改进了他的扩展方法，这样您就可以以 List<string> 的形式提供一个不需要的标签列表（例如 {"a", "p", "hr"}）。我还修复了逻辑，使其能够正确地递归工作：

/// <summary>
/// Removes every element whose tag name appears in <paramref name="unwantedTags"/>,
/// at any depth, while keeping the element and text children of the removed
/// elements in place.
/// </summary>
/// <param name="html">HTML fragment to clean; returned as-is when null or empty.</param>
/// <param name="unwantedTags">
/// Tag names to strip, e.g. "a", "p", "hr". Names are compared case-insensitively.
/// </param>
/// <returns>
/// The cleaned HTML, or the original <paramref name="html"/> when it is null or
/// empty or when no top-level element or text node is found.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="unwantedTags"/> is null and there are nodes to inspect.
/// </exception>
public static string RemoveUnwantedHtmlTags(this string html, List<string> unwantedTags)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    var document = new HtmlDocument();
    document.LoadHtml(html);

    HtmlNodeCollection tryGetNodes = document.DocumentNode.SelectNodes("./*|./text()");

    if (tryGetNodes == null || !tryGetNodes.Any())
    {
        return html;
    }

    if (unwantedTags == null)
    {
        throw new ArgumentNullException("unwantedTags");
    }

    // Case-insensitive set: a caller passing "P" or "DIV" still matches, and the
    // per-node lookup no longer scans the whole list.
    var unwanted = new HashSet<string>(unwantedTags, StringComparer.OrdinalIgnoreCase);

    // Breadth-first walk over the whole tree.
    var nodes = new Queue<HtmlNode>(tryGetNodes);

    while (nodes.Count > 0)
    {
        var node = nodes.Dequeue();
        var parentNode = node.ParentNode;

        var childNodes = node.SelectNodes("./*|./text()");

        // Children are always queued, so unwanted tags nested inside kept
        // elements are found as well.
        if (childNodes != null)
        {
            foreach (var child in childNodes)
            {
                nodes.Enqueue(child);
            }
        }

        if (unwanted.Contains(node.Name))
        {
            if (childNodes != null)
            {
                foreach (var child in childNodes)
                {
                    // Move the child up so it sits where its unwanted parent was.
                    parentNode.InsertBefore(child, node);
                }
            }

            parentNode.RemoveChild(node);
        }
    }

    return document.DocumentNode.InnerHtml;
}

公共静态字符串RemoveUnwantedHtmlTags（此字符串为html，列表为unwantedTags）
{
if（String.IsNullOrEmpty（html））
{
返回html；
}
var document=新的HtmlDocument（）；
document.LoadHtml（html）；
HtmlNodeCollection tryGetNodes=document.DocumentNode.SelectNodes（“./*|/text（）”）；
if（tryGetNodes==null | |！tryGetNodes.Any（））
{
返回html；
}
var节点=新队列（tryGetNodes）；
而（nodes.Count>0）
{
var node=nodes.Dequeue（）；
var parentNode=node.parentNode；
var childNodes=node.SelectNodes（“./*|/text（）”）；
if（childNodes！=null）
{
foreach（childNodes中的变量child）
{
节点排队（子节点）；
}
}
if（unwantedTags.Any（tag=>tag==node.Name））
{               
if（childNodes！=null）
{
foreach（childNodes中的变量child）
{
parentNode.InsertBefore（子节点，节点）；
}
}
parentNode.RemoveChild（节点）；
}
}
返回document.DocumentNode.InnerHtml；
}

如果您不想使用Html agility pack，但仍想删除不需要的Html标记，则可以按如下所示操作

/// <summary>
/// Strips all HTML tags from <paramref name="strHtml"/>, decodes HTML entities
/// and collapses every run of whitespace into a single space.
/// </summary>
/// <param name="strHtml">HTML text to convert; may be null or empty.</param>
/// <returns>
/// The plain text, or an empty string when <paramref name="strHtml"/> is null or empty.
/// </returns>
public static string RemoveHtmlTags(string strHtml)
{
    // Regex.Replace throws on null input; return empty like the other helpers do.
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // "<[^>]*>" matches the same spans as "<(.|\n)*?>" (from '<' to the first '>',
    // newlines included) without a capture group and alternation per character.
    string strText = Regex.Replace(strHtml, "<[^>]*>", String.Empty);
    strText = HttpUtility.HtmlDecode(strText);
    strText = Regex.Replace(strText, @"\s+", " ");
    return strText;
}

publicstaticstringremovehtmltags（stringstrhtml）
{
string strText=Regex.Replace（strHtml，“，string.Empty）；
strText=HttpUtility.HtmlDecode（strText）；
strText=Regex.Replace（strText，@“\s+”，“”）；
返回strText；
}

InnerText破坏了层次结构。想象更多的深度，如以下内容：

我的段落和div是斜体和粗体的

。InnerText会破坏HTML并将其转换为文本，但这不是我在所有场景中都想要的。我仍然需要允许标记的HTML。@MathiasLykkegaardLorenzen-我的观点是，您只需要使用不需要的元素就可以了。是的。但是如果你在这里查看这些数据呢

lalala omg这是粗体的

。首先，您将发现

span

元素。这是不允许的，因此您可以用它的

InnerText

值交换它。然而，这破坏了我仍然需要的内部 <b>

标记。我希望这能更好地解释它。@MathiasLykkegaardLorenzen-当然，你也可以简单地做同样的事情，但是用节点的

InnerHtml

而不是父节点（也就是说，取节点的

InnerHtml

，替换父节点

InnerHtml