C# 如何在Asp.net中删除HTML源中缺少的标记_C#_Html_Asp.net_Regex

C# 如何在Asp.net中删除HTML源中缺少的标记

c# html asp.net regex

C# 如何在Asp.net中删除HTML源中缺少的标记,c#,html,asp.net,regex,C#,Html,Asp.net,Regex,有时我们会发现，我们从一些网站收到的HTML源代码没有正确的标记结尾，这会影响我们的UI。所以，就像您好，段落从这里开始一些文本，没有结束标记而且没有结束标记我想保留HTML格式，并希望这样 <br /><p>hello the para start here </p> some text and no ending tag 您好，段落从这里开始一些文本，没有结束标记还有一件事是，有时我们会在开始时得到结束标记，这也应该通过算法来解决。嘿，伙计

有时我们会发现，从某些网站获取的HTML源代码缺少正确的结束标记，这会影响我们的UI。例如：


您好，段落从这里开始一些文本，没有结束标记

而且没有结束标记

我想保留HTML格式，并希望这样

<br /><p>hello the para start here </p> some text and no ending tag


您好，段落从这里开始一些文本，没有结束标记

另外，有时结束标记会出现在对应的开始标记之前（即开头就是一个结束标记），算法也应当处理这种情况。

大家好，我思考了很久，终于写出了解决这个问题的代码。我把它发布在这里，希望其他人也能从中受益：

 /// <summary>
 /// Removes unbalanced occurrences of a single attribute-less HTML tag from
 /// <paramref name="source"/>: closing tags that have no preceding unmatched
 /// opening tag, and opening tags that are never closed. Balanced pairs and
 /// all other text are kept.
 /// </summary>
 /// <param name="source">The HTML fragment to clean. Null or empty yields an empty string.</param>
 /// <param name="tag">The tag name without angle brackets, e.g. "p". Matched case-insensitively.</param>
 /// <returns>
 /// The cleaned text with tokens separated by single spaces. Note that every
 /// '&lt;' and '&gt;' in the input is padded with a space, so tags are always
 /// separated from neighbouring text in the result.
 /// </returns>
 /// <exception cref="ArgumentNullException"><paramref name="tag"/> is null.</exception>
 /// <remarks>
 /// Only the exact forms &lt;tag&gt; and &lt;/tag&gt; are recognised; an opening tag
 /// that carries attributes is treated as ordinary text.
 /// </remarks>
 public static string RemoveIncompleteTags(string source, string tag)
    {
        if (tag == null)
        {
            throw new ArgumentNullException("tag");
        }
        if (string.IsNullOrEmpty(source))
        {
            return string.Empty;
        }

        string openTag = "<" + tag + ">";
        string closeTag = "</" + tag + ">";

        // Line breaks and tabs are plain whitespace in HTML. Turning them into
        // spaces (instead of deleting them) keeps adjacent words apart. The
        // previous code replaced the literal texts "/n", "/r" and "/t", which
        // left real line breaks in place and damaged text such as ".../news".
        source = source.Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
        source = CollapseSpaces(source);

        // Drop elements that have no content at all.
        source = source.Replace(openTag + closeTag, string.Empty);
        source = source.Replace(openTag + " " + closeTag, string.Empty);

        // Isolate every tag as its own space-delimited token.
        source = source.Replace(">", "> ").Replace("<", " <");
        source = source.Replace(" >", ">").Replace("< ", "<");
        string[] words = source.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // keep[i] tells whether words[i] is part of the output.
        bool[] keep = new bool[words.Length];
        // Positions of opening tags that have not been closed yet. A real
        // stack gives guaranteed LIFO order, unlike Dictionary enumeration.
        Stack<int> unmatchedOpenings = new Stack<int>();

        for (int i = 0; i < words.Length; i++)
        {
            string word = words[i];
            // Ordinal comparison avoids culture-specific casing rules.
            if (string.Equals(word, openTag, StringComparison.OrdinalIgnoreCase))
            {
                unmatchedOpenings.Push(i);
                keep[i] = true; // provisional; revoked below if never closed
            }
            else if (string.Equals(word, closeTag, StringComparison.OrdinalIgnoreCase))
            {
                if (unmatchedOpenings.Count > 0)
                {
                    unmatchedOpenings.Pop();
                    keep[i] = true;
                }
                // else: stray closing tag, leave keep[i] == false
            }
            else
            {
                keep[i] = true;
            }
        }

        // Whatever is still on the stack was opened but never closed.
        foreach (int position in unmatchedOpenings)
        {
            keep[position] = false;
        }

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < words.Length; i++)
        {
            if (!keep[i])
            {
                continue;
            }
            string trimmed = words[i].Trim();
            if (trimmed.Length == 0)
            {
                continue;
            }
            if (result.Length > 0)
            {
                result.Append(' ');
            }
            result.Append(trimmed);
        }
        return result.ToString();
    }

 // Reduces every run of consecutive spaces in text to a single space.
 private static string CollapseSpaces(string text)
    {
        while (text.Contains("  "))
        {
            text = text.Replace("  ", " ");
        }
        return text;
    }

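public static string RemoveIncompleteTags(string source, string tag)
{
source = source.Replace("  ", " ");
source = source.Replace("/n", string.Empty).Replace("/r", string.Empty).Replace("/t", string.Empty);
source = source.Replace("<" + tag + "></" + tag + ">", string.Empty);
source = source.Replace("<" + tag + "> </" + tag + ">", string.Empty);
source = source.Replace("<" + tag + ">  </" + tag + ">", string.Empty);
Dictionary<int, string> oDict = new Dictionary<int, string>();
string[] souceList;
Dictionary<int, string> final = new Dictionary<int, string>();
bool opening = false;
bool operate = false;
source = source.Replace("  ", " ");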
source = source.Replace(">", "> ").Replace("<", " <");

这是家庭作业吗？这就像是在做一个HTML编译器……

我正在做一个文章翻译功能，希望保留文章的原始格式，这是为 GabbleOn.com 做的。

请参见[链接缺失]或[链接缺失]——它会尽力尝试将HTML转换为一致的(X)HTML。

或者看看[链接缺失]或[链接缺失]——Stack Overflow 上有很多相关资源。

有大量的HTML解析库，它们在这方面比正则表达式要好得多。（Stack Overflow 上还有一篇必读的帖子说明了这一点，但我懒得去找链接了。）这些解析器会尽最大努力把破损的HTML修复成完好的HTML。但最终可能仍有一些内容破损得太严重，只能由作者自己修复。

对于这么一件小事，我不想引入整个 HTML Agility Pack。

这是一次勇敢的尝试，但试图用正则表达式或字符串模式来处理（可能格式错误的）HTML只会带来痛苦。dash 在上面提出的使用 Tidy 的建议似乎是一条更好的途径。
 /// <summary>
 /// Removes unbalanced occurrences of a single attribute-less HTML tag from
 /// <paramref name="source"/>: closing tags that have no preceding unmatched
 /// opening tag, and opening tags that are never closed. Balanced pairs and
 /// all other text are kept.
 /// </summary>
 /// <param name="source">The HTML fragment to clean. Null or empty yields an empty string.</param>
 /// <param name="tag">The tag name without angle brackets, e.g. "p". Matched case-insensitively.</param>
 /// <returns>
 /// The cleaned text with tokens separated by single spaces. Note that every
 /// '&lt;' and '&gt;' in the input is padded with a space, so tags are always
 /// separated from neighbouring text in the result.
 /// </returns>
 /// <exception cref="ArgumentNullException"><paramref name="tag"/> is null.</exception>
 /// <remarks>
 /// Only the exact forms &lt;tag&gt; and &lt;/tag&gt; are recognised; an opening tag
 /// that carries attributes is treated as ordinary text.
 /// </remarks>
 public static string RemoveIncompleteTags(string source, string tag)
    {
        if (tag == null)
        {
            throw new ArgumentNullException("tag");
        }
        if (string.IsNullOrEmpty(source))
        {
            return string.Empty;
        }

        string openTag = "<" + tag + ">";
        string closeTag = "</" + tag + ">";

        // Line breaks and tabs are plain whitespace in HTML. Turning them into
        // spaces (instead of deleting them) keeps adjacent words apart. The
        // previous code replaced the literal texts "/n", "/r" and "/t", which
        // left real line breaks in place and damaged text such as ".../news".
        source = source.Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');
        source = CollapseSpaces(source);

        // Drop elements that have no content at all.
        source = source.Replace(openTag + closeTag, string.Empty);
        source = source.Replace(openTag + " " + closeTag, string.Empty);

        // Isolate every tag as its own space-delimited token.
        source = source.Replace(">", "> ").Replace("<", " <");
        source = source.Replace(" >", ">").Replace("< ", "<");
        string[] words = source.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

        // keep[i] tells whether words[i] is part of the output.
        bool[] keep = new bool[words.Length];
        // Positions of opening tags that have not been closed yet. A real
        // stack gives guaranteed LIFO order, unlike Dictionary enumeration.
        Stack<int> unmatchedOpenings = new Stack<int>();

        for (int i = 0; i < words.Length; i++)
        {
            string word = words[i];
            // Ordinal comparison avoids culture-specific casing rules.
            if (string.Equals(word, openTag, StringComparison.OrdinalIgnoreCase))
            {
                unmatchedOpenings.Push(i);
                keep[i] = true; // provisional; revoked below if never closed
            }
            else if (string.Equals(word, closeTag, StringComparison.OrdinalIgnoreCase))
            {
                if (unmatchedOpenings.Count > 0)
                {
                    unmatchedOpenings.Pop();
                    keep[i] = true;
                }
                // else: stray closing tag, leave keep[i] == false
            }
            else
            {
                keep[i] = true;
            }
        }

        // Whatever is still on the stack was opened but never closed.
        foreach (int position in unmatchedOpenings)
        {
            keep[position] = false;
        }

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < words.Length; i++)
        {
            if (!keep[i])
            {
                continue;
            }
            string trimmed = words[i].Trim();
            if (trimmed.Length == 0)
            {
                continue;
            }
            if (result.Length > 0)
            {
                result.Append(' ');
            }
            result.Append(trimmed);
        }
        return result.ToString();
    }

 // Reduces every run of consecutive spaces in text to a single space.
 private static string CollapseSpaces(string text)
    {
        while (text.Contains("  "))
        {
            text = text.Replace("  ", " ");
        }
        return text;
    }