C# 如何在Asp.net中删除HTML源中缺少的标记

C# 如何在Asp.net中删除HTML源中缺少的标记,c#,html,asp.net,regex,C#,Html,Asp.net,Regex,有时我们会发现,我们从一些网站收到的HTML源代码没有正确的标记结尾,这会影响我们的UI。 所以,就像 您好,段落从这里开始一些文本,没有结束标记 而且没有结束标记 我想保留HTML格式,并希望这样 <br /><p>hello the para start here </p> some text and no ending tag 您好,段落从这里开始一些文本,没有结束标记 还有一件事是,有时我们会在开始时得到结束标记,这也应该通过算法来解决。嘿,伙计

有时我们会发现,我们从一些网站收到的HTML源代码没有正确的标记结尾,这会影响我们的UI。 所以,就像


您好,段落从这里开始

一些文本,没有结束标记
而且没有结束标记

我想保留HTML格式,并希望这样

<br /><p>hello the para start here </p> some text and no ending tag

您好,段落从这里开始

一些文本,没有结束标记

还有一件事是,有时我们会在开始时得到结束标记,这也应该通过算法来解决。

嘿,伙计们,我想了很久,最后我有了解决问题的代码,我将它发布在这里,以便其他人可以从中受益

 /// <summary>
 /// Removes unbalanced occurrences of a simple HTML tag from <paramref name="source"/>.
 /// An opening tag that is never closed and a closing tag that has no preceding
 /// opener are both dropped; properly paired tags and all other text are kept.
 /// </summary>
 /// <remarks>
 /// Only bare tags of the exact form <c>&lt;tag&gt;</c> and <c>&lt;/tag&gt;</c> (compared
 /// case-insensitively) are recognised; an opening tag that carries attributes is
 /// treated as ordinary text. The result is whitespace-normalised: every run of
 /// spaces, tabs and line breaks becomes a single space, and each tag is separated
 /// from neighbouring text by a space.
 /// </remarks>
 /// <param name="source">HTML fragment to clean. <c>null</c> or empty yields an empty string.</param>
 /// <param name="tag">Tag name without angle brackets, e.g. <c>p</c>.</param>
 /// <returns>The cleaned, whitespace-normalised fragment.</returns>
 /// <exception cref="System.ArgumentNullException"><paramref name="tag"/> is <c>null</c>.</exception>
 public static string RemoveIncompleteTags(string source, string tag)
    {
        if (tag == null)
        {
            throw new System.ArgumentNullException("tag");
        }

        if (string.IsNullOrEmpty(source))
        {
            return string.Empty;
        }

        string openTag = "<" + tag + ">";
        string closeTag = "</" + tag + ">";

        // Line breaks and tabs are plain whitespace in HTML. The earlier code looked
        // for the literal texts "/n", "/r" and "/t", which never matched a control
        // character and instead corrupted content such as "/news" inside URLs.
        source = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");
        source = source.Replace("  ", " ");

        // Drop tag pairs that enclose nothing but (at most two) spaces.
        source = source.Replace(openTag + closeTag, string.Empty);
        source = source.Replace(openTag + " " + closeTag, string.Empty);
        source = source.Replace(openTag + "  " + closeTag, string.Empty);
        source = source.Replace("  ", " ");

        // Pad every angle bracket so each tag becomes its own space-separated token,
        // then undo the padding that landed inside a tag ("<p >" -> "<p>").
        source = source.Replace(">", "> ").Replace("<", " <");
        source = source.Replace(" >", ">").Replace("< ", "<");

        string[] words = source.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

        List<string> kept = new List<string>(words.Length);

        // Positions (in 'kept') of opening tags still waiting for their closer.
        // A real stack gives the LIFO matching that Dictionary.Last() only
        // provided by accident of enumeration order.
        Stack<int> pendingOpeners = new Stack<int>();

        foreach (string word in words)
        {
            if (string.Equals(word, openTag, System.StringComparison.OrdinalIgnoreCase))
            {
                pendingOpeners.Push(kept.Count);
                kept.Add(word);
            }
            else if (string.Equals(word, closeTag, System.StringComparison.OrdinalIgnoreCase))
            {
                // A closer with no pending opener is a stray tag: leave it out.
                if (pendingOpeners.Count > 0)
                {
                    pendingOpeners.Pop();
                    kept.Add(word);
                }
            }
            else
            {
                kept.Add(word);
            }
        }

        // Whatever is still pending was opened but never closed.
        HashSet<int> dropped = new HashSet<int>(pendingOpeners);

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < kept.Count; i++)
        {
            if (dropped.Contains(i))
            {
                continue;
            }

            // Trim guards against exotic whitespace that Split(' ') does not remove.
            string trimmed = kept[i].Trim();
            if (trimmed.Length == 0)
            {
                continue;
            }

            if (result.Length > 0)
            {
                result.Append(' ');
            }
            result.Append(trimmed);
        }

        return result.ToString();
    }
publicstaticstringremoveincompletetags(字符串源,字符串标记)
{
source=source.Replace(“,”);
source=source.Replace(“/n”,string.Empty)。Replace(“/r”,string.Empty)。Replace(“/t”,string.Empty);
source=source.Replace(“,string.Empty”);
source=source.Replace(“,string.Empty”);
source=source.Replace(“,string.Empty”);
字典oDict=新字典();
字符串[]灵魂主义者;
Dictionary final=新字典();
bool-opening=false;
布尔操作=假;
source=source.Replace(“,”);

source = source.Replace(">", "> ").Replace("<", " <");

这是家庭作业吗?这就像是要自己写一个 HTML 编译器……

我正在做一个文章翻译工具,希望保留文章的原始格式,这是为 GabbleOn.com 做的。

请参见 HTML Tidy 之类的工具——它会尽力把 HTML 转换为一致的 (X)HTML;或者看看 HTML Agility Pack——Stack Overflow 上有很多相关资源。

有大量的 HTML 解析库,它们在这方面比正则表达式好得多。(Stack Overflow 上还有一篇必读的帖子专门说明了这一点,不过我懒得去找链接了。)这些解析器会尽最大努力把破损的 HTML 修复成完好的 HTML。但最终总会有一些内容破损得太严重,只能由作者自己去修复。

对于这样一件小事,我不想引入整个 HTML Agility Pack。

这是一次勇敢的尝试,但用正则表达式或字符串匹配来处理(可能格式错误的)HTML 只会自讨苦吃。dash 在上面提出的使用 Tidy 的建议似乎是更好的途径。
 /// <summary>
 /// Removes unbalanced occurrences of a simple HTML tag from <paramref name="source"/>.
 /// An opening tag that is never closed and a closing tag that has no preceding
 /// opener are both dropped; properly paired tags and all other text are kept.
 /// </summary>
 /// <remarks>
 /// Only bare tags of the exact form <c>&lt;tag&gt;</c> and <c>&lt;/tag&gt;</c> (compared
 /// case-insensitively) are recognised; an opening tag that carries attributes is
 /// treated as ordinary text. The result is whitespace-normalised: every run of
 /// spaces, tabs and line breaks becomes a single space, and each tag is separated
 /// from neighbouring text by a space.
 /// </remarks>
 /// <param name="source">HTML fragment to clean. <c>null</c> or empty yields an empty string.</param>
 /// <param name="tag">Tag name without angle brackets, e.g. <c>p</c>.</param>
 /// <returns>The cleaned, whitespace-normalised fragment.</returns>
 /// <exception cref="System.ArgumentNullException"><paramref name="tag"/> is <c>null</c>.</exception>
 public static string RemoveIncompleteTags(string source, string tag)
    {
        if (tag == null)
        {
            throw new System.ArgumentNullException("tag");
        }

        if (string.IsNullOrEmpty(source))
        {
            return string.Empty;
        }

        string openTag = "<" + tag + ">";
        string closeTag = "</" + tag + ">";

        // Line breaks and tabs are plain whitespace in HTML. The earlier code looked
        // for the literal texts "/n", "/r" and "/t", which never matched a control
        // character and instead corrupted content such as "/news" inside URLs.
        source = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");
        source = source.Replace("  ", " ");

        // Drop tag pairs that enclose nothing but (at most two) spaces.
        source = source.Replace(openTag + closeTag, string.Empty);
        source = source.Replace(openTag + " " + closeTag, string.Empty);
        source = source.Replace(openTag + "  " + closeTag, string.Empty);
        source = source.Replace("  ", " ");

        // Pad every angle bracket so each tag becomes its own space-separated token,
        // then undo the padding that landed inside a tag ("<p >" -> "<p>").
        source = source.Replace(">", "> ").Replace("<", " <");
        source = source.Replace(" >", ">").Replace("< ", "<");

        string[] words = source.Split(new[] { ' ' }, System.StringSplitOptions.RemoveEmptyEntries);

        List<string> kept = new List<string>(words.Length);

        // Positions (in 'kept') of opening tags still waiting for their closer.
        // A real stack gives the LIFO matching that Dictionary.Last() only
        // provided by accident of enumeration order.
        Stack<int> pendingOpeners = new Stack<int>();

        foreach (string word in words)
        {
            if (string.Equals(word, openTag, System.StringComparison.OrdinalIgnoreCase))
            {
                pendingOpeners.Push(kept.Count);
                kept.Add(word);
            }
            else if (string.Equals(word, closeTag, System.StringComparison.OrdinalIgnoreCase))
            {
                // A closer with no pending opener is a stray tag: leave it out.
                if (pendingOpeners.Count > 0)
                {
                    pendingOpeners.Pop();
                    kept.Add(word);
                }
            }
            else
            {
                kept.Add(word);
            }
        }

        // Whatever is still pending was opened but never closed.
        HashSet<int> dropped = new HashSet<int>(pendingOpeners);

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < kept.Count; i++)
        {
            if (dropped.Contains(i))
            {
                continue;
            }

            // Trim guards against exotic whitespace that Split(' ') does not remove.
            string trimmed = kept[i].Trim();
            if (trimmed.Length == 0)
            {
                continue;
            }

            if (result.Length > 0)
            {
                result.Append(' ');
            }
            result.Append(trimmed);
        }

        return result.ToString();
    }