C# 如何在 ASP.NET 中删除 HTML 源代码中不完整(缺少配对)的标记
有时我们会发现,我们从一些网站收到的HTML源代码没有正确的标记结尾,这会影响我们的UI。 所以,就像C# 如何在Asp.net中删除HTML源中缺少的标记,c#,html,asp.net,regex,C#,Html,Asp.net,Regex,有时我们会发现,我们从一些网站收到的HTML源代码没有正确的标记结尾,这会影响我们的UI。 所以,就像 您好,段落从这里开始一些文本,没有结束标记 而且没有结束标记 我想保留HTML格式,并希望这样 <br /><p>hello the para start here </p> some text and no ending tag 您好,段落从这里开始一些文本,没有结束标记 还有一件事是,有时我们会在开始时得到结束标记,这也应该通过算法来解决。嘿,伙计
您好,段落从这里开始一些文本,没有结束标记
而且没有结束标记
我想保留HTML格式,并希望这样
<br /><p>hello the para start here </p> some text and no ending tag
您好,段落从这里开始一些文本,没有结束标记
还有一件事是,有时我们会在开头遇到没有对应开始标记的结束标记,这也应该由算法来处理。
大家好,我思考了很久,终于写出了解决这个问题的代码。我把它发布在这里,希望其他人也能从中受益:
/// <summary>
/// Removes unbalanced occurrences of a single HTML tag from <paramref name="source"/>.
/// An opening <c>&lt;tag&gt;</c> that is never closed and a closing <c>&lt;/tag&gt;</c>
/// that has no preceding unmatched opening tag are both dropped; balanced pairs are kept.
/// Empty pairs (<c>&lt;tag&gt;&lt;/tag&gt;</c> and <c>&lt;tag&gt; &lt;/tag&gt;</c>) are removed as well.
/// </summary>
/// <param name="source">The HTML fragment to clean up.</param>
/// <param name="tag">The tag name without angle brackets, e.g. <c>p</c>.</param>
/// <returns>
/// The cleaned fragment. Whitespace is normalised: every tag is separated from the
/// surrounding text and all tokens are joined with single spaces. Returns
/// <see cref="string.Empty"/> when nothing remains.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="source"/> or <paramref name="tag"/> is <c>null</c>.
/// </exception>
/// <remarks>
/// Only bare tags of the exact form <c>&lt;tag&gt;</c> / <c>&lt;/tag&gt;</c> are recognised.
/// An opening tag that carries attributes is treated as ordinary text, so its closing
/// tag is seen as unmatched and removed.
/// </remarks>
public static string RemoveIncompleteTags(string source, string tag)
{
    if (source == null)
    {
        throw new ArgumentNullException("source");
    }

    if (tag == null)
    {
        throw new ArgumentNullException("tag");
    }

    string openTag = "<" + tag + ">";
    string closeTag = "</" + tag + ">";

    // Line breaks and tabs are plain whitespace in HTML. Turn them into spaces
    // (rather than deleting them) so that adjacent words are not glued together.
    source = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

    // Drop pairs that wrap nothing at all.
    source = source.Replace(openTag + closeTag, string.Empty);
    source = source.Replace(openTag + " " + closeTag, string.Empty);

    // Pad every angle bracket so each tag becomes its own space-separated token,
    // then undo the padding that ended up inside a tag.
    source = source.Replace(">", "> ").Replace("<", " <");
    source = source.Replace(" >", ">").Replace("< ", "<");

    string[] words = source.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Tokens that survive, in order. A slot is set to null when it turns out to
    // hold an opening tag that never got closed.
    List<string> kept = new List<string>(words.Length);

    // Positions (in `kept`) of opening tags still waiting for their closing tag.
    // A real stack is used because Dictionary enumeration order is unspecified.
    Stack<int> unmatchedOpenings = new Stack<int>();

    foreach (string raw in words)
    {
        string word = raw.Trim();
        if (word.Length == 0)
        {
            continue;
        }

        // Ordinal comparison: tag names are identifiers, not linguistic text.
        if (word.Equals(openTag, StringComparison.OrdinalIgnoreCase))
        {
            unmatchedOpenings.Push(kept.Count);
            kept.Add(word);
        }
        else if (word.Equals(closeTag, StringComparison.OrdinalIgnoreCase))
        {
            if (unmatchedOpenings.Count > 0)
            {
                unmatchedOpenings.Pop();
                kept.Add(word);
            }
            // Otherwise this closing tag has no partner and is left out of the output.
        }
        else
        {
            kept.Add(word);
        }
    }

    // Whatever is still on the stack was opened but never closed.
    foreach (int position in unmatchedOpenings)
    {
        kept[position] = null;
    }

    return string.Join(" ", kept.Where(w => w != null).ToArray());
}
publicstaticstringremoveincompletetags(字符串源,字符串标记)
{
source=source.Replace(“,”);
source=source.Replace(“/n”,string.Empty)。Replace(“/r”,string.Empty)。Replace(“/t”,string.Empty);
source=source.Replace(“,string.Empty”);
source=source.Replace(“,string.Empty”);
source=source.Replace(“,string.Empty”);
字典oDict=新字典();
字符串[]灵魂主义者;
Dictionary final=新字典();
bool-opening=false;
布尔操作=假;
source=source.Replace(“,”);
source=source.Replace(“>”,“>”).Replace(“”).Replace(“<”,“
这是家庭作业吗?这就像自己写一个HTML编译器一样……
我正在做一个文章翻译工具,希望保留文章的原始格式,这是为GabbleOn.com做的。
请参见(原链接缺失)——它会尽力把HTML转换为一致的(X)HTML;或者看看(原链接缺失)——Stack Overflow上有很多相关资源。
有大量的HTML解析库,它们在这方面比正则表达式好得多。(Stack Overflow上还有一篇必读的帖子说明了这一点,但我懒得贴链接了。)这些解析器会尽最大努力把破损的HTML转换为完好的HTML。但最终可能会有一些内容破损得太厉害,只能由作者自己修复。
对于这件小事,我不想引入整个HTML Agility Pack。
这是一次勇敢的尝试,但试图用正则表达式或字符串模式处理(可能格式错误的)HTML只会带来痛苦。dash在上面提出的关于tidy的建议似乎是更好的途径。
/// <summary>
/// Removes unbalanced occurrences of a single HTML tag from <paramref name="source"/>.
/// An opening <c>&lt;tag&gt;</c> that is never closed and a closing <c>&lt;/tag&gt;</c>
/// that has no preceding unmatched opening tag are both dropped; balanced pairs are kept.
/// Empty pairs (<c>&lt;tag&gt;&lt;/tag&gt;</c> and <c>&lt;tag&gt; &lt;/tag&gt;</c>) are removed as well.
/// </summary>
/// <param name="source">The HTML fragment to clean up.</param>
/// <param name="tag">The tag name without angle brackets, e.g. <c>p</c>.</param>
/// <returns>
/// The cleaned fragment. Whitespace is normalised: every tag is separated from the
/// surrounding text and all tokens are joined with single spaces. Returns
/// <see cref="string.Empty"/> when nothing remains.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="source"/> or <paramref name="tag"/> is <c>null</c>.
/// </exception>
/// <remarks>
/// Only bare tags of the exact form <c>&lt;tag&gt;</c> / <c>&lt;/tag&gt;</c> are recognised.
/// An opening tag that carries attributes is treated as ordinary text, so its closing
/// tag is seen as unmatched and removed.
/// </remarks>
public static string RemoveIncompleteTags(string source, string tag)
{
    if (source == null)
    {
        throw new ArgumentNullException("source");
    }

    if (tag == null)
    {
        throw new ArgumentNullException("tag");
    }

    string openTag = "<" + tag + ">";
    string closeTag = "</" + tag + ">";

    // Line breaks and tabs are plain whitespace in HTML. Turn them into spaces
    // (rather than deleting them) so that adjacent words are not glued together.
    source = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

    // Drop pairs that wrap nothing at all.
    source = source.Replace(openTag + closeTag, string.Empty);
    source = source.Replace(openTag + " " + closeTag, string.Empty);

    // Pad every angle bracket so each tag becomes its own space-separated token,
    // then undo the padding that ended up inside a tag.
    source = source.Replace(">", "> ").Replace("<", " <");
    source = source.Replace(" >", ">").Replace("< ", "<");

    string[] words = source.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Tokens that survive, in order. A slot is set to null when it turns out to
    // hold an opening tag that never got closed.
    List<string> kept = new List<string>(words.Length);

    // Positions (in `kept`) of opening tags still waiting for their closing tag.
    // A real stack is used because Dictionary enumeration order is unspecified.
    Stack<int> unmatchedOpenings = new Stack<int>();

    foreach (string raw in words)
    {
        string word = raw.Trim();
        if (word.Length == 0)
        {
            continue;
        }

        // Ordinal comparison: tag names are identifiers, not linguistic text.
        if (word.Equals(openTag, StringComparison.OrdinalIgnoreCase))
        {
            unmatchedOpenings.Push(kept.Count);
            kept.Add(word);
        }
        else if (word.Equals(closeTag, StringComparison.OrdinalIgnoreCase))
        {
            if (unmatchedOpenings.Count > 0)
            {
                unmatchedOpenings.Pop();
                kept.Add(word);
            }
            // Otherwise this closing tag has no partner and is left out of the output.
        }
        else
        {
            kept.Add(word);
        }
    }

    // Whatever is still on the stack was opened but never closed.
    foreach (int position in unmatchedOpenings)
    {
        kept[position] = null;
    }

    return string.Join(" ", kept.Where(w => w != null).ToArray());
}