C# XPath来确定前辈或祖先是否具有内容,而不考虑级别
我正在研究一个以原始HTML阅读新闻文章的过程。新闻文章来源多种多样,因此不一致 目标是定位文章中的第一个图像,如果之前没有内容,则将图像去掉 我们的数据不一致(因为它来自不同的来源),例如 第一张图片前没有内容示例:C# XPath来确定前辈或祖先是否具有内容,而不考虑级别,c#,dom,xpath,html-agility-pack,C#,Dom,Xpath,Html Agility Pack,我正在研究一个以原始HTML阅读新闻文章的过程。新闻文章来源多种多样,因此不一致 目标是定位文章中的第一个图像,如果之前没有内容,则将图像去掉 我们的数据不一致(因为它来自不同的来源),例如 第一张图片前没有内容示例: A: <p><a href="..."><img src="..." /></a></p><p>content</p> B: <img src="..." /><p>co
A: <p><a href="..."><img src="..." /></a></p><p>content</p>
B: <img src="..." /><p>content</p>
C: <div><p><img src="..." /></p></div><div><p>content</p></div>
D: <div><img src="..." /></div><div><p>content</p></div>
E: <div><p><a href="..."><img src="..." /></a>content</p></div>
及
若要查找图像的所有前辈和祖先,请分别查找无内容图像的所有前辈和祖先
HtmlNode firstImageInArticle = rawContentModifier.DocumentNode.SelectSingleNode("(//img)[1]");
if (firstImageInArticle == null) { // there are no images in this article - return HTML as is.
return html;
}
// count all nodes and all empty nodes preceding the first image in the article
HtmlNodeCollection allNodesPrecedingFirstImage = rawContentModifier.DocumentNode.SelectNodes("(//img)[1]/preceding::*|(//img)[1]/ancestor::*/@id");
HtmlNodeCollection allEmptyNodesPrecedingFirstImage = rawContentModifier.DocumentNode.SelectNodes("(//img)[1]/preceding::*[not(text())]|(//img)[1]/ancestor::*[not(text())]/@id");
// if there are no nodes preceding the first image, we have no content prior to the image and therefore should delete it
if (allEmptyNodesPrecedingFirstImage == null || allNodesPrecedingFirstImage == null) {
HtmlNode nodeToDelete = rawContentModifier.DocumentNode.SelectSingleNode(firstImageInArticle.XPath);
// keep moving up the chain until we find the topmost parent that doesn't have any other children
while (nodeToDelete.ParentNode.ChildNodes.Count == 1)
{
nodeToDelete = nodeToDelete.ParentNode;
}
nodeToDelete.Remove();
html = rawContentModifier.DocumentNode.InnerHtml;
return html;
}
// if the number of empty nodes preceding the first image is equal to all nodes preceding the first image...
// then we know that we basically have no content prior to the image and therefore can delete the first image
if (allNodesPrecedingFirstImage.Count == allEmptyNodesPrecedingFirstImage.Count) {
HtmlNode nodeToDelete = rawContentModifier.DocumentNode.SelectSingleNode(firstImageInArticle.XPath);
// keep moving up the chain until we find the topmost parent that doesn't have any other children
while (nodeToDelete.ParentNode.ChildNodes.Count == 1)
{
nodeToDelete = nodeToDelete.ParentNode;
}
nodeToDelete.Remove();
html = rawContentModifier.DocumentNode.InnerHtml;
return html;
}
编辑以添加示例E,该示例在我的代码中失败,因为p标记包含图像后面的内容,但不是自包含的
(//img)[1]/preceding::*[not(text())]|(//img)[1]/ancestor::*[not(text())]/@id"
HtmlNode firstImageInArticle = rawContentModifier.DocumentNode.SelectSingleNode("(//img)[1]");
if (firstImageInArticle == null) { // there are no images in this article - return HTML as is.
return html;
}
// count all nodes and all empty nodes preceding the first image in the article
HtmlNodeCollection allNodesPrecedingFirstImage = rawContentModifier.DocumentNode.SelectNodes("(//img)[1]/preceding::*|(//img)[1]/ancestor::*/@id");
HtmlNodeCollection allEmptyNodesPrecedingFirstImage = rawContentModifier.DocumentNode.SelectNodes("(//img)[1]/preceding::*[not(text())]|(//img)[1]/ancestor::*[not(text())]/@id");
// if there are no nodes preceding the first image, we have no content prior to the image and therefore should delete it
if (allEmptyNodesPrecedingFirstImage == null || allNodesPrecedingFirstImage == null) {
HtmlNode nodeToDelete = rawContentModifier.DocumentNode.SelectSingleNode(firstImageInArticle.XPath);
// keep moving up the chain until we find the topmost parent that doesn't have any other children
while (nodeToDelete.ParentNode.ChildNodes.Count == 1)
{
nodeToDelete = nodeToDelete.ParentNode;
}
nodeToDelete.Remove();
html = rawContentModifier.DocumentNode.InnerHtml;
return html;
}
// if the number of empty nodes preceding the first image is equal to all nodes preceding the first image...
// then we know that we basically have no content prior to the image and therefore can delete the first image
if (allNodesPrecedingFirstImage.Count == allEmptyNodesPrecedingFirstImage.Count) {
HtmlNode nodeToDelete = rawContentModifier.DocumentNode.SelectSingleNode(firstImageInArticle.XPath);
// keep moving up the chain until we find the topmost parent that doesn't have any other children
while (nodeToDelete.ParentNode.ChildNodes.Count == 1)
{
nodeToDelete = nodeToDelete.ParentNode;
}
nodeToDelete.Remove();
html = rawContentModifier.DocumentNode.InnerHtml;
return html;
}