C# HtmlAgilityPack:xpath和regex

C# HtmlAgilityPack:xpath和regex,c#,regex,html-agility-pack,C#,Regex,Html Agility Pack,我目前正在使用HtmlAgilityPack通过xpath查询搜索某些内容。大概是这样的: var col = doc.DocumentNode.SelectNodes("//*[text()[contains(., 'foo'] or @*.... 现在,我想使用正则表达式在所有html源代码(=文本、标记和属性)中搜索特定内容。如何使用HtmlAgilityPack实现这一点?HtmlAgilityPack能否处理xpath+regex,或者使用regex和HtmlAgilityPack进

我目前正在使用HtmlAgilityPack通过xpath查询搜索某些内容。大概是这样的:

var col = doc.DocumentNode.SelectNodes("//*[text()[contains(., 'foo')] or @*....

现在,我想使用正则表达式在所有html源代码(=文本、标记和属性)中搜索特定内容。如何使用HtmlAgilityPack实现这一点?HtmlAgilityPack能否处理xpath+regex,或者使用regex和HtmlAgilityPack进行搜索的最佳方式是什么?

Html Agility Pack使用底层的.NET xpath实现来支持xpath。幸运的是.NET中的XPATH是完全可扩展的(顺便说一句:微软没有在这项卓越的技术上投入更多的资金真是太遗憾了…)

假设我有一个html:

<div>hello</div>
<div>hallo</div>
例如,可以用 //div[regex-is-match(text(), 'h.llo')] 这样的XPath查询,通过下面的SelectNodes方法同时匹配这两个div。它之所以有效,是因为我使用了一个特殊的Xslt/XPath上下文,并在其中定义了一个名为“regex-is-match”的新XPath函数。以下是SelectNodes实用程序代码:

/// <summary>
/// Evaluates an XPath expression against <paramref name="navigator"/> using an
/// <c>HtmlXsltContext</c>, so that the custom XPath functions resolved by that
/// context can be used inside <paramref name="xpath"/>.
/// </summary>
/// <param name="navigator">The navigator to evaluate the expression on. Must not be null.</param>
/// <param name="xpath">The XPath expression to compile and evaluate.</param>
/// <returns>
/// A lazily evaluated sequence of the HTML nodes selected by the expression.
/// The sequence is empty when the expression does not evaluate to a node-set.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="navigator"/> is null.</exception>
public static IEnumerable<HtmlNode> SelectNodes(HtmlNodeNavigator navigator, string xpath)
{
    // Validate here rather than inside the iterator: an iterator body does not run
    // until the first MoveNext(), which would postpone this exception to the point
    // where the caller starts enumerating the result.
    if (navigator == null)
        throw new ArgumentNullException("navigator");

    return SelectNodesIterator(navigator, xpath);
}

// Lazily compiles and evaluates the expression, yielding the HTML node behind each result.
private static IEnumerable<HtmlNode> SelectNodesIterator(HtmlNodeNavigator navigator, string xpath)
{
    XPathExpression expr = navigator.Compile(xpath);
    expr.SetContext(new HtmlXsltContext());

    // Evaluate() returns an iterator only for node-set results; scalar results
    // (string, number, boolean) produce an empty sequence.
    XPathNodeIterator it = navigator.Evaluate(expr) as XPathNodeIterator;
    if (it == null)
        yield break;

    while (it.MoveNext())
    {
        HtmlNodeNavigator n = it.Current as HtmlNodeNavigator;
        if (n != null && n.CurrentNode != null)
        {
            yield return n.CurrentNode;
        }
    }
}
公共静态IEnumerable SelectNodes(HtmlNodeNavigator,字符串xpath)
{
如果(导航器==null)
抛出新的ArgumentNullException(“导航器”);
XPathExpression expr=navigator.Compile(xpath);
expr.SetContext(新的HtmlXsltContext());
objecteval=navigator.Evaluate(expr);
XPathNodeIterator it=eval为XPathNodeIterator;
如果(it!=null)
{
while(it.MoveNext())
{
HtmlNodeNavigator n=it.Current作为HtmlNodeNavigator;
如果(n!=null&&n.CurrentNode!=null)
{
收益率返回n.CurrentNode;
}
}
}
}
以下是支持代码:

    /// <summary>
    /// XSLT/XPath context that resolves custom XPath extension functions through
    /// <c>HtmlXsltFunction.GetBuiltIn</c>. Attach it to a compiled
    /// <see cref="XPathExpression"/> with <c>SetContext</c> before evaluating.
    /// </summary>
    public class HtmlXsltContext : XsltContext
    {
        /// <summary>Creates the context with a fresh, private name table.</summary>
        public HtmlXsltContext()
            : base(new NameTable())
        {
        }

        /// <summary>
        /// Compares the base URIs of two documents. This context does not order
        /// documents, so all documents are reported as equal (0) instead of
        /// throwing if the XPath engine calls this method.
        /// </summary>
        public override int CompareDocument(string baseUri, string nextbaseUri)
        {
            return 0;
        }

        /// <summary>
        /// Reports whether white space under <paramref name="node"/> is preserved.
        /// Always true: this context never asks for white space to be stripped,
        /// which matches <see cref="Whitespace"/> returning true.
        /// </summary>
        public override bool PreserveWhitespace(XPathNavigator node)
        {
            return true;
        }

        /// <summary>
        /// Factory hook for extension functions; override to supply additional functions.
        /// The default implementation delegates to <c>HtmlXsltFunction.GetBuiltIn</c>.
        /// </summary>
        protected virtual IXsltContextFunction CreateHtmlXsltFunction(string prefix, string name, XPathResultType[] ArgTypes)
        {
            return HtmlXsltFunction.GetBuiltIn(this, prefix, name, ArgTypes);
        }

        /// <summary>
        /// Resolves a function reference found in an XPath expression by forwarding
        /// to <see cref="CreateHtmlXsltFunction"/>.
        /// </summary>
        public override IXsltContextFunction ResolveFunction(string prefix, string name, XPathResultType[] ArgTypes)
        {
            return CreateHtmlXsltFunction(prefix, name, ArgTypes);
        }

        /// <summary>
        /// XPath variables are not supported by this context.
        /// </summary>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        public override IXsltContextVariable ResolveVariable(string prefix, string name)
        {
            throw new NotImplementedException();
        }

        /// <summary>Always true: white space nodes are included.</summary>
        public override bool Whitespace
        {
            get { return true; }
        }
    }

    /// <summary>
    /// Base class for custom XPath extension functions resolved by
    /// <c>HtmlXsltContext</c>. Also hosts the built-in function lookup
    /// (<see cref="GetBuiltIn"/>) and the argument-to-string conversion shared
    /// by the built-in functions.
    /// </summary>
    public abstract class HtmlXsltFunction : IXsltContextFunction
    {
        /// <summary>Stores the resolution data the function was created for.</summary>
        protected HtmlXsltFunction(HtmlXsltContext context, string prefix, string name, XPathResultType[] argTypes)
        {
            Context = context;
            Prefix = prefix;
            Name = name;
            ArgTypes = argTypes;
        }

        public HtmlXsltContext Context { get; private set; }
        public string Prefix { get; private set; }
        public string Name { get; private set; }
        public XPathResultType[] ArgTypes { get; private set; }

        /// <summary>Maximum argument count; defaults to <see cref="Minargs"/>.</summary>
        public virtual int Maxargs
        {
            get { return Minargs; }
        }

        /// <summary>Minimum argument count; defaults to 1.</summary>
        public virtual int Minargs
        {
            get { return 1; }
        }

        /// <summary>Declared result type; defaults to <see cref="XPathResultType.String"/>.</summary>
        public virtual XPathResultType ReturnType
        {
            get { return XPathResultType.String; }
        }

        /// <summary>Executes the function with the already evaluated arguments.</summary>
        public abstract object Invoke(XsltContext xsltContext, object[] args, XPathNavigator docContext);

        /// <summary>
        /// Looks up a built-in function by name.
        /// </summary>
        /// <returns>The function instance, or null when <paramref name="name"/> is not a built-in.</returns>
        public static IXsltContextFunction GetBuiltIn(HtmlXsltContext context, string prefix, string name, XPathResultType[] argTypes)
        {
            if (name == "regex-is-match")
                return new RegexIsMatch(context, name);

            // TODO: create other functions here
            return null;
        }

        /// <summary>
        /// Converts an evaluated XPath argument to a string.
        /// </summary>
        /// <param name="argument">
        /// A string (returned as is), an <see cref="XPathNodeIterator"/>, any other
        /// <see cref="IEnumerable"/> (converted element by element), or a scalar value.
        /// </param>
        /// <param name="outer">
        /// For HTML nodes, true selects <c>OuterHtml</c> and false selects <c>InnerHtml</c>.
        /// Ignored for attribute nodes, which always contribute their value.
        /// </param>
        /// <param name="separator">Text inserted between consecutive parts; null for none.</param>
        /// <returns>
        /// The converted text, or null when <paramref name="argument"/> is null,
        /// an empty node-set, or an empty enumerable.
        /// </returns>
        public static string ConvertToString(object argument, bool outer, string separator)
        {
            if (argument == null)
                return null;

            string s = argument as string;
            if (s != null)
                return s;

            XPathNodeIterator it = argument as XPathNodeIterator;
            if (it != null)
            {
                if (!it.MoveNext())
                    return null;

                StringBuilder sb = new StringBuilder();
                do
                {
                    string nodeText = GetNodeText(it.Current, outer);
                    if (nodeText != null)
                    {
                        if (sb.Length > 0 && separator != null)
                        {
                            sb.Append(separator);
                        }

                        sb.Append(nodeText);
                    }
                }
                while (it.MoveNext());
                return sb.ToString();
            }

            IEnumerable enumerable = argument as IEnumerable;
            if (enumerable != null)
            {
                StringBuilder sb = null;
                foreach (object arg in enumerable)
                {
                    if (sb == null)
                    {
                        sb = new StringBuilder();
                    }

                    // Convert first so that items yielding no text do not
                    // leave a dangling separator behind.
                    string s2 = ConvertToString(arg, outer, separator);
                    if (s2 == null)
                        continue;

                    if (sb.Length > 0 && separator != null)
                    {
                        sb.Append(separator);
                    }

                    sb.Append(s2);
                }
                return sb != null ? sb.ToString() : null;
            }

            // Scalars (XPath numbers, booleans): format culture-independently so the
            // text does not change with the current thread culture.
            return Convert.ToString(argument, System.Globalization.CultureInfo.InvariantCulture);
        }

        // Returns the text contributed by one node of a node-set, or null when the
        // node is not backed by an HTML node.
        private static string GetNodeText(XPathNavigator current, bool outer)
        {
            if (current == null)
                return null;

            // NOTE(review): when an HtmlNodeNavigator is positioned on an attribute,
            // CurrentNode appears to still be the owning element, so its markup would
            // be matched instead of the attribute value - confirm against the
            // HtmlAgilityPack version in use.
            if (current.NodeType == XPathNodeType.Attribute)
                return current.Value;

            HtmlNodeNavigator n = current as HtmlNodeNavigator;
            if (n == null || n.CurrentNode == null)
                return null;

            return outer ? n.CurrentNode.OuterHtml : n.CurrentNode.InnerHtml;
        }

        /// <summary>
        /// XPath function <c>regex-is-match(input, pattern)</c>: true when the text of
        /// <c>input</c> matches the regular expression <c>pattern</c>.
        /// </summary>
        public class RegexIsMatch : HtmlXsltFunction
        {
            public RegexIsMatch(HtmlXsltContext context, string name)
                : base(context, null, name, null)
            {
            }

            public override XPathResultType ReturnType { get { return XPathResultType.Boolean; } }
            public override int Minargs { get { return 2; } }

            /// <summary>
            /// Matches <c>args[0]</c> (input) against <c>args[1]</c> (pattern).
            /// Returns false when fewer than two arguments are supplied or when either
            /// argument converts to no text (for example an empty node-set).
            /// </summary>
            public override object Invoke(XsltContext xsltContext, object[] args, XPathNavigator docContext)
            {
                if (args == null || args.Length < 2)
                    return false;

                string input = ConvertToString(args[0], false, null);
                string pattern = ConvertToString(args[1], false, null);

                // Regex.IsMatch throws ArgumentNullException for a null input or
                // pattern; a node with nothing to match is simply "no match".
                if (input == null || pattern == null)
                    return false;

                return Regex.IsMatch(input, pattern);
            }
        }
    }
公共类HtmlXsltContext:XsltContext
{
公共HtmlXsltContext()
:base(新名称表())
{
}
public override int CompareDocument(字符串baseUri、字符串nextbaseUri)
{
抛出新的NotImplementedException();
}
公共覆盖布尔保留空白(XPathNavigator节点)
{
抛出新的NotImplementedException();
}
受保护的虚拟IXsltContextFunction CreateHtmlXsltFunction(字符串前缀、字符串名称、XPathResultType[]ArgTypes)
{
返回HtmlXsltFunction.GetBuiltIn(this、前缀、名称、ArgTypes);
}
公共重写IXsltContextFunction ResolveFunction(字符串前缀、字符串名称、XPathResultType[]ArgTypes)
{
返回CreateHtmlXsltFunction(前缀、名称、参数类型);
}
公共覆盖IXsltContextVariable ResolveVariable(字符串前缀,字符串名称)
{
抛出新的NotImplementedException();
}
公共覆盖布尔空格
{
获取{return true;}
}
}
公共抽象类HtmlXsltFunction:IXsltContextFunction
{
受保护的HtmlXsltFunction(HtmlXsltContext上下文、字符串前缀、字符串名称、XPathResultType[]argTypes)
{
上下文=上下文;
前缀=前缀;
名称=名称;
ArgTypes=ArgTypes;
}
公共HtmlXsltContext上下文{get;private set;}
公共字符串前缀{get;private set;}
公共字符串名称{get;private set;}
公共XPathResultType[]ArgTypes{get;private set;}
公共虚拟整数Maxargs
{
获取{return Minargs;}
}
公共虚拟内塔
{
获取{return 1;}
}
公共虚拟XPathResultType ReturnType
{
获取{返回XPathResultType.String;}
}
公共抽象对象调用(XsltContext XsltContext、对象[]参数、XPathNavigator docContext);
公共静态IXsltContextFunction GetBuiltIn(HtmlXsltContext上下文、字符串前缀、字符串名称、XPathResultType[]argTypes)
{
if(name==“regex是匹配的”)
返回新的RegexIsMatch(上下文、名称);
//TODO:在此处创建其他函数
返回null;
}
公共静态字符串ConvertToString(对象参数、布尔外部、字符串分隔符)
{
if(参数==null)
返回null;
字符串s=参数为字符串;
如果(s!=null)
返回s;
XPathNodeIterator it=作为XPathNodeIterator的参数;
如果(it!=null)
{
如果(!it.MoveNext())
返回null;
StringBuilder sb=新的StringBuilder();
做
{
HtmlNodeNavigator n=it.Current作为HtmlNodeNavigator;
如果(n!=null&&n.CurrentNode!=null)
{
如果(sb.Length>0&&separator!=null)
{
附加(分隔符);
}
sb.Append(outer?n.CurrentNode.OuterHtml:n.CurrentNode.InnerHtml);
}
}
while(it.MoveNext());
使某人返回字符串();
}
IEnumerable enumerable=参数为IEnumerable;
if(可枚举!=null)
{
StringBuilder sb=null;
foreach(可枚举中的对象参数)
{
如果(sb==null)
{
sb=新的StringBuilder();
}
如果(sb.Length>0&&separator!=null)
{
附加(分隔符);
}
字符串s2=ConvertToString(arg,外部,分隔符);
如果(s2!=null)
{
sb.追加(s2);
}
}
返回sb!=null?sb.ToString():null;
}
返回string.Format(“{0}”,参数);
}
公共类RegexIsMatch:HtmlXsltFunction
{
公共注册表匹配(HtmlXsltContex)
    /// <summary>
    /// XSLT/XPath context that resolves custom XPath extension functions through
    /// <c>HtmlXsltFunction.GetBuiltIn</c>. Attach it to a compiled
    /// <see cref="XPathExpression"/> with <c>SetContext</c> before evaluating.
    /// </summary>
    public class HtmlXsltContext : XsltContext
    {
        /// <summary>Creates the context with a fresh, private name table.</summary>
        public HtmlXsltContext()
            : base(new NameTable())
        {
        }

        /// <summary>
        /// Compares the base URIs of two documents. This context does not order
        /// documents, so all documents are reported as equal (0) instead of
        /// throwing if the XPath engine calls this method.
        /// </summary>
        public override int CompareDocument(string baseUri, string nextbaseUri)
        {
            return 0;
        }

        /// <summary>
        /// Reports whether white space under <paramref name="node"/> is preserved.
        /// Always true: this context never asks for white space to be stripped,
        /// which matches <see cref="Whitespace"/> returning true.
        /// </summary>
        public override bool PreserveWhitespace(XPathNavigator node)
        {
            return true;
        }

        /// <summary>
        /// Factory hook for extension functions; override to supply additional functions.
        /// The default implementation delegates to <c>HtmlXsltFunction.GetBuiltIn</c>.
        /// </summary>
        protected virtual IXsltContextFunction CreateHtmlXsltFunction(string prefix, string name, XPathResultType[] ArgTypes)
        {
            return HtmlXsltFunction.GetBuiltIn(this, prefix, name, ArgTypes);
        }

        /// <summary>
        /// Resolves a function reference found in an XPath expression by forwarding
        /// to <see cref="CreateHtmlXsltFunction"/>.
        /// </summary>
        public override IXsltContextFunction ResolveFunction(string prefix, string name, XPathResultType[] ArgTypes)
        {
            return CreateHtmlXsltFunction(prefix, name, ArgTypes);
        }

        /// <summary>
        /// XPath variables are not supported by this context.
        /// </summary>
        /// <exception cref="NotImplementedException">Always thrown.</exception>
        public override IXsltContextVariable ResolveVariable(string prefix, string name)
        {
            throw new NotImplementedException();
        }

        /// <summary>Always true: white space nodes are included.</summary>
        public override bool Whitespace
        {
            get { return true; }
        }
    }

    /// <summary>
    /// Base class for custom XPath extension functions resolved by
    /// <c>HtmlXsltContext</c>. Also hosts the built-in function lookup
    /// (<see cref="GetBuiltIn"/>) and the argument-to-string conversion shared
    /// by the built-in functions.
    /// </summary>
    public abstract class HtmlXsltFunction : IXsltContextFunction
    {
        /// <summary>Stores the resolution data the function was created for.</summary>
        protected HtmlXsltFunction(HtmlXsltContext context, string prefix, string name, XPathResultType[] argTypes)
        {
            Context = context;
            Prefix = prefix;
            Name = name;
            ArgTypes = argTypes;
        }

        public HtmlXsltContext Context { get; private set; }
        public string Prefix { get; private set; }
        public string Name { get; private set; }
        public XPathResultType[] ArgTypes { get; private set; }

        /// <summary>Maximum argument count; defaults to <see cref="Minargs"/>.</summary>
        public virtual int Maxargs
        {
            get { return Minargs; }
        }

        /// <summary>Minimum argument count; defaults to 1.</summary>
        public virtual int Minargs
        {
            get { return 1; }
        }

        /// <summary>Declared result type; defaults to <see cref="XPathResultType.String"/>.</summary>
        public virtual XPathResultType ReturnType
        {
            get { return XPathResultType.String; }
        }

        /// <summary>Executes the function with the already evaluated arguments.</summary>
        public abstract object Invoke(XsltContext xsltContext, object[] args, XPathNavigator docContext);

        /// <summary>
        /// Looks up a built-in function by name.
        /// </summary>
        /// <returns>The function instance, or null when <paramref name="name"/> is not a built-in.</returns>
        public static IXsltContextFunction GetBuiltIn(HtmlXsltContext context, string prefix, string name, XPathResultType[] argTypes)
        {
            if (name == "regex-is-match")
                return new RegexIsMatch(context, name);

            // TODO: create other functions here
            return null;
        }

        /// <summary>
        /// Converts an evaluated XPath argument to a string.
        /// </summary>
        /// <param name="argument">
        /// A string (returned as is), an <see cref="XPathNodeIterator"/>, any other
        /// <see cref="IEnumerable"/> (converted element by element), or a scalar value.
        /// </param>
        /// <param name="outer">
        /// For HTML nodes, true selects <c>OuterHtml</c> and false selects <c>InnerHtml</c>.
        /// Ignored for attribute nodes, which always contribute their value.
        /// </param>
        /// <param name="separator">Text inserted between consecutive parts; null for none.</param>
        /// <returns>
        /// The converted text, or null when <paramref name="argument"/> is null,
        /// an empty node-set, or an empty enumerable.
        /// </returns>
        public static string ConvertToString(object argument, bool outer, string separator)
        {
            if (argument == null)
                return null;

            string s = argument as string;
            if (s != null)
                return s;

            XPathNodeIterator it = argument as XPathNodeIterator;
            if (it != null)
            {
                if (!it.MoveNext())
                    return null;

                StringBuilder sb = new StringBuilder();
                do
                {
                    string nodeText = GetNodeText(it.Current, outer);
                    if (nodeText != null)
                    {
                        if (sb.Length > 0 && separator != null)
                        {
                            sb.Append(separator);
                        }

                        sb.Append(nodeText);
                    }
                }
                while (it.MoveNext());
                return sb.ToString();
            }

            IEnumerable enumerable = argument as IEnumerable;
            if (enumerable != null)
            {
                StringBuilder sb = null;
                foreach (object arg in enumerable)
                {
                    if (sb == null)
                    {
                        sb = new StringBuilder();
                    }

                    // Convert first so that items yielding no text do not
                    // leave a dangling separator behind.
                    string s2 = ConvertToString(arg, outer, separator);
                    if (s2 == null)
                        continue;

                    if (sb.Length > 0 && separator != null)
                    {
                        sb.Append(separator);
                    }

                    sb.Append(s2);
                }
                return sb != null ? sb.ToString() : null;
            }

            // Scalars (XPath numbers, booleans): format culture-independently so the
            // text does not change with the current thread culture.
            return Convert.ToString(argument, System.Globalization.CultureInfo.InvariantCulture);
        }

        // Returns the text contributed by one node of a node-set, or null when the
        // node is not backed by an HTML node.
        private static string GetNodeText(XPathNavigator current, bool outer)
        {
            if (current == null)
                return null;

            // NOTE(review): when an HtmlNodeNavigator is positioned on an attribute,
            // CurrentNode appears to still be the owning element, so its markup would
            // be matched instead of the attribute value - confirm against the
            // HtmlAgilityPack version in use.
            if (current.NodeType == XPathNodeType.Attribute)
                return current.Value;

            HtmlNodeNavigator n = current as HtmlNodeNavigator;
            if (n == null || n.CurrentNode == null)
                return null;

            return outer ? n.CurrentNode.OuterHtml : n.CurrentNode.InnerHtml;
        }

        /// <summary>
        /// XPath function <c>regex-is-match(input, pattern)</c>: true when the text of
        /// <c>input</c> matches the regular expression <c>pattern</c>.
        /// </summary>
        public class RegexIsMatch : HtmlXsltFunction
        {
            public RegexIsMatch(HtmlXsltContext context, string name)
                : base(context, null, name, null)
            {
            }

            public override XPathResultType ReturnType { get { return XPathResultType.Boolean; } }
            public override int Minargs { get { return 2; } }

            /// <summary>
            /// Matches <c>args[0]</c> (input) against <c>args[1]</c> (pattern).
            /// Returns false when fewer than two arguments are supplied or when either
            /// argument converts to no text (for example an empty node-set).
            /// </summary>
            public override object Invoke(XsltContext xsltContext, object[] args, XPathNavigator docContext)
            {
                if (args == null || args.Length < 2)
                    return false;

                string input = ConvertToString(args[0], false, null);
                string pattern = ConvertToString(args[1], false, null);

                // Regex.IsMatch throws ArgumentNullException for a null input or
                // pattern; a node with nothing to match is simply "no match".
                if (input == null || pattern == null)
                    return false;

                return Regex.IsMatch(input, pattern);
            }
        }
    }