C# Abot爬虫忽略爬虫页面HttpWebRequest/Response_C#_Web Crawler_Httprequest_Httpresponse_Cefsharp

C# Abot爬虫忽略爬虫页面HttpWebRequest/Response

c# web-crawler

C# Abot爬虫忽略爬虫页面HttpWebRequest/Response,c#,web-crawler,httprequest,httpresponse,cefsharp,C#,Web Crawler,Httprequest,Httpresponse,Cefsharp,我使用Abot的方式是，我有一个WPF应用程序，它显示一个浏览器控件（CefSharp）。用户登录，站点使用的任何可能的自定义身份验证都将在爬行时工作，就像用户实际浏览站点一样。因此，当我爬网时，我想使用这个浏览器控件发出请求并简单地返回页面数据。因此，我实现了我的定制页面请求程序，完整清单如下。问题在于，与其他浏览器控件一样，CefSharp无法获取与爬网页面相关联的HttpWebRequest/Response。如果不设置这两个属性，Abot将不会继续爬网。我能做些什么来回避这个问题吗？

我使用Abot的方式是，我有一个WPF应用程序，它显示一个浏览器控件（CefSharp）。用户登录，站点使用的任何可能的自定义身份验证都将在爬行时工作，就像用户实际浏览站点一样

因此，当我爬网时，我想使用这个浏览器控件发出请求并简单地返回页面数据。因此，我实现了我的定制页面请求程序，完整清单如下

问题在于，与其他浏览器控件一样，CefSharp无法获取与爬网页面相关联的HttpWebRequest/Response。如果不设置这两个属性，Abot将不会继续爬网

我能做些什么来回避这个问题吗？

代码列表：

using Abot.Core;
using Abot.Poco;
using CefSharp.Wpf;
using System;
using System.Net;
using System.Text;
using System.Threading;

/// <summary>
/// An <see cref="IPageRequester"/> that loads each page by navigating a CefSharp
/// browser control (through the data context its address is bound to) and returns
/// the rendered main-frame source as the crawled page content.
/// </summary>
/// <remarks>
/// NOTE(review): a single browser control and a single pair of state fields are
/// shared by every call, so concurrent calls to <c>MakeRequest</c> would interfere
/// with each other — confirm the crawler is configured to request one page at a time.
/// </remarks>
public class CefPageRequester : IPageRequester
{
    // Interval, in milliseconds, between checks of the navigation-completed flag.
    private const int PollIntervalMilliseconds = 10;

    private readonly MainWindowDataContext DataContext;
    private readonly ChromiumWebBrowser ChromiumWebBrowser;
    private readonly CrawlConfiguration CrawlConfig;

    // Set by the FrameLoadEnd continuation once the page source has been captured.
    // Volatile so the polling crawler thread observes the write (and, with it, the
    // preceding write to _pageSource).
    private volatile bool _navigationCompleted;
    private string _pageSource;

    /// <summary>
    /// Creates the requester and subscribes to the browser's <c>FrameLoadEnd</c> event.
    /// </summary>
    /// <param name="dataContext">Data context whose <c>Address</c> drives the browser control.</param>
    /// <param name="chromiumWebBrowser">Browser control used to load pages and read their source.</param>
    /// <param name="crawlConfig">Crawl configuration; its HTTP request timeout bounds each navigation.</param>
    public CefPageRequester(MainWindowDataContext dataContext, ChromiumWebBrowser chromiumWebBrowser, CrawlConfiguration crawlConfig)
    {
        this.DataContext = dataContext;
        this.ChromiumWebBrowser = chromiumWebBrowser;
        this.CrawlConfig = crawlConfig;

        this.ChromiumWebBrowser.FrameLoadEnd += ChromiumWebBrowser_FrameLoadEnd;
    }

    /// <summary>
    /// Loads <paramref name="uri"/> in the browser, always allowing the content download.
    /// </summary>
    /// <param name="uri">Absolute address of the page to load.</param>
    /// <returns>The crawled page, with content set when the browser produced any source.</returns>
    public CrawledPage MakeRequest(Uri uri)
    {
        return this.MakeRequest(uri, cp => new CrawlDecision() { Allow = true });
    }

    /// <summary>
    /// Loads <paramref name="uri"/> in the browser and waits for the main frame to finish loading.
    /// </summary>
    /// <param name="uri">Absolute address of the page to load.</param>
    /// <param name="shouldDownloadContent">
    /// Accepted for interface compatibility; it is not consulted because the browser
    /// always downloads the page as part of navigating to it.
    /// </param>
    /// <returns>
    /// The crawled page. Any failure (including a timeout) is reported through
    /// <c>CrawledPage.WebException</c> rather than thrown.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
    public CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
    {
        if (uri == null)
            throw new ArgumentNullException("uri");

        CrawledPage crawledPage = new CrawledPage(uri);

        // Start from a clean slate: a FrameLoadEnd raised between two requests
        // (redirect, user navigation) must not be mistaken for this request's completion.
        _navigationCompleted = false;
        _pageSource = null;

        crawledPage.RequestStarted = DateTime.Now;
        crawledPage.DownloadContentStarted = crawledPage.RequestStarted;

        try
        {
            //the browser control is bound to the address of the data context, 
            //if we set the address directly it breaks for some reason, although it's a two way binding.
            // AbsoluteUri (not AbsolutePath): the browser needs scheme and host, not just the path.
            this.DataContext.Address = uri.AbsoluteUri;

            this.WaitForNavigation(uri);
        }
        catch (WebException e)
        {
            crawledPage.WebException = e;
        }
        catch (Exception e)
        {
            // Surface unexpected failures on the page instead of silently dropping them.
            crawledPage.WebException = new WebException(e.Message, e);
        }
        finally
        {
            //TODO must add these properties!!
            //crawledPage.HttpWebRequest = request;
            //crawledPage.HttpWebResponse = response;
            crawledPage.RequestCompleted = DateTime.Now;
            crawledPage.DownloadContentCompleted = crawledPage.RequestCompleted;
            if (!String.IsNullOrWhiteSpace(_pageSource))
                crawledPage.Content = this.GetContent("UTF-8", _pageSource);

            _navigationCompleted = false;
            _pageSource = null;
        }

        return crawledPage;
    }

    /// <summary>
    /// Blocks the calling thread until the page source has been captured or the
    /// configured HTTP request timeout elapses.
    /// </summary>
    /// <exception cref="WebException">The navigation did not complete in time.</exception>
    private void WaitForNavigation(Uri uri)
    {
        // A missing configuration or a non-positive timeout means "wait indefinitely".
        int timeoutSeconds = this.CrawlConfig != null ? this.CrawlConfig.HttpRequestTimeoutInSeconds : 0;
        DateTime deadline = timeoutSeconds > 0
            ? DateTime.UtcNow.AddSeconds(timeoutSeconds)
            : DateTime.MaxValue;

        while (!_navigationCompleted)
        {
            if (DateTime.UtcNow >= deadline)
                throw new WebException(
                    "Timed out waiting for the browser to load " + uri.AbsoluteUri,
                    WebExceptionStatus.Timeout);

            Thread.CurrentThread.Join(PollIntervalMilliseconds);
        }
    }

    /// <summary>
    /// Captures the page source once the main frame has finished loading and
    /// signals the waiting <c>MakeRequest</c> call.
    /// </summary>
    private void ChromiumWebBrowser_FrameLoadEnd(object sender, CefSharp.FrameLoadEndEventArgs e)
    {
        if (!e.IsMainFrame)
            return;

        this.ChromiumWebBrowser.Dispatcher.BeginInvoke(
            (Action)(() =>
            {
                // Do not block the UI thread on .Result; finish in a continuation instead.
                this.ChromiumWebBrowser.GetSourceAsync().ContinueWith(sourceTask =>
                {
                    bool succeeded = !sourceTask.IsFaulted && !sourceTask.IsCanceled;
                    if (sourceTask.IsFaulted)
                    {
                        // Observe the exception so it is not reported as unobserved later.
                        Exception observed = sourceTask.Exception;
                    }

                    // Write the source before the volatile flag so the reader sees both.
                    _pageSource = succeeded ? sourceTask.Result : null;
                    _navigationCompleted = true;
                });
            }));
    }

    /// <summary>
    /// Builds the page content (charset, encoding, text and encoded bytes) for <paramref name="html"/>.
    /// </summary>
    private PageContent GetContent(string charset, string html)
    {
        PageContent pageContent = new PageContent();
        pageContent.Charset = charset;
        pageContent.Encoding = this.GetEncoding(charset);
        pageContent.Text = html;
        pageContent.Bytes = pageContent.Encoding.GetBytes(html);

        return pageContent;
    }

    /// <summary>
    /// Resolves <paramref name="charset"/> to an encoding, falling back to UTF-8
    /// when the name is null or not recognised.
    /// </summary>
    private Encoding GetEncoding(string charset)
    {
        Encoding e = Encoding.UTF8;
        if (charset != null)
        {
            try
            {
                e = Encoding.GetEncoding(charset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name: keep the UTF-8 default.
            }
        }

        return e;
    }
}

使用Abot.Core；
使用Abot.Poco；
使用CefSharp.Wpf；
使用制度；
Net系统；
使用系统文本；
使用系统线程；
公共类请求者：IPageRequester
{
私有MainWindowDataContext DataContext；
专用ChromiumWebBrowser ChromiumWebBrowser；
私有爬网配置爬网配置；
私有易失性bool_导航完成；
私有字符串pageSource；
公共页面请求程序（MainWindowDataContext dataContext、ChromiumWebBrowser ChromiumWebBrowser、爬行配置爬行配置）
{
this.DataContext=DataContext；
this.ChromiumWebBrowser=ChromiumWebBrowser；
this.CrawlConfig=CrawlConfig；
this.ChromiumWebBrowser.FrameLoadEnd+=ChromiumWebBrowser\u FrameLoadEnd；
}
公共爬网页面生成请求（Uri）
{
返回这个.MakeRequest（uri，cp=>newcrawldecision（）{Allow=true}）；
}
公共爬网页面生成请求（Uri，Func shouldldownloadcontent）
{
if（uri==null）
抛出新的ArgumentNullException（“uri”）；
CrawledPage CrawledPage=新的CrawledPage（uri）；
尝试
{
//浏览器控件绑定到数据上下文的地址，
//如果我们直接设置地址，它会因某种原因中断，尽管它是双向绑定。
this.DataContext.Address=uri.AbsolutePath；
crawledPage.requeststart=DateTime.Now；
crawledPage.DownloadContentStarted=crawledPage.RequestStarted；
而（！\u导航已完成）
Thread.CurrentThread.Join（10）；
}
捕获（WebE例外）
{
crawledPage.WebException=e；
}
抓住
{
//运气不好，我们应该记录下来。
}
最后
{
//TODO必须添加这些属性！！
//crawledPage.HttpWebRequest=请求；
//crawledPage.HttpWebResponse=响应；
crawledPage.RequestCompleted=DateTime.Now；
crawledPage.DownloadContentCompleted=crawledPage.RequestCompleted；
如果（！String.IsNullOrWhiteSpace（_pageSource））
crawledPage.Content=this.GetContent（“UTF-8”，_pageSource）；
_navigationCompleted=false；
_pageSource=null；
}
返回爬虫页面；
}
私有void ChromiumWebBrowser_FrameLoadEnd（对象发送方，CefSharp.FrameLoadEndEventArgs e）
{
如果（！e.IsMainFrame）
返回；
this.ChromiumWebBrowser.Dispatcher.BeginInvoke(
（行动）（（）=>
{
_pageSource=this.ChromiumWebBrowser.GetSourceAsync（）.Result；
_navigationCompleted=true；
}));
}
私有页面内容GetContent（字符串字符集、字符串html）
{
PageContent PageContent=新的PageContent（）；
pageContent.Charset=Charset；
pageContent.Encoding=this.GetEncoding（字符集）；
Text=html；
pageContent.Bytes=pageContent.Encoding.GetBytes（html）；
返回页面内容；
}
私有编码GetEncoding（字符串字符集）
{
编码e=Encoding.UTF8；
if（字符集！=null）
{
尝试
{
e=Encoding.GetEncoding（字符集）；
}
捕获{}
}
返回e；
}
}

这个问题也可以表述为：如何避免从流创建HttpWebResponse？鉴于MSDN，这似乎是不可能的：

您不应该直接创建 HttpWebResponse 类的实例，而应使用调用 HttpWebRequest.GetResponse 所返回的实例。

我必须实际发布请求以获得响应，这正是我希望通过使用web浏览器控件来避免的。

正如您所知，许多功能取决于是否设置了HttpWebRequest和HttpWebResponse。我为你列出了几个选项：

1）重构Abot以使用一些POCO抽象而不是那些类。然后有一个转换器将真实的HttpWebRequest和HttpWebResponse转换为这些POCO类型，还有一个转换器将浏览器对象响应转换为这些POCO类型

2）创建从.net类继承的CustomHttpWebRequest和CustomHttpWebResponse，以便您可以访问/覆盖公共/受保护属性，从而允许您手动创建一个实例，对浏览器组件返回给您的请求/响应进行建模。我知道这可能很棘手，但可能会奏效（我从来没有这样做过，所以我不能肯定）

3） [我讨厌这个想法。这应该是你最后的选择]创建这些类的真实实例，并使用反射设置需要设置的任何属性/值，以满足Abot的所有用法

4） [我更讨厌这个想法]使用MS Fakes创建HttpWebRequest和HttpWebResponse的属性和方法的垫片/存根/赝品。然后您可以将其配置为返回您的值。这个工具通常只用于测试，但我相信它可以是u