如何自动检索AJAX调用的URL?

如何自动检索到的URL AJAX调用?,ajax,web-crawler,scrapy,Ajax,Web Crawler,Scrapy,目的是规划一个爬行蜘蛛,它能够: 1) 检索此页面表中链接的URL: 2) 遵循所有这些URL的AJAX调用,找出包含我想要抓取的数据的最终(“AJAX”)URL 3) 刮取由AJAX URL标识的最终页面 到目前为止,我已经在Scrapy下写了两个蜘蛛: 1) 第一个从起始页上的链接检索URL。代码如下: from scrapy.spider import Spider from scrapy.selector import HtmlXPathSelector from co

目标是编写一个爬虫(spider),它能够:

1) 检索此页面表中链接的URL:

2) 遵循所有这些URL的AJAX调用,找出包含我想要抓取的数据的最终(“AJAX”)URL

3) 抓取由该AJAX URL指向的最终页面

到目前为止,我已经用Scrapy写了两个爬虫:

1) 第一个从起始页上的链接检索URL。代码如下:

   from scrapy.spider import Spider
   from scrapy.selector import HtmlXPathSelector
   from cordis.items import CordisItem

   class MySpider(Spider):
       """Spider that collects the project links from the CORDIS FP7 security page."""

       name = "Cordis1"
       allowed_domains = ["cordis.europa.eu"]
       start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

       def parse(self, response):
           """Return a single CordisItem whose "link" field lists every project href.

           The XPath starts with ``//`` and is therefore absolute: it matches
           the same nodes whatever element it is evaluated from.  Evaluating
           it once on the whole document is enough, and it keeps ``item``
           defined even when the page contains no ``<p>`` element.
           """
           hxs = HtmlXPathSelector(response)
           item = CordisItem()
           item["link"] = hxs.select("//ul/li/span/a/@href").extract()
           return item
from scrapy.spider import Spider
from scrapy.selector import Selector

class EssaiSpider(Spider):
    """Spider that prints selected fields of CORDIS project detail pages."""

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
    "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528"]

    def parse(self, response):
        """Extract the project fields from ``response`` and print them on one line.

        Every field is the list of strings returned by ``extract()`` for its
        XPath.  Participants are looked up by element id (``part1`` ..
        ``part40``); a participant that is absent from the page simply yields
        empty lists.  Nothing is returned: the method only prints.
        """
        sel = Selector(response)

        def texts(xpath):
            # Every field below is extracted the same way.
            return sel.xpath(xpath).extract()

        acronym = texts("//*[@class='projttl']/h1/text()")
        short_desc = texts("//*[@class='projttl']/h2/text()")
        start = texts("//*[@class='projdates']/b[1]/following::text()[1]")
        end = texts("//*[@class='projdates']/b[2]/following::text()[1]")
        long_desc = texts("//*[@class='tech']/p/text()")
        cost = texts("//*[@class='box-left']/b[3]/following::text()[1]")
        contrib = texts("//*[@class='box-left']/b[4]/following::text()[1]")
        # Named project_type so the builtin ``type`` is not shadowed.
        project_type = texts("//*[@class='box-right']/p[3]/br/following::text()[1]")
        sujet = texts("//*[@id='subjects']/h2/following::text()[1]")
        coord = texts("//*[@class='projcoord']/div[1]/div[1]/text()")
        coord_nat = texts("//*[@class='projcoord']/div[1]/div[2]/text()")

        # participant number -> (name, nationality).  Building the XPath from
        # the number guarantees that every participant reads its own element
        # id and writes its own slot.
        participants = {}
        for number in range(1, 41):
            block = "//*[@id='part%d']/div[1]" % number
            participants[number] = (
                texts(block + "/div[1]/text()"),
                texts(block + "/div[2]/text()"),
            )

        fields = [acronym, short_desc, start, end, long_desc, cost, contrib,
                  project_type, sujet, coord, coord_nat]
        # Only this subset of participants is printed.
        for number in (1, 2, 5, 10, 20, 30, 40):
            fields.extend(participants[number])

        # A single pre-joined argument prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print(" ".join(str(field) for field in fields))
2) 第二个从“AJAX”URL中抓取数据。代码如下:

   from scrapy.spider import Spider
   from scrapy.selector import HtmlXPathSelector
   from cordis.items import CordisItem

   class MySpider(Spider):
       """Spider that collects the project links from the CORDIS FP7 security page."""

       name = "Cordis1"
       allowed_domains = ["cordis.europa.eu"]
       start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

       def parse(self, response):
           """Return a single CordisItem whose "link" field lists every project href.

           The XPath starts with ``//`` and is therefore absolute: it matches
           the same nodes whatever element it is evaluated from.  Evaluating
           it once on the whole document is enough, and it keeps ``item``
           defined even when the page contains no ``<p>`` element.
           """
           hxs = HtmlXPathSelector(response)
           item = CordisItem()
           item["link"] = hxs.select("//ul/li/span/a/@href").extract()
           return item
from scrapy.spider import Spider
from scrapy.selector import Selector

class EssaiSpider(Spider):
    """Spider that prints selected fields of CORDIS project detail pages."""

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
    "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528"]

    def parse(self, response):
        """Extract the project fields from ``response`` and print them on one line.

        Every field is the list of strings returned by ``extract()`` for its
        XPath.  Participants are looked up by element id (``part1`` ..
        ``part40``); a participant that is absent from the page simply yields
        empty lists.  Nothing is returned: the method only prints.
        """
        sel = Selector(response)

        def texts(xpath):
            # Every field below is extracted the same way.
            return sel.xpath(xpath).extract()

        acronym = texts("//*[@class='projttl']/h1/text()")
        short_desc = texts("//*[@class='projttl']/h2/text()")
        start = texts("//*[@class='projdates']/b[1]/following::text()[1]")
        end = texts("//*[@class='projdates']/b[2]/following::text()[1]")
        long_desc = texts("//*[@class='tech']/p/text()")
        cost = texts("//*[@class='box-left']/b[3]/following::text()[1]")
        contrib = texts("//*[@class='box-left']/b[4]/following::text()[1]")
        # Named project_type so the builtin ``type`` is not shadowed.
        project_type = texts("//*[@class='box-right']/p[3]/br/following::text()[1]")
        sujet = texts("//*[@id='subjects']/h2/following::text()[1]")
        coord = texts("//*[@class='projcoord']/div[1]/div[1]/text()")
        coord_nat = texts("//*[@class='projcoord']/div[1]/div[2]/text()")

        # participant number -> (name, nationality).  Building the XPath from
        # the number guarantees that every participant reads its own element
        # id and writes its own slot.
        participants = {}
        for number in range(1, 41):
            block = "//*[@id='part%d']/div[1]" % number
            participants[number] = (
                texts(block + "/div[1]/text()"),
                texts(block + "/div[2]/text()"),
            )

        fields = [acronym, short_desc, start, end, long_desc, cost, contrib,
                  project_type, sujet, coord, coord_nat]
        # Only this subset of participants is printed.
        for number in (1, 2, 5, 10, 20, 30, 40):
            fields.extend(participants[number])

        # A single pre-joined argument prints the same text under the Python 2
        # print statement and the Python 3 print function.
        print(" ".join(str(field) for field in fields))
我可以手动找到这些URL(由于没有更好的术语,我暂且称之为“AJAX”URL):对第一个Spider生成的每个url,用Netbug过滤XHR请求即可。然后,我只需要将这些“AJAX”url提供给第二个Spider。

但是,有没有可能自动检索这些“AJAX”URL呢?


更一般地说,如何编写一个能完成上述三个步骤的爬虫?

是的,可以自动检索这些url,但您必须先弄清楚ajax是从哪个url加载内容的。下面是一个简单的教程:

1. 先做调查

在Chrome开发者工具中打开“Network(网络)”选项卡,并按XHR请求过滤,就能看到“Initiator(启动器)”一栏。该栏右侧给出了javascript文件,其中包含负责生成请求的代码;Chrome还会显示发起该请求的具体行号。

在您的情况下,最重要的代码位于文件jquery-projects.js的第415行,内容大致如下:

    $.ajax({
        async:      true,
        type:       'GET',
        url:        URL,
如您所见,这里有一个URL变量。您需要找到它被赋值的位置,就在上面几行:

    var URL = '/projects/index.cfm?fuseaction=app.csa'; // production

    switch(type) {
        ...
        case 'doc':
            URL += '&action=read&xslt-template=projects/xsl/projectdet_' + I18n.locale + '.xslt&rcn=' + me.ref;
            break;
    }
因此,这个url的生成方式是:在基本url后面追加一段以&action开头的字符串,其中还拼入了两个变量I18n.locale和me.ref。请记住,此url是相对的,因此您还需要获取url根

I18n.locale其实只是字符串“en”(下划线已经包含在'projectdet_'中),那么me.ref又来自哪里?

再次打开开发者工具的Sources选项卡,按Ctrl+F搜索,可以找到以下jQuery代码:

    // record reference
    me.ref = $("#PrjSrch>input[name='REF']").val();
事实证明,每个项目页面都有一个隐藏的表单,每次生成请求时,me.ref都从该表单的REF字段中取值

现在,您只需要将这些知识应用到您的Scrapy项目中

2. 在Scrapy爬虫中运用这些知识

到这里,您已经知道该怎么做了:从列出所有项目的起始url出发,提取所有链接并逐一发出请求,然后从每个响应的内容中提取出ajax url,最后对得到的ajax url再发出请求

from scrapy.selector import Selector
from scrapy.spider import Spider
from scrapy.http import Request
from eu.items import EuItem
from urlparse import urljoin


class CordisSpider(Spider):
    """Follow project links from the start page to the AJAX-loaded project details."""

    name = 'cordis'
    start_urls = ['http://cordis.europa.eu/fp7/security/projects_en.html']
    base_url = "http://cordis.europa.eu/projects/"
    # template string for ajax request based on what we know from investigating webpage
    base_ajax_url = "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=%s"

    def parse(self, response):
        """
        Extract project links from start_url, for each generate GET request,
        and then assign a function self.get_ajax_content to handle response.
        """
        selector = Selector(response)
        for href in selector.xpath("//ul/li/span/a/@href").extract():
            yield Request(url=urljoin(self.base_url, href),
                          callback=self.get_ajax_content)

    def get_ajax_content(self, response):
        """
        Extract the project reference from the page's PrjSrch form and
        request the AJAX url built from it; self.parse_items handles
        the response. Pages without a reference are logged and skipped.
        """
        selector = Selector(response)
        # xpath analogy of the jQuery lookup $("#PrjSrch>input[name='REF']").val()
        refs = selector.xpath('//form[@id="PrjSrch"]//input[@name="REF"]/@value').extract()
        # Like jQuery's .val(), use the first match only; joining several
        # matches would concatenate unrelated values into one reference.
        ajax_ref = refs[0] if refs else ""
        if not ajax_ref:
            # Without a reference the template would yield a url with an
            # empty rcn parameter, so no request is made for this page.
            self.log("No REF value found on %s, skipping" % response.url)
            return
        yield Request(url=self.base_ajax_url % (ajax_ref,),
                      callback=self.parse_items)

    def parse_items(self, response):
        """
        Response here should contain content
        normally loaded asynchronously with AJAX.
        """
        selector = Selector(response)
        # you can do your processing here
        item = EuItem()
        item["title"] = selector.xpath("//div[@class='projttl']//text()").extract()
        return item

这是一个后续问题,非常感谢您的详细回答。我会调查一下,尽快给你回复!