如何自动检索 URL 的 AJAX 调用?
目的是编写一个爬虫,它能够:1) 检索此页面表格中链接的 URL;2) 跟踪所有这些 URL 的 AJAX 调用,找出包含我想抓取的数据的最终("AJAX")URL;3) 抓取由 AJAX URL 标识的最终页面。到目前为止,我已经用 Scrapy 写了两个爬虫:1) 第一个从起始页上的链接检索 URL。代码如下:
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem
class MySpider(Spider):
    """Spider 1: collect project detail-page links from the CORDIS listing page.

    Fixes over the original:
    - the inner XPath now starts with ``.//`` so it is evaluated relative to
      each selected ``<p>`` node instead of matching the whole document on
      every iteration;
    - items are ``yield``-ed inside the loop instead of ``return``-ing after
      the first iteration, so one item is produced per node;
    - the loop variable no longer shadows the iterable.
    """

    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        """Yield one CordisItem (with its 'link' list) per paragraph node."""
        hxs = HtmlXPathSelector(response)
        for title in hxs.select("//p"):
            item = CordisItem()
            item["link"] = title.select(".//ul/li/span/a/@href").extract()
            yield item
from scrapy.spider import Spider
from scrapy.selector import Selector
class EssaiSpider(Spider):
    """Spider 2: scrape project details directly from the final ("AJAX") pages.

    Fixes over the original:
    - the 40 near-identical ``partN`` / ``partN_nat`` assignment pairs are
      replaced by a single loop, which also removes three copy-paste bugs
      (part12 queried ``#part11``, part13 was extracted twice, and part19's
      nationality was stored into ``part2_nat``, leaving ``part19_nat`` unset);
    - ``print`` is called as a function so the code also runs on Python 3;
    - the local ``type`` no longer shadows the builtin.
    """

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = [
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528",
    ]

    # Upper bound on the number of numbered participant blocks ('part1'..'partN')
    # observed on a project page.  TODO confirm against more project pages.
    MAX_PARTICIPANTS = 40

    def parse(self, response):
        """Extract the project header fields and every participant, then print them."""
        sel = Selector(response)
        acronym = sel.xpath("//*[@class='projttl']/h1/text()").extract()
        short_desc = sel.xpath("//*[@class='projttl']/h2/text()").extract()
        start = sel.xpath("//*[@class='projdates']/b[1]/following::text()[1]").extract()
        end = sel.xpath("//*[@class='projdates']/b[2]/following::text()[1]").extract()
        long_desc = sel.xpath("//*[@class='tech']/p/text()").extract()
        cost = sel.xpath("//*[@class='box-left']/b[3]/following::text()[1]").extract()
        contrib = sel.xpath("//*[@class='box-left']/b[4]/following::text()[1]").extract()
        proj_type = sel.xpath("//*[@class='box-right']/p[3]/br/following::text()[1]").extract()
        sujet = sel.xpath("//*[@id='subjects']/h2/following::text()[1]").extract()
        coord = sel.xpath("//*[@class='projcoord']/div[1]/div[1]/text()").extract()
        coord_nat = sel.xpath("//*[@class='projcoord']/div[1]/div[2]/text()").extract()

        # One (name, nationality) pair per numbered participant block; an absent
        # id simply yields two empty lists, matching the original behaviour.
        participants = []
        for idx in range(1, self.MAX_PARTICIPANTS + 1):
            base = "//*[@id='part%d']/div[1]" % idx
            name = sel.xpath(base + "/div[1]/text()").extract()
            nat = sel.xpath(base + "/div[2]/text()").extract()
            participants.append((name, nat))

        print(acronym, short_desc, start, end, long_desc, cost, contrib,
              proj_type, sujet, coord, coord_nat, participants)
2) 第二个从“AJAX”URL中刮取数据。代码如下:
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem
class MySpider(Spider):
    """Spider 1: collect project detail-page links from the CORDIS listing page.

    Fixes over the original:
    - the inner XPath now starts with ``.//`` so it is evaluated relative to
      each selected ``<p>`` node instead of matching the whole document on
      every iteration;
    - items are ``yield``-ed inside the loop instead of ``return``-ing after
      the first iteration, so one item is produced per node;
    - the loop variable no longer shadows the iterable.
    """

    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        """Yield one CordisItem (with its 'link' list) per paragraph node."""
        hxs = HtmlXPathSelector(response)
        for title in hxs.select("//p"):
            item = CordisItem()
            item["link"] = title.select(".//ul/li/span/a/@href").extract()
            yield item
from scrapy.spider import Spider
from scrapy.selector import Selector
class EssaiSpider(Spider):
    """Spider 2: scrape project details directly from the final ("AJAX") pages.

    Fixes over the original:
    - the 40 near-identical ``partN`` / ``partN_nat`` assignment pairs are
      replaced by a single loop, which also removes three copy-paste bugs
      (part12 queried ``#part11``, part13 was extracted twice, and part19's
      nationality was stored into ``part2_nat``, leaving ``part19_nat`` unset);
    - ``print`` is called as a function so the code also runs on Python 3;
    - the local ``type`` no longer shadows the builtin.
    """

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = [
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528",
    ]

    # Upper bound on the number of numbered participant blocks ('part1'..'partN')
    # observed on a project page.  TODO confirm against more project pages.
    MAX_PARTICIPANTS = 40

    def parse(self, response):
        """Extract the project header fields and every participant, then print them."""
        sel = Selector(response)
        acronym = sel.xpath("//*[@class='projttl']/h1/text()").extract()
        short_desc = sel.xpath("//*[@class='projttl']/h2/text()").extract()
        start = sel.xpath("//*[@class='projdates']/b[1]/following::text()[1]").extract()
        end = sel.xpath("//*[@class='projdates']/b[2]/following::text()[1]").extract()
        long_desc = sel.xpath("//*[@class='tech']/p/text()").extract()
        cost = sel.xpath("//*[@class='box-left']/b[3]/following::text()[1]").extract()
        contrib = sel.xpath("//*[@class='box-left']/b[4]/following::text()[1]").extract()
        proj_type = sel.xpath("//*[@class='box-right']/p[3]/br/following::text()[1]").extract()
        sujet = sel.xpath("//*[@id='subjects']/h2/following::text()[1]").extract()
        coord = sel.xpath("//*[@class='projcoord']/div[1]/div[1]/text()").extract()
        coord_nat = sel.xpath("//*[@class='projcoord']/div[1]/div[2]/text()").extract()

        # One (name, nationality) pair per numbered participant block; an absent
        # id simply yields two empty lists, matching the original behaviour.
        participants = []
        for idx in range(1, self.MAX_PARTICIPANTS + 1):
            base = "//*[@id='part%d']/div[1]" % idx
            name = sel.xpath(base + "/div[1]/text()").extract()
            nat = sel.xpath(base + "/div[2]/text()").extract()
            participants.append((name, nat))

        print(acronym, short_desc, start, end, long_desc, cost, contrib,
              proj_type, sujet, coord, coord_nat, participants)
由于缺少更好的术语,我把这些地址称为"AJAX"URL:目前我只能手动检索它们——用 Netbug 对第一个爬虫访问的每个 URL 过滤 XHR 请求,然后把得到的"AJAX"URL 手工提供给第二个爬虫。
但是,有可能自动检索那些"AJAX"URL 吗?
更一般地说,如何编写一个一次完成上述三个操作的爬虫? (回答)是的,可以自动检索这些 URL,但您必须先确定 AJAX 从哪个 URL 加载内容。这里有一个简单的教程。 1. 做你的研究 在 Chrome 控制台中,打开"网络"选项卡并按 XHR 请求过滤,会看到"启动器(Initiator)"字段,其右侧是包含生成请求代码的 JavaScript 文件;Chrome 控制台还会显示发起请求的那一行。在您的情况下,最关键的代码位于文件 jquery-projects.js 的第 415 行,内容大致如下:
$.ajax({
async: true,
type: 'GET',
url: URL,
如您所见,这里有一个URL变量。您需要找到它的编码位置,就在上面几行:
var URL = '/projects/index.cfm?fuseaction=app.csa'; // production
switch(type) {
...
case 'doc':
URL += '&action=read&xslt-template=projects/xsl/projectdet_' + I18n.locale + '.xslt&rcn=' + me.ref;
break;
}
因此,url是通过添加基本url生成的,一些字符串以action开头,然后是两个变量I18n.locale和me.ref。请记住,此url是相对的,因此您还需要获取url根
I18n.locale 原来只是一个字符串“_en”;那么 me.ref 又来自哪里呢?
再次在控制台的sources选项卡中按住ctrl+find键,您可以找到以下jQuery行:
// record reference
me.ref = $("#PrjSrch>input[name='REF']").val();
事实证明,每个url都有一个隐藏的表单,每次生成请求时,它都从这个me.ref字段中获取值
现在,您只需要将这些知识应用到您的零碎项目中
2。使用您在scrapy spider中的知识。
在这一点上,你知道你必须做什么。您需要从所有项目的起始url开始,获取所有链接,对这些链接发出请求,然后从每个请求后接收的内容中提取ajax url,并生成我们从中获得的url请求
from scrapy.selector import Selector
from scrapy.spider import Spider
from scrapy.http import Request
from eu.items import EuItem
from urlparse import urljoin
class CordisSpider(Spider):
    """Single spider performing all three requested steps:

    1. collect the project links from the listing start page,
    2. visit each link and read the hidden REF value that the page's own
       JavaScript uses to build its AJAX request,
    3. request the AJAX URL built from that REF and scrape the final content.
    """

    name = 'cordis'
    start_urls = ['http://cordis.europa.eu/fp7/security/projects_en.html']
    base_url = "http://cordis.europa.eu/projects/"
    # Template string for the AJAX request, based on what we know from
    # investigating the webpage's JavaScript: fixed base + query string + REF.
    base_ajax_url = "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=%s"

    def parse(self, response):
        """Extract project links from the start page; follow each with a GET
        request handled by :meth:`get_ajax_content`."""
        sel = Selector(response)
        for href in sel.xpath("//ul/li/span/a/@href").extract():
            # Links on the listing page are relative -> resolve against base_url.
            yield Request(url=urljoin(self.base_url, href),
                          callback=self.get_ajax_content)

    def get_ajax_content(self, response):
        """Extract the AJAX reference and request the AJAX URL for the
        desired content; :meth:`parse_items` handles the response."""
        sel = Selector(response)
        # XPath analogue of the jQuery line we've seen:
        #   me.ref = $("#PrjSrch>input[name='REF']").val();
        ajax_ref = "".join(
            sel.xpath('//form[@id="PrjSrch"]//input[@name="REF"]/@value').extract())
        if not ajax_ref:
            # Robustness: no hidden REF field means the page has no
            # AJAX-loaded detail content -- don't request a malformed URL.
            return
        yield Request(url=self.base_ajax_url % (ajax_ref,),
                      callback=self.parse_items)

    def parse_items(self, response):
        """Response here contains the content normally loaded asynchronously
        with AJAX; do the actual item processing."""
        sel = Selector(response)
        item = EuItem()
        item["title"] = sel.xpath("//div[@class='projttl']//text()").extract()
        return item
这是一个后续问题,非常感谢您的详细回答。我会调查一下,尽快给你回复!