Python: extracting data inside links with Scrapy
I have been trying to scrape consumercomplaints.in: both the complaint titles and the data behind those title links. I wrote the code below, but I can neither follow the links and extract their data, nor extract all of the relevant links. Please guide me.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from comp.items import CompItem


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    #start_urls = ["http://www.consumercomplaints.in/?search=delhivery&page=2","http://www.consumercomplaints.in/?search=delhivery&page=3","http://www.consumercomplaints.in/?search=delhivery&page=4","http://www.consumercomplaints.in/?search=delhivery&page=5","http://www.consumercomplaints.in/?search=delhivery&page=6","http://www.consumercomplaints.in/?search=delhivery&page=7","http://www.consumercomplaints.in/?search=delhivery&page=8","http://www.consumercomplaints.in/?search=delhivery&page=9","http://www.consumercomplaints.in/?search=delhivery&page=10","http://www.consumercomplaints.in/?search=delhivery&page=11"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)), callback="parse", follow=True),
        #Rule(SgmlLinkExtractor(allow=("startrow=\d",)), callback="parse_health", follow=True),
    )

    def parse(self, response):
        hxs = Selector(response)
        sites = hxs.select('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.select('.//td[@class="complaint"]/a/span/text()').extract()
            item['link'] = site.select('.//td[@class="complaint"]/a/@href').extract()
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield Request(item['link'],
                              meta={'item': item},
                              callback=self.anchor_page)
            # item['intro'] = site.select('.//td[@class="small"]//a[2]/text()').extract()
            # item['heading'] = site.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            # item['date'] = site.select('.//td[@class="small"]/text()[2]').extract()
            # item['complaint'] = site.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)

    def anchor_page(self, response):
        hxs = Selector(response)
        old_item = response.request.meta['item']  # receiving the item that parse() put in the Request meta
        # parse some more values,
        # place them in old_item,
        # e.g.
        old_item['data'] = hxs.select('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
Are you using an old version of Scrapy? In the latest stable release you don't need to do hxs = Selector(response) or use the hxs.select() method; you can do the same thing with response.xpath().
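For comparison, here is a minimal sketch of the two styles side by side inside a callback, reusing the table XPath from the spider above:

from scrapy.selector import Selector

def parse(self, response):
    # older style: wrap the response in a Selector and call select()
    hxs = Selector(response)
    sites = hxs.select('//table[@width="100%"]')

    # newer style: query the response directly
    sites = response.xpath('//table[@width="100%"]')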
I think the problem in your code is that the result of select() (or response.xpath) is actually a Python list, so you need to do:

link = site.select('.//td[@class="complaint"]/a/@href').extract()
if link:
    item['link'] = link[0]

You probably want to do something similar for the title, too.
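To see the list behaviour in isolation, here is a self-contained sketch against a made-up HTML fragment (the href value is invented for illustration):

from scrapy.selector import Selector

html = '<td class="complaint"><a href="/complaints/example.html"><span>Example title</span></a></td>'
sel = Selector(text=html)
print(sel.xpath('//td[@class="complaint"]/a/@href').extract())     # ['/complaints/example.html']
print(sel.xpath('//td[@class="complaint"]/a/@href').extract()[0])  # '/complaints/example.html'

extract() always returns a list of strings, even when the XPath matches a single node, which is why the [0] index is needed.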
Edit: I made some changes:
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin


class CompItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    data = scrapy.Field()


class criticspider(CrawlSpider):
    name = "comp"
    allowed_domains = ["consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=delhivery"]
    rules = (
        Rule(
            SgmlLinkExtractor(allow=("search=delhivery&page=1/+",)),
            callback="parse",
            follow=True),
    )

    def parse(self, response):
        sites = response.xpath('//table[@width="100%"]')
        items = []
        for site in sites:
            item = CompItem()
            item['title'] = site.xpath('.//td[@class="complaint"]/a/span/text()').extract()[0]
            item['link'] = site.xpath('.//td[@class="complaint"]/a/@href').extract()[0]
            if item['link']:
                if 'http://' not in item['link']:
                    item['link'] = urljoin(response.url, item['link'])
                yield scrapy.Request(item['link'],
                                     meta={'item': item},
                                     callback=self.anchor_page)
            items.append(item)

    def anchor_page(self, response):
        old_item = response.request.meta['item']
        old_item['data'] = response.xpath('.//td[@class="compl-text"]/div/text()').extract()
        yield old_item
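One detail worth noting in the working version: the urljoin call turns site-relative hrefs into absolute URLs before they are requested. A quick sketch of what it does (the relative path is a made-up example):

from urlparse import urljoin  # Python 2, as used in the spider above

base = "http://www.consumercomplaints.in/?search=delhivery"
relative = "/complaints/delhivery-example.html"  # hypothetical href from a result row
print(urljoin(base, relative))
# http://www.consumercomplaints.in/complaints/delhivery-example.html

From there the spider can be run as usual, e.g. with scrapy crawl comp.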
Still unable to get the url data.
@NikhilParmar I edited your example and posted a working version. Do you know what extract()[0] does?
@NikhilParmar extract() returns a list, and the [0] part gets its first element; I tried to explain this in the answer as well. If my answer was useful, please mark it as accepted (the check-mark button under the voting arrows). Hey, could you help me with this question -