Scrapy csv输出的刮擦问题
这是我的蜘蛛:Scrapy csv输出的刮擦问题,scrapy,Scrapy,这是我的蜘蛛: from scrapy.contrib.spiders import CrawlSpider,Rule from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from vrisko.items import VriskoItem from scrapy.http import
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request
class vriskoSpider(CrawlSpider):
name = 'vrisko'
allowed_domains = ['vrisko.gr']
start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')),'parse_start_url',follow=True),)
def parse_start_url(self, response):
hxs = HtmlXPathSelector(response)
subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
ep = hxs.select('//a[@itemprop="name"]/text()').extract()
ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
for eponimia,address,subpage in zip(ep,ad,subpages):
vriskoit = VriskoItem()
vriskoit['eponimia'] = eponimia
vriskoit['address'] = address
request = Request(subpage,callback = self.subPage)
request.meta['vriskoit'] = vriskoit
yield request
def subPage(self,response):
vriskoit = response.meta['vriskoit']
hxs = HtmlXPathSelector(response)
vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
yield vriskoit
这是我的pipiline:
import csv
class myExporter(object):
def __init__(self):
self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
self.brandCategoryCsv.writerow(['eponimia', 'address','category'])
def process_item(self, item, spider):
for e,a,c in zip(item['eponimia'],item['address'],item['category']):
self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
return item
我的问题是,对于前两个字段(eponimia,address),只有第一个字符写入输出csv文件,我找不到原因
如果有任何帮助,我将不胜感激。从
myExporter.process\u项目中删除zip
函数
def过程\项目(自身、项目、蜘蛛):
self.brandCategorySv.writerow([item['eponimia'].encode('utf-8'),
项目['address']编码('utf-8'),
项目['category'].编码('utf-8'))
退货项目
您已经将项目列表转换为vriskoSpider.parse_start_url
中的单个项目
zip
迭代字符串:
[1]中的a='test1'
在[2]中:b='test2'
在[3]中:对于拉链中的x,y(a,b):
…:打印x,y
...:
t t
e e
s s
t t
1 2
谢谢你的回答,当我这样做时,我得到一个错误:“太多的值无法解包”@mindcast,粘贴了错误的代码片段。尝试更新答案中的代码。嗯,我不能使用encode()。“'list'对象没有属性'encode'”@mindcast,看起来像,item['category']
是list。请尝试vriskoit['category']=hxs。选择('//div[@class=“category\u class”]/span/text())。提取()[0]